webrobots 0.0.10 → 0.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +1 -1
- data/Gemfile.lock +4 -4
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/lib/webrobots.rb +6 -1
- data/lib/webrobots/robotstxt.rb +118 -127
- data/lib/webrobots/robotstxt.ry +5 -9
- data/test/test_webrobots.rb +73 -0
- data/webrobots.gemspec +7 -6
- metadata +18 -18
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -2,14 +2,14 @@ GEM
|
|
2
2
|
remote: http://rubygems.org/
|
3
3
|
specs:
|
4
4
|
git (1.2.5)
|
5
|
-
jeweler (1.6.
|
5
|
+
jeweler (1.6.4)
|
6
6
|
bundler (~> 1.0)
|
7
7
|
git (>= 1.2.5)
|
8
8
|
rake
|
9
|
-
nokogiri (1.
|
9
|
+
nokogiri (1.5.0)
|
10
10
|
racc (1.4.6)
|
11
11
|
rake (0.9.2)
|
12
|
-
rcov (0.9.
|
12
|
+
rcov (0.9.10)
|
13
13
|
shoulda (2.11.3)
|
14
14
|
|
15
15
|
PLATFORMS
|
@@ -17,7 +17,7 @@ PLATFORMS
|
|
17
17
|
|
18
18
|
DEPENDENCIES
|
19
19
|
bundler (~> 1.0.0)
|
20
|
-
jeweler (~> 1.6.
|
20
|
+
jeweler (~> 1.6.4)
|
21
21
|
nokogiri (>= 1.4.4)
|
22
22
|
racc
|
23
23
|
rcov
|
data/Rakefile
CHANGED
@@ -15,7 +15,7 @@ require 'jeweler'
|
|
15
15
|
Jeweler::Tasks.new do |gem|
|
16
16
|
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
17
17
|
gem.name = "webrobots"
|
18
|
-
|
18
|
+
gem.homepage = "https://github.com/knu/webrobots"
|
19
19
|
gem.license = "2-clause BSDL"
|
20
20
|
gem.summary = %Q{A Ruby library to help write robots.txt compliant web robots}
|
21
21
|
gem.description = <<-'EOS'
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.10
|
1
|
+
0.0.11
|
data/lib/webrobots.rb
CHANGED
@@ -30,7 +30,12 @@ class WebRobots
|
|
30
30
|
|
31
31
|
# :nodoc:
|
32
32
|
def create_cache
|
33
|
-
Hash.new # Must respond to [], []=, and
|
33
|
+
Hash.new # Must respond to [], []=, delete and clear.
|
34
|
+
end
|
35
|
+
|
36
|
+
# Flushes robots.txt cache.
|
37
|
+
def flush_cache
|
38
|
+
@robotstxt.clear
|
34
39
|
end
|
35
40
|
|
36
41
|
# Returns the robot name initially given.
|
data/lib/webrobots/robotstxt.rb
CHANGED
@@ -19,7 +19,7 @@ class WebRobots
|
|
19
19
|
class RobotsTxt
|
20
20
|
class Parser < Racc::Parser
|
21
21
|
|
22
|
-
module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 163)
|
22
|
+
module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 158)
|
23
23
|
|
24
24
|
def initialize(target = nil)
|
25
25
|
super()
|
@@ -38,7 +38,7 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 163)
|
|
38
38
|
def parse(input, site)
|
39
39
|
@q ||= []
|
40
40
|
@errors = []
|
41
|
-
@lineno =
|
41
|
+
@lineno = 0
|
42
42
|
@site = site
|
43
43
|
|
44
44
|
string = input.respond_to?(:read) ? input.read : input
|
@@ -46,6 +46,7 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 163)
|
|
46
46
|
value_expected = false
|
47
47
|
|
48
48
|
until s.eos?
|
49
|
+
@lineno += 1 if s.bol?
|
49
50
|
if t = s.scan(/[ \t]*(?:\r?\n|\z)/)
|
50
51
|
if value_expected
|
51
52
|
@q << [:VALUE, '']
|
@@ -115,70 +116,70 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 163)
|
|
115
116
|
##### State transition tables begin ###
|
116
117
|
|
117
118
|
racc_action_table = [
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
119
|
+
5, 12, -10, 16, 52, 40, -12, 36, 37, 38,
|
120
|
+
39, 12, -10, 16, 46, 27, 27, 36, 37, 38,
|
121
|
+
39, 12, -10, 16, 49, 50, 51, 36, 37, 38,
|
122
|
+
39, 12, -10, 16, 12, 53, 24, 36, 37, 38,
|
123
|
+
39, 12, -10, 16, 12, 12, -12, 12, -10, 16,
|
124
|
+
60, 12, -13, 16, 60, 12, 12, 16, 60, 12,
|
125
|
+
12, 16, 60, 12, 12, 16, 60, 12, 23, 16,
|
126
|
+
60, 12, 62, 16, 63, 64, 65, 66, 5, 9,
|
127
|
+
5, 6, 5 ]
|
127
128
|
|
128
129
|
racc_action_check = [
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
130
|
+
21, 21, 21, 21, 39, 23, 21, 21, 21, 21,
|
131
|
+
21, 25, 25, 25, 27, 19, 25, 25, 25, 25,
|
132
|
+
25, 45, 45, 45, 36, 37, 38, 45, 45, 45,
|
133
|
+
45, 29, 29, 29, 24, 41, 16, 29, 29, 29,
|
134
|
+
29, 7, 7, 7, 46, 49, 7, 13, 13, 13,
|
135
|
+
62, 62, 13, 62, 53, 53, 50, 53, 63, 63,
|
136
|
+
51, 63, 64, 64, 52, 64, 65, 65, 15, 65,
|
137
|
+
66, 66, 54, 66, 55, 56, 57, 58, 11, 6,
|
137
138
|
3, 1, 0 ]
|
138
139
|
|
139
140
|
racc_action_pointer = [
|
140
|
-
80, 81, nil, 78, nil, nil,
|
141
|
-
nil,
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
nil, nil,
|
147
|
-
nil, nil
|
141
|
+
80, 81, nil, 78, nil, nil, 79, 38, nil, nil,
|
142
|
+
nil, 76, nil, 44, nil, 64, 30, nil, nil, 7,
|
143
|
+
nil, -2, nil, 3, 31, 8, nil, 8, nil, 28,
|
144
|
+
nil, nil, nil, nil, nil, nil, 18, 19, 20, -2,
|
145
|
+
nil, 28, nil, nil, nil, 18, 41, nil, nil, 42,
|
146
|
+
53, 57, 61, 52, 65, 67, 68, 69, 70, nil,
|
147
|
+
nil, nil, 48, 56, 60, 64, 68, nil, nil, nil,
|
148
|
+
nil, nil ]
|
148
149
|
|
149
150
|
racc_action_default = [
|
150
|
-
-5, -
|
151
|
-
|
152
|
-
|
153
|
-
-22, -
|
154
|
-
-
|
155
|
-
-
|
156
|
-
-
|
157
|
-
-40, -41
|
151
|
+
-5, -44, -1, -6, -7, -9, -44, -3, -8, 72,
|
152
|
+
-2, -5, -11, -23, -14, -44, -44, -18, -19, -44,
|
153
|
+
-4, -6, -15, -44, -10, -29, -25, -44, -20, -21,
|
154
|
+
-22, -31, -34, -35, -36, -37, -44, -44, -44, -44,
|
155
|
+
-16, -44, -24, -26, -27, -30, -10, -32, -33, -10,
|
156
|
+
-10, -10, -10, -10, -44, -44, -44, -44, -44, -17,
|
157
|
+
-42, -43, -10, -10, -10, -10, -10, -28, -38, -39,
|
158
|
+
-40, -41 ]
|
158
159
|
|
159
160
|
racc_goto_table = [
|
160
|
-
|
161
|
-
18,
|
162
|
-
|
163
|
-
|
164
|
-
|
161
|
+
14, 41, 8, 47, 3, 2, 22, 17, 29, 11,
|
162
|
+
18, 26, 45, 10, 14, 21, 20, 43, 44, 47,
|
163
|
+
8, 28, 48, 54, 30, 25, 55, 56, 57, 58,
|
164
|
+
59, 42, 7, 1, nil, nil, nil, nil, 48, 67,
|
165
|
+
68, 69, 70, 71 ]
|
165
166
|
|
166
167
|
racc_goto_check = [
|
167
|
-
|
168
|
-
14,
|
169
|
-
|
170
|
-
|
171
|
-
|
168
|
+
11, 8, 7, 19, 6, 2, 11, 13, 15, 5,
|
169
|
+
14, 18, 15, 3, 11, 6, 2, 18, 11, 19,
|
170
|
+
7, 13, 11, 8, 14, 16, 8, 8, 8, 8,
|
171
|
+
12, 17, 4, 1, nil, nil, nil, nil, 11, 12,
|
172
|
+
12, 12, 12, 12 ]
|
172
173
|
|
173
174
|
racc_goto_pointer = [
|
174
|
-
nil,
|
175
|
-
nil,
|
176
|
-
|
175
|
+
nil, 33, 5, 6, 30, 2, 4, -1, -23, nil,
|
176
|
+
nil, -7, -23, 0, 3, -13, 6, 6, -8, -26,
|
177
|
+
nil, nil, nil, nil ]
|
177
178
|
|
178
179
|
racc_goto_default = [
|
179
|
-
nil, nil, nil, nil, nil, nil, nil, 4,
|
180
|
-
|
181
|
-
32, 33, 34, 35
|
180
|
+
nil, nil, nil, nil, nil, nil, nil, 4, 15, 19,
|
181
|
+
13, 61, nil, nil, nil, nil, nil, nil, nil, 31,
|
182
|
+
32, 33, 34, 35 ]
|
182
183
|
|
183
184
|
racc_reduce_table = [
|
184
185
|
0, 0, :racc_error,
|
@@ -191,45 +192,44 @@ racc_reduce_table = [
|
|
191
192
|
1, 19, :_reduce_none,
|
192
193
|
2, 19, :_reduce_none,
|
193
194
|
1, 20, :_reduce_none,
|
194
|
-
|
195
|
+
0, 21, :_reduce_none,
|
196
|
+
1, 21, :_reduce_none,
|
195
197
|
0, 22, :_reduce_none,
|
196
198
|
1, 22, :_reduce_none,
|
197
|
-
0, 23, :_reduce_none,
|
198
199
|
1, 23, :_reduce_none,
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
200
|
+
2, 23, :_reduce_none,
|
201
|
+
3, 24, :_reduce_none,
|
202
|
+
5, 24, :_reduce_17,
|
203
|
+
1, 18, :_reduce_18,
|
203
204
|
1, 18, :_reduce_19,
|
204
|
-
|
205
|
+
3, 18, :_reduce_20,
|
205
206
|
3, 18, :_reduce_21,
|
206
|
-
3, 18, :_reduce_22,
|
207
207
|
3, 18, :_reduce_none,
|
208
|
-
1,
|
209
|
-
3,
|
210
|
-
1,
|
211
|
-
2,
|
212
|
-
2, 30, :_reduce_none,
|
213
|
-
5, 32, :_reduce_29,
|
214
|
-
0, 31, :_reduce_none,
|
215
|
-
1, 31, :_reduce_none,
|
216
|
-
1, 29, :_reduce_32,
|
217
|
-
2, 29, :_reduce_33,
|
208
|
+
1, 27, :_reduce_none,
|
209
|
+
3, 26, :_reduce_24,
|
210
|
+
1, 29, :_reduce_25,
|
211
|
+
2, 29, :_reduce_26,
|
218
212
|
2, 29, :_reduce_none,
|
219
|
-
|
220
|
-
|
221
|
-
1,
|
222
|
-
1,
|
213
|
+
5, 31, :_reduce_28,
|
214
|
+
0, 30, :_reduce_none,
|
215
|
+
1, 30, :_reduce_none,
|
216
|
+
1, 28, :_reduce_31,
|
217
|
+
2, 28, :_reduce_32,
|
218
|
+
2, 28, :_reduce_none,
|
219
|
+
1, 32, :_reduce_none,
|
220
|
+
1, 32, :_reduce_none,
|
221
|
+
1, 32, :_reduce_none,
|
222
|
+
1, 32, :_reduce_none,
|
223
|
+
5, 33, :_reduce_38,
|
223
224
|
5, 34, :_reduce_39,
|
224
225
|
5, 35, :_reduce_40,
|
225
226
|
5, 36, :_reduce_41,
|
226
|
-
|
227
|
-
1,
|
228
|
-
1, 26, :_reduce_none ]
|
227
|
+
1, 25, :_reduce_none,
|
228
|
+
1, 25, :_reduce_none ]
|
229
229
|
|
230
|
-
racc_reduce_n =
|
230
|
+
racc_reduce_n = 44
|
231
231
|
|
232
|
-
racc_shift_n =
|
232
|
+
racc_shift_n = 72
|
233
233
|
|
234
234
|
racc_token_table = {
|
235
235
|
false => 0,
|
@@ -288,7 +288,6 @@ Racc_token_to_s_table = [
|
|
288
288
|
"records",
|
289
289
|
"blanklines",
|
290
290
|
"blankline",
|
291
|
-
"eol",
|
292
291
|
"opt_space",
|
293
292
|
"opt_commentlines",
|
294
293
|
"commentlines",
|
@@ -344,13 +343,7 @@ module_eval(<<'.,.,', 'robotstxt.ry', 11)
|
|
344
343
|
|
345
344
|
# reduce 9 omitted
|
346
345
|
|
347
|
-
|
348
|
-
def _reduce_10(val, _values, result)
|
349
|
-
@lineno += 1
|
350
|
-
|
351
|
-
result
|
352
|
-
end
|
353
|
-
.,.,
|
346
|
+
# reduce 10 omitted
|
354
347
|
|
355
348
|
# reduce 11 omitted
|
356
349
|
|
@@ -364,18 +357,16 @@ module_eval(<<'.,.,', 'robotstxt.ry', 31)
|
|
364
357
|
|
365
358
|
# reduce 16 omitted
|
366
359
|
|
367
|
-
|
368
|
-
|
369
|
-
module_eval(<<'.,.,', 'robotstxt.ry', 47)
|
370
|
-
def _reduce_18(val, _values, result)
|
360
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 42)
|
361
|
+
def _reduce_17(val, _values, result)
|
371
362
|
@sitemaps << val[3]
|
372
363
|
|
373
364
|
result
|
374
365
|
end
|
375
366
|
.,.,
|
376
367
|
|
377
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
378
|
-
def
|
368
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 47)
|
369
|
+
def _reduce_18(val, _values, result)
|
379
370
|
result = []
|
380
371
|
result << val[0]
|
381
372
|
|
@@ -383,24 +374,24 @@ module_eval(<<'.,.,', 'robotstxt.ry', 52)
|
|
383
374
|
end
|
384
375
|
.,.,
|
385
376
|
|
386
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
387
|
-
def
|
377
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 52)
|
378
|
+
def _reduce_19(val, _values, result)
|
388
379
|
result = []
|
389
380
|
|
390
381
|
result
|
391
382
|
end
|
392
383
|
.,.,
|
393
384
|
|
394
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
395
|
-
def
|
385
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 58)
|
386
|
+
def _reduce_20(val, _values, result)
|
396
387
|
result << val[2]
|
397
388
|
|
398
389
|
result
|
399
390
|
end
|
400
391
|
.,.,
|
401
392
|
|
402
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
403
|
-
def
|
393
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 64)
|
394
|
+
def _reduce_21(val, _values, result)
|
404
395
|
val[2].each_with_index { |line, i|
|
405
396
|
warn "%s line %d: %s: orphan rule line" %
|
406
397
|
[@site.to_s, @rulelinenos[i], line.token] if $VERBOSE
|
@@ -410,50 +401,50 @@ module_eval(<<'.,.,', 'robotstxt.ry', 69)
|
|
410
401
|
end
|
411
402
|
.,.,
|
412
403
|
|
413
|
-
# reduce
|
404
|
+
# reduce 22 omitted
|
414
405
|
|
415
|
-
# reduce
|
406
|
+
# reduce 23 omitted
|
416
407
|
|
417
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
418
|
-
def
|
408
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 79)
|
409
|
+
def _reduce_24(val, _values, result)
|
419
410
|
result = Record.new(val[1], val[2])
|
420
411
|
|
421
412
|
result
|
422
413
|
end
|
423
414
|
.,.,
|
424
415
|
|
425
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
426
|
-
def
|
416
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 84)
|
417
|
+
def _reduce_25(val, _values, result)
|
427
418
|
result = [val[0]]
|
428
419
|
|
429
420
|
result
|
430
421
|
end
|
431
422
|
.,.,
|
432
423
|
|
433
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
434
|
-
def
|
424
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 89)
|
425
|
+
def _reduce_26(val, _values, result)
|
435
426
|
result << val[1]
|
436
427
|
|
437
428
|
result
|
438
429
|
end
|
439
430
|
.,.,
|
440
431
|
|
441
|
-
# reduce
|
432
|
+
# reduce 27 omitted
|
442
433
|
|
443
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
444
|
-
def
|
434
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 96)
|
435
|
+
def _reduce_28(val, _values, result)
|
445
436
|
result = AgentLine.new(val[0], val[3])
|
446
437
|
|
447
438
|
result
|
448
439
|
end
|
449
440
|
.,.,
|
450
441
|
|
451
|
-
# reduce
|
442
|
+
# reduce 29 omitted
|
452
443
|
|
453
|
-
# reduce
|
444
|
+
# reduce 30 omitted
|
454
445
|
|
455
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
456
|
-
def
|
446
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 104)
|
447
|
+
def _reduce_31(val, _values, result)
|
457
448
|
result = [result]
|
458
449
|
@rulelinenos = []
|
459
450
|
|
@@ -461,8 +452,8 @@ module_eval(<<'.,.,', 'robotstxt.ry', 109)
|
|
461
452
|
end
|
462
453
|
.,.,
|
463
454
|
|
464
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
465
|
-
def
|
455
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 110)
|
456
|
+
def _reduce_32(val, _values, result)
|
466
457
|
result << val[1]
|
467
458
|
@rulelinenos << @lineno
|
468
459
|
|
@@ -470,6 +461,8 @@ module_eval(<<'.,.,', 'robotstxt.ry', 115)
|
|
470
461
|
end
|
471
462
|
.,.,
|
472
463
|
|
464
|
+
# reduce 33 omitted
|
465
|
+
|
473
466
|
# reduce 34 omitted
|
474
467
|
|
475
468
|
# reduce 35 omitted
|
@@ -478,43 +471,41 @@ module_eval(<<'.,.,', 'robotstxt.ry', 115)
|
|
478
471
|
|
479
472
|
# reduce 37 omitted
|
480
473
|
|
481
|
-
|
482
|
-
|
483
|
-
module_eval(<<'.,.,', 'robotstxt.ry', 128)
|
484
|
-
def _reduce_39(val, _values, result)
|
474
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 123)
|
475
|
+
def _reduce_38(val, _values, result)
|
485
476
|
result = AllowLine.new(val[0], val[3])
|
486
477
|
|
487
478
|
result
|
488
479
|
end
|
489
480
|
.,.,
|
490
481
|
|
491
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
492
|
-
def
|
482
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 128)
|
483
|
+
def _reduce_39(val, _values, result)
|
493
484
|
result = DisallowLine.new(val[0], val[3])
|
494
485
|
|
495
486
|
result
|
496
487
|
end
|
497
488
|
.,.,
|
498
489
|
|
499
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
500
|
-
def
|
490
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 133)
|
491
|
+
def _reduce_40(val, _values, result)
|
501
492
|
result = CrawlDelayLine.new(val[0], val[3])
|
502
493
|
|
503
494
|
result
|
504
495
|
end
|
505
496
|
.,.,
|
506
497
|
|
507
|
-
module_eval(<<'.,.,', 'robotstxt.ry',
|
508
|
-
def
|
498
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 138)
|
499
|
+
def _reduce_41(val, _values, result)
|
509
500
|
result = ExtentionLine.new(val[0], val[3])
|
510
501
|
|
511
502
|
result
|
512
503
|
end
|
513
504
|
.,.,
|
514
505
|
|
515
|
-
# reduce
|
506
|
+
# reduce 42 omitted
|
516
507
|
|
517
|
-
# reduce
|
508
|
+
# reduce 43 omitted
|
518
509
|
|
519
510
|
def _reduce_none(val, _values, result)
|
520
511
|
val[0]
|
data/lib/webrobots/robotstxt.ry
CHANGED
@@ -25,12 +25,7 @@ rule
|
|
25
25
|
| blanklines
|
26
26
|
blankline
|
27
27
|
|
28
|
-
blankline :
|
29
|
-
|
30
|
-
eol : EOL
|
31
|
-
{
|
32
|
-
@lineno += 1
|
33
|
-
}
|
28
|
+
blankline : EOL
|
34
29
|
|
35
30
|
opt_space :
|
36
31
|
| SPACE
|
@@ -42,7 +37,7 @@ rule
|
|
42
37
|
| commentlines
|
43
38
|
comment
|
44
39
|
|
45
|
-
comment : opt_space COMMENT
|
40
|
+
comment : opt_space COMMENT EOL
|
46
41
|
| 'sitemap' ':' opt_space VALUE eol_opt_comment
|
47
42
|
{
|
48
43
|
@sitemaps << val[3]
|
@@ -144,7 +139,7 @@ rule
|
|
144
139
|
result = ExtentionLine.new(val[0], val[3])
|
145
140
|
}
|
146
141
|
|
147
|
-
eol_opt_comment :
|
142
|
+
eol_opt_comment : EOL
|
148
143
|
| comment
|
149
144
|
|
150
145
|
---- header
|
@@ -178,7 +173,7 @@ class WebRobots
|
|
178
173
|
def parse(input, site)
|
179
174
|
@q ||= []
|
180
175
|
@errors = []
|
181
|
-
@lineno =
|
176
|
+
@lineno = 0
|
182
177
|
@site = site
|
183
178
|
|
184
179
|
string = input.respond_to?(:read) ? input.read : input
|
@@ -186,6 +181,7 @@ class WebRobots
|
|
186
181
|
value_expected = false
|
187
182
|
|
188
183
|
until s.eos?
|
184
|
+
@lineno += 1 if s.bol?
|
189
185
|
if t = s.scan(/[ \t]*(?:\r?\n|\z)/)
|
190
186
|
if value_expected
|
191
187
|
@q << [:VALUE, '']
|
data/test/test_webrobots.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
1
2
|
require 'helper'
|
2
3
|
|
3
4
|
class TestWebRobots < Test::Unit::TestCase
|
@@ -582,4 +583,76 @@ TXT
|
|
582
583
|
end
|
583
584
|
end
|
584
585
|
|
586
|
+
context "robots.txt cache" do
|
587
|
+
setup do
|
588
|
+
@fetched = false
|
589
|
+
@robots = WebRobots.new('RandomBot', :http_get => lambda { |uri|
|
590
|
+
case uri.to_s
|
591
|
+
when 'http://site1.example.org/robots.txt'
|
592
|
+
@fetched = true
|
593
|
+
<<-'TXT'
|
594
|
+
User-Agent: *
|
595
|
+
Disallow: /foo
|
596
|
+
TXT
|
597
|
+
when 'http://site2.example.org/robots.txt'
|
598
|
+
@fetched = true
|
599
|
+
nil
|
600
|
+
end
|
601
|
+
})
|
602
|
+
end
|
603
|
+
|
604
|
+
should "persist unless cache is cleared" do
|
605
|
+
assert !@fetched
|
606
|
+
assert !@robots.allowed?('http://site1.example.org/foo')
|
607
|
+
assert @fetched
|
608
|
+
|
609
|
+
@fetched = false
|
610
|
+
assert @robots.allowed?('http://site1.example.org/bar')
|
611
|
+
assert !@fetched
|
612
|
+
assert @robots.allowed?('http://site1.example.org/baz')
|
613
|
+
assert !@fetched
|
614
|
+
assert !@robots.allowed?('http://site1.example.org/foo')
|
615
|
+
assert !@fetched
|
616
|
+
|
617
|
+
@robots.flush_cache
|
618
|
+
assert !@fetched
|
619
|
+
assert !@robots.allowed?('http://site1.example.org/foo')
|
620
|
+
assert @fetched
|
621
|
+
|
622
|
+
@fetched = false
|
623
|
+
assert @robots.allowed?('http://site1.example.org/bar')
|
624
|
+
assert !@fetched
|
625
|
+
assert @robots.allowed?('http://site1.example.org/baz')
|
626
|
+
assert !@fetched
|
627
|
+
assert !@robots.allowed?('http://site1.example.org/foo')
|
628
|
+
assert !@fetched
|
629
|
+
end
|
630
|
+
|
631
|
+
should "persist for non-existent robots.txt unless cache is cleared" do
|
632
|
+
assert !@fetched
|
633
|
+
assert !@robots.allowed?('http://site2.example.org/foo')
|
634
|
+
assert @fetched
|
635
|
+
|
636
|
+
@fetched = false
|
637
|
+
assert !@robots.allowed?('http://site2.example.org/bar')
|
638
|
+
assert !@fetched
|
639
|
+
assert !@robots.allowed?('http://site2.example.org/baz')
|
640
|
+
assert !@fetched
|
641
|
+
assert !@robots.allowed?('http://site2.example.org/foo')
|
642
|
+
assert !@fetched
|
643
|
+
|
644
|
+
@robots.flush_cache
|
645
|
+
assert !@fetched
|
646
|
+
assert !@robots.allowed?('http://site2.example.org/foo')
|
647
|
+
assert @fetched
|
648
|
+
|
649
|
+
@fetched = false
|
650
|
+
assert !@robots.allowed?('http://site2.example.org/bar')
|
651
|
+
assert !@fetched
|
652
|
+
assert !@robots.allowed?('http://site2.example.org/baz')
|
653
|
+
assert !@fetched
|
654
|
+
assert !@robots.allowed?('http://site2.example.org/foo')
|
655
|
+
assert !@fetched
|
656
|
+
end
|
657
|
+
end
|
585
658
|
end
|
data/webrobots.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{webrobots}
|
8
|
-
s.version = "0.0.10"
|
8
|
+
s.version = "0.0.11"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = [%q{Akinori MUSHA}]
|
12
|
-
s.date = %q{2011-
|
12
|
+
s.date = %q{2011-08-10}
|
13
13
|
s.description = %q{This library helps write robots.txt compliant web robots in Ruby.
|
14
14
|
}
|
15
15
|
s.email = %q{knu@idaemons.org}
|
@@ -33,9 +33,10 @@ Gem::Specification.new do |s|
|
|
33
33
|
"test/test_webrobots.rb",
|
34
34
|
"webrobots.gemspec"
|
35
35
|
]
|
36
|
+
s.homepage = %q{https://github.com/knu/webrobots}
|
36
37
|
s.licenses = [%q{2-clause BSDL}]
|
37
38
|
s.require_paths = [%q{lib}]
|
38
|
-
s.rubygems_version = %q{1.8.
|
39
|
+
s.rubygems_version = %q{1.8.7}
|
39
40
|
s.summary = %q{A Ruby library to help write robots.txt compliant web robots}
|
40
41
|
|
41
42
|
if s.respond_to? :specification_version then
|
@@ -46,14 +47,14 @@ Gem::Specification.new do |s|
|
|
46
47
|
s.add_development_dependency(%q<racc>, [">= 0"])
|
47
48
|
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
48
49
|
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
49
|
-
s.add_development_dependency(%q<jeweler>, ["~> 1.6.
|
50
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
|
50
51
|
s.add_development_dependency(%q<rcov>, [">= 0"])
|
51
52
|
else
|
52
53
|
s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
|
53
54
|
s.add_dependency(%q<racc>, [">= 0"])
|
54
55
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
55
56
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
56
|
-
s.add_dependency(%q<jeweler>, ["~> 1.6.
|
57
|
+
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
57
58
|
s.add_dependency(%q<rcov>, [">= 0"])
|
58
59
|
end
|
59
60
|
else
|
@@ -61,7 +62,7 @@ Gem::Specification.new do |s|
|
|
61
62
|
s.add_dependency(%q<racc>, [">= 0"])
|
62
63
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
63
64
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
64
|
-
s.add_dependency(%q<jeweler>, ["~> 1.6.
|
65
|
+
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
65
66
|
s.add_dependency(%q<rcov>, [">= 0"])
|
66
67
|
end
|
67
68
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webrobots
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.10
|
4
|
+
version: 0.0.11
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-
|
12
|
+
date: 2011-08-10 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
16
|
-
requirement: &
|
16
|
+
requirement: &70285160147560 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 1.4.4
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70285160147560
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: racc
|
27
|
-
requirement: &
|
27
|
+
requirement: &70285160147080 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70285160147080
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: shoulda
|
38
|
-
requirement: &
|
38
|
+
requirement: &70285160146600 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70285160146600
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: bundler
|
49
|
-
requirement: &
|
49
|
+
requirement: &70285160146120 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,21 +54,21 @@ dependencies:
|
|
54
54
|
version: 1.0.0
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70285160146120
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: jeweler
|
60
|
-
requirement: &
|
60
|
+
requirement: &70285160145640 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
64
64
|
- !ruby/object:Gem::Version
|
65
|
-
version: 1.6.
|
65
|
+
version: 1.6.4
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70285160145640
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rcov
|
71
|
-
requirement: &
|
71
|
+
requirement: &70285160145160 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,7 +76,7 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70285160145160
|
80
80
|
description: ! 'This library helps write robots.txt compliant web robots in Ruby.
|
81
81
|
|
82
82
|
'
|
@@ -101,7 +101,7 @@ files:
|
|
101
101
|
- test/helper.rb
|
102
102
|
- test/test_webrobots.rb
|
103
103
|
- webrobots.gemspec
|
104
|
-
homepage:
|
104
|
+
homepage: https://github.com/knu/webrobots
|
105
105
|
licenses:
|
106
106
|
- 2-clause BSDL
|
107
107
|
post_install_message:
|
@@ -116,7 +116,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
116
116
|
version: '0'
|
117
117
|
segments:
|
118
118
|
- 0
|
119
|
-
hash:
|
119
|
+
hash: 3895009630851215598
|
120
120
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
121
121
|
none: false
|
122
122
|
requirements:
|
@@ -125,7 +125,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
125
125
|
version: '0'
|
126
126
|
requirements: []
|
127
127
|
rubyforge_project:
|
128
|
-
rubygems_version: 1.8.
|
128
|
+
rubygems_version: 1.8.7
|
129
129
|
signing_key:
|
130
130
|
specification_version: 3
|
131
131
|
summary: A Ruby library to help write robots.txt compliant web robots
|