webrobots 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +22 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +19 -0
- data/Rakefile +61 -0
- data/VERSION +1 -0
- data/lib/webrobots.rb +135 -0
- data/lib/webrobots/robotstxt.rb +714 -0
- data/lib/webrobots/robotstxt.ry +444 -0
- data/test/helper.rb +18 -0
- data/test/test_webrobots.rb +291 -0
- metadata +155 -0
@@ -0,0 +1,444 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
class Parser
|
4
|
+
|
5
|
+
rule
|
6
|
+
robotstxt : opt_blanklines
|
7
|
+
{
|
8
|
+
@sitemaps = []
|
9
|
+
}
|
10
|
+
body
|
11
|
+
{
|
12
|
+
body = val[2]
|
13
|
+
result = RobotsTxt.new(@site, body,
|
14
|
+
:target => @target, :sitemaps => @sitemaps)
|
15
|
+
}
|
16
|
+
|
17
|
+
body :
|
18
|
+
| blocks
|
19
|
+
opt_blanklines
|
20
|
+
|
21
|
+
opt_blanklines :
|
22
|
+
| blanklines
|
23
|
+
|
24
|
+
blanklines : blankline
|
25
|
+
| blanklines
|
26
|
+
blankline
|
27
|
+
|
28
|
+
blankline : eol
|
29
|
+
|
30
|
+
eol : EOL
|
31
|
+
{
|
32
|
+
@lineno += 1
|
33
|
+
}
|
34
|
+
|
35
|
+
opt_space :
|
36
|
+
| SPACE
|
37
|
+
|
38
|
+
opt_commentlines :
|
39
|
+
| commentlines
|
40
|
+
|
41
|
+
commentlines : comment
|
42
|
+
| commentlines
|
43
|
+
comment
|
44
|
+
|
45
|
+
comment : opt_space COMMENT eol
|
46
|
+
| 'sitemap' ':' opt_space VALUE eol_opt_comment
|
47
|
+
{
|
48
|
+
@sitemaps << val[3]
|
49
|
+
}
|
50
|
+
|
51
|
+
blocks : record
|
52
|
+
{
|
53
|
+
result = []
|
54
|
+
result << val[0]
|
55
|
+
}
|
56
|
+
| commentblock
|
57
|
+
{
|
58
|
+
result = []
|
59
|
+
}
|
60
|
+
| blocks
|
61
|
+
blanklines
|
62
|
+
record
|
63
|
+
{
|
64
|
+
result << val[2]
|
65
|
+
}
|
66
|
+
| blocks
|
67
|
+
blanklines
|
68
|
+
rulelines
|
69
|
+
{
|
70
|
+
val[2].each_with_index { |line, i|
|
71
|
+
warn "%s line %d: %s: orphan rule line" %
|
72
|
+
[@site.to_s, @rulelinenos[i], line.token] if $VERBOSE
|
73
|
+
}
|
74
|
+
}
|
75
|
+
| blocks
|
76
|
+
blanklines
|
77
|
+
commentblock
|
78
|
+
|
79
|
+
commentblock : commentlines
|
80
|
+
|
81
|
+
record : opt_commentlines
|
82
|
+
agentlines
|
83
|
+
opt_rulelines
|
84
|
+
{
|
85
|
+
result = Record.new(val[1], val[2])
|
86
|
+
}
|
87
|
+
|
88
|
+
agentlines : agentline
|
89
|
+
{
|
90
|
+
result = [val[0]]
|
91
|
+
}
|
92
|
+
| agentlines
|
93
|
+
agentline
|
94
|
+
{
|
95
|
+
result << val[1]
|
96
|
+
}
|
97
|
+
| agentlines
|
98
|
+
comment
|
99
|
+
|
100
|
+
agentline : 'user-agent' ':' opt_space VALUE eol_opt_comment
|
101
|
+
{
|
102
|
+
result = AgentLine.new(val[0], val[3])
|
103
|
+
}
|
104
|
+
|
105
|
+
opt_rulelines :
|
106
|
+
| rulelines
|
107
|
+
|
108
|
+
rulelines : ruleline
            {
              result = [result]
              # Record an entry for the first rule line too.  The orphan-rule
              # warning indexes @rulelinenos by rule position and formats the
              # entry with %d, so a missing (nil) entry would raise TypeError.
              # NOTE(review): @lineno is read at reduction time, i.e. after the
              # eol action has already advanced it - confirm the intended base.
              @rulelinenos = [@lineno]
            }
          | rulelines
            ruleline
            {
              result << val[1]
              @rulelinenos << @lineno
            }
          | rulelines
            comment
|
121
|
+
|
122
|
+
ruleline : allowline
|
123
|
+
| disallowline
|
124
|
+
| crawldelayline
|
125
|
+
| extension
|
126
|
+
|
127
|
+
allowline : 'allow' ':' opt_space VALUE eol_opt_comment
|
128
|
+
{
|
129
|
+
result = AllowLine.new(val[0], val[3])
|
130
|
+
}
|
131
|
+
|
132
|
+
disallowline : 'disallow' ':' opt_space VALUE eol_opt_comment
|
133
|
+
{
|
134
|
+
result = DisallowLine.new(val[0], val[3])
|
135
|
+
}
|
136
|
+
|
137
|
+
crawldelayline : 'crawl-delay' ':' opt_space VALUE eol_opt_comment
|
138
|
+
{
|
139
|
+
result = CrawlDelayLine.new(val[0], val[3])
|
140
|
+
}
|
141
|
+
|
142
|
+
extension : TOKEN ':' opt_space VALUE eol_opt_comment
|
143
|
+
{
|
144
|
+
result = ExtentionLine.new(val[0], val[3])
|
145
|
+
}
|
146
|
+
|
147
|
+
eol_opt_comment : eol
|
148
|
+
| comment
|
149
|
+
|
150
|
+
---- header
|
151
|
+
|
152
|
+
require 'strscan'
|
153
|
+
require 'uri'
|
154
|
+
|
155
|
+
class WebRobots
|
156
|
+
class Error < StandardError
|
157
|
+
end
|
158
|
+
|
159
|
+
class ParseError < Error
|
160
|
+
end
|
161
|
+
|
162
|
+
class RobotsTxt
|
163
|
+
---- inner
|
164
|
+
|
165
|
+
# Builds a parser, optionally bound to a single user-agent.
#
# target - value stored in @target (nil when omitted).  The top-level
#          grammar action passes it to RobotsTxt.new as the :target option.
def initialize(target = nil)
  super()
  @target = target
end
|
169
|
+
|
170
|
+
# Convenience entry point: builds a parser for +target+ and parses +input+.
#
# input  - robots.txt content (a String or an object responding to #read)
# target - optional user-agent the resulting object is bound to
# site   - optional site identifier forwarded to #parse, which uses it to
#          prefix error messages.  #parse requires this argument, so the
#          previous one-argument call always raised ArgumentError.
#
# Returns whatever #parse returns.
def self.parse(input, target = nil, site = nil)
  new(target).parse(input, site)
end
|
173
|
+
|
174
|
+
# Field names that get a dedicated token type in the grammar.
KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap].freeze

# Matches one of KNOWN_TOKENS case-insensitively, but only as a whole field
# name: the negative lookahead rejects a keyword that is merely the prefix of
# a longer token (e.g. "Allowed" or "Sitemaps"), which must lex as a generic
# TOKEN instead.  The lookahead class is the lexer's TOKEN character class
# with the space character excluded as well.
RE_KNOWN_TOKENS = /(?:#{KNOWN_TOKENS.map { |t| Regexp.quote(t) }.join('|')})(?![^\x00-\x20\x7f()<>@,;:\\"\/\[\]?={}])/i
|
176
|
+
|
177
|
+
# Tokenizes +input+ into @q and runs the generated parser (do_parse) over it.
#
# input - robots.txt content: a String, or any object responding to #read
# site  - stored in @site; its #to_s prefixes error messages
#
# Returns the value of do_parse.
# Raises ParseError, converted from any Racc::ParseError raised while
# tokenizing or parsing.
def parse(input, site)
  @q = []
  @errors = []
  @lineno = 1
  @site = site

  string = input.respond_to?(:read) ? input.read : input
  s = StringScanner.new(string)
  value_expected = false

  until s.eos?
    if t = s.scan(/[ \t]*\r?\n/)
      # A field with nothing after the colon still needs a VALUE token,
      # because every field rule in the grammar requires one.
      @q << [:VALUE, ''] if value_expected
      @q << [:EOL, t]
      value_expected = false
      # Keep the counter current while lexing so that lexical errors are
      # reported against the line they occur on.
      @lineno += 1
    elsif t = s.scan(/[ \t]+/)
      @q << [:SPACE, t]
    elsif t = s.scan(/:/)
      @q << [t, t]
      value_expected = true
    elsif t = s.scan(/#.*/)
      # "Field: # comment" has an empty value followed by a comment.
      @q << [:VALUE, ''] if value_expected
      @q << [:COMMENT, t]
      value_expected = false
    elsif value_expected
      # The value runs up to trailing blanks before a comment or the end of
      # the line; "\r?" keeps the CR of a CRLF ending out of the value.
      if t = s.scan(/.*?(?=[ \t]*(?:#|\r?$))/)
        @q << [:VALUE, t]
      else
        parse_error "unexpected characters: %s" % s.check(/.*/)
        # parse_error returns when @lax is set; skip the rest of the line
        # so the loop always makes progress.
        s.skip(/.*/)
      end
      value_expected = false
    elsif t = s.scan(RE_KNOWN_TOKENS)
      @q << [t.downcase, t]
    elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
      @q << [:TOKEN, t]
    else
      parse_error "unexpected characters: %s" % s.check(/.*/)
      s.skip(/.*/)
    end
  end

  # Input that ends right after a colon still owes the grammar a VALUE.
  @q << [:VALUE, ''] if value_expected
  @q << [:EOL, ''] if !@q.empty? && @q.last.first != :EOL

  @pos = -1
  # The grammar's eol action counts lines again while parsing.
  @lineno = 1

  do_parse
rescue Racc::ParseError => e
  raise ParseError, e.message
end
|
226
|
+
|
227
|
+
# Token source for the parser: advances the cursor @pos by one and returns
# the token pair stored at that position in @q (nil once past the end).
def next_token
  @pos += 1
  @q[@pos]
end
|
230
|
+
|
231
|
+
# Error hook: reports the unexpected token through parse_error.
#
# token_id - token id, rendered with token_to_str
# value    - the token's value, included in the message
# stack    - accepted but not used
def on_error(token_id, value, stack)
  description = format("unexpected %s: %s", token_to_str(token_id), value)
  parse_error description
end
|
234
|
+
|
235
|
+
# Reports a parse problem, prefixed with the site and the current line number.
#
# When @lax is set the formatted message is appended to @errors; otherwise
# Racc::ParseError is raised with it.
def parse_error(message)
  located = format("%s line %d: %s", @site.to_s, @lineno, message)
  raise Racc::ParseError, located unless @lax

  @errors << located
end
|
243
|
+
|
244
|
+
---- footer
|
245
|
+
# Builds the parsed representation of one site's robots.txt.
#
# site    - stored in @site
# records - collection of records (may be nil or empty)
# options - optional Hash; :target is the user-agent this instance is bound
#           to, :sitemaps the collected sitemap values (defaults to [])
#
# @records keeps the non-default records first and the default ones last.
# When a target is given, non-default records that do not match it are dropped.
def initialize(site, records, options = nil)
  super()
  @site = site
  @options = options || {}
  @last_checked = nil

  @target = @options[:target]
  @sitemaps = @options[:sitemaps] || []

  @records = []
  return if !records || records.empty?

  defaults, specific = records.partition { |record| record.default? }
  specific = specific.select { |record| record.match?(@target) } if @target
  @records = specific + defaults
end
|
268
|
+
|
269
|
+
attr_reader :site, :sitemaps
|
270
|
+
|
271
|
+
# Resolves the user-agent to check against.
#
# A bound instance (@target set) must be queried without a user_agent and
# yields @target; an unbound one requires a user_agent and yields it back.
# Any other combination raises ArgumentError.
def target(user_agent = nil)
  unless user_agent
    raise ArgumentError, "user_agent is mandatory for an untargeted instance" unless @target
    return @target
  end

  raise ArgumentError, "this instance is targeted for #{@target}" if @target
  user_agent
end
private :target
|
281
|
+
|
282
|
+
# Returns the first record in @records that matches the resolved user-agent
# (see target), or nil when none does.
def find_record(user_agent = nil)
  agent = target(user_agent)
  @records.detect { |candidate| candidate.match?(agent) }
end
private :find_record
|
289
|
+
|
290
|
+
# Tells whether +request_uri+ may be fetched by the resolved user-agent.
#
# Returns true when no record applies.  Otherwise the matching record decides,
# and if that record has a delay and a previous check was made, the call
# sleeps for whatever part of the delay has not yet elapsed since then.
# The time of the check is remembered in @last_checked.
def allow?(request_uri, user_agent = nil)
  record = find_record(user_agent)
  return true unless record

  allowed = record.allow?(request_uri)
  wait = @last_checked && record.delay
  if wait
    remaining = wait - (Time.now - @last_checked)
    sleep remaining if remaining > 0
  end
  @last_checked = Time.now
  allowed
end
|
300
|
+
|
301
|
+
# Returns the options of the record that applies to the resolved
# user-agent, or an empty Hash when no record applies.
def options(user_agent = nil)
  record = find_record(user_agent)
  record ? record.options : {}
end
|
305
|
+
|
306
|
+
# One record of a robots.txt file: a group of user-agent lines together
# with the rule lines that apply to them.
class Record
  # agentlines - objects responding to #pattern (one per User-agent line)
  # rulelines  - rule line objects, or nil when the record has no rule lines
  #              (the grammar's empty opt_rulelines alternative yields nil)
  def initialize(agentlines, rulelines)
    @patterns = agentlines.map { |agentline| agentline.pattern }
    @acls = []
    @delay = nil
    @options = {}
    (rulelines || []).each { |ruleline|
      case ruleline
      when AccessControlLine
        @acls << ruleline
      when CrawlDelayLine
        @delay = ruleline.delay
      else
        # Any other line is kept as an option keyed by its lowercased token.
        @options[ruleline.token.downcase] = ruleline.value
      end
    }
    # Longest value first; for equal lengths an AllowLine sorts first.
    # allow? takes the first match, so this order decides precedence.
    @acls.sort! { |a, b|
      [
        b.value.length, b.is_a?(AllowLine) ? 1 : 0
      ] <=> [
        a.value.length, a.is_a?(AllowLine) ? 1 : 0
      ]
    }
  end

  attr_reader :delay, :options

  # True when any of the record's agent patterns matches +user_agent+.
  def match?(user_agent)
    @patterns.any? { |pattern|
      pattern.match(user_agent)
    }
  end

  # True when one of the agent patterns is the empty regexp, which is what
  # a "*" user-agent compiles to.
  def default?
    @patterns.include?(//)
  end

  # Verdict of the first access-control line matching +request_uri+;
  # true when none matches.
  def allow?(request_uri)
    @acls.each { |acl|
      if acl.match?(request_uri)
        return acl.allow?
      end
    }
    return true
  end
end
|
352
|
+
|
353
|
+
# Base class for a "token: value" line.  Construction stores both parts and
# then calls #compile, the hook subclasses override to derive extra state.
class Line
  attr_reader :token, :value

  def initialize(token, value)
    @token, @value = token, value
    compile
  end

  # Nothing to derive for a plain line; returns self.
  def compile
    self
  end
end
|
366
|
+
|
367
|
+
# A User-agent line.  Its value compiles to a pattern: "*" becomes the empty
# regexp, anything else a case-insensitive regexp for the literal value.
class AgentLine < Line
  attr_reader :pattern

  def compile
    @pattern = @value == '*' ? // : Regexp.new(Regexp.quote(@value), Regexp::IGNORECASE)
    self
  end
end
|
379
|
+
|
380
|
+
# Common behaviour of Allow and Disallow lines: the value is compiled into a
# regexp anchored at the start of the request URI.
class AccessControlLine < Line
  # Translates the value piece by piece:
  # * plain text is matched literally,
  # * "%2F" / "%2f" stays encoded and matches in either case,
  # * any other "%XX" escape becomes the character with that code,
  # * "*" becomes ".*",
  # * "$" anchors at the end of the string and stops the translation.
  # Anything else (such as a "%" that does not start an escape) raises
  # ParseError.
  def compile
    @empty = @value.empty?
    fragments = ['\A']
    scanner = StringScanner.new(@value)
    until scanner.eos?
      if (literal = scanner.scan(/[^%*$]+/))
        fragments << Regexp.quote(literal)
      elsif scanner.scan(/%([0-9a-f]{2})/i)
        code = scanner[1].to_i(16)
        fragments << (code == 0x2f ? '%2[fF]' : Regexp.quote('%c' % code))
      elsif scanner.scan(/\*/)
        fragments << '.*'
      elsif scanner.scan(/\$/)
        fragments << '\z'
        break
      else
        raise ParseError, 'unexpected characters: %s' % scanner.check(/.*/)
      end
    end
    @pattern = Regexp.new(fragments.join, Regexp::MULTILINE)
    self
  end

  # An empty value never matches; otherwise the compiled pattern decides.
  def match?(request_uri)
    return false if @empty

    !@pattern.match(request_uri).nil?
  end
end
|
412
|
+
|
413
|
+
# An Allow line: a matching request URI is permitted.
class AllowLine < AccessControlLine
  # Verdict returned by Record#allow? when this line is the first match.
  def allow?
    true
  end
end
|
418
|
+
|
419
|
+
# A Disallow line: a matching request URI is refused.
class DisallowLine < AccessControlLine
  # Verdict returned by Record#allow? when this line is the first match.
  def allow?
    false
  end
end
|
424
|
+
|
425
|
+
# A Crawl-delay line.  The delay is a Float when the value starts with a
# decimal number, an Integer when it starts with a whole number, and nil
# otherwise.
class CrawlDelayLine < Line
  attr_reader :delay

  def compile
    @delay =
      case @value
      when /\A((0|[1-9][0-9]*)\.[0-9]+)/ then @value.to_f
      when /\A(0|[1-9][0-9]*)/ then @value.to_i
      end
    self
  end
end
|
440
|
+
|
441
|
+
# A line whose field name is not one of the known tokens; the grammar's
# "extension" rule builds it.  It adds nothing to Line.
# (The spelling of the class name is kept because the grammar refers to it.)
class ExtentionLine < Line
end
|
443
|
+
end
|
444
|
+
end
|
data/test/helper.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Test bootstrap: activates the bundled gems, loads the test libraries and
# puts the library and test directories on the load path.
require 'rubygems'
require 'bundler'

begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  # Tell the user how to recover and exit with Bundler's own status code.
  $stderr.puts e.message, "Run `bundle install` to install missing gems"
  exit e.status_code
end

require 'test/unit'
require 'shoulda'

test_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(File.join(test_dir, '..', 'lib'))
$LOAD_PATH.unshift(test_dir)
require 'webrobots'

# Reopened so that shared test helpers have a place to live.
class Test::Unit::TestCase
end
|