webrobots 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,444 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
# Racc grammar for robots.txt.
#
# The token stream is produced by #parse (see the "inner" section): EOL, SPACE,
# COMMENT, VALUE and TOKEN symbols, the literal ':' and the lower-cased known
# field names ('user-agent', 'allow', 'disallow', 'crawl-delay', 'sitemap').
# The start rule builds a RobotsTxt from the collected records and sitemaps.
class Parser

rule
  # Start rule. The embedded action counts as val[1], so `body` is val[2].
  robotstxt : opt_blanklines
                {
                  @sitemaps = []
                }
              body
                {
                  body = val[2]
                  result = RobotsTxt.new(@site, body,
                    :target => @target, :sitemaps => @sitemaps)
                }

  body :
       | blocks
         opt_blanklines

  opt_blanklines :
       | blanklines

  blanklines : blankline
       | blanklines
         blankline

  blankline : eol

  # Every physical line ends in exactly one EOL token, so @lineno is advanced here.
  eol : EOL
          {
            @lineno += 1
          }

  opt_space :
       | SPACE

  opt_commentlines :
       | commentlines

  commentlines : comment
       | commentlines
         comment

  # Sitemap lines are accepted wherever a comment is and are collected on the side.
  comment : opt_space COMMENT eol
       | 'sitemap' ':' opt_space VALUE eol_opt_comment
           {
             @sitemaps << val[3]
           }

  # `blocks` evaluates to the array of Record objects seen so far.
  blocks : record
             {
               result = []
               result << val[0]
             }
       | commentblock
           {
             result = []
           }
       | blocks
         blanklines
         record
           {
             result << val[2]
           }
       | blocks
         blanklines
         rulelines
           {
             # Rule lines with no preceding User-agent line are dropped; they are
             # only reported when Ruby runs in verbose mode.
             val[2].each_with_index { |line, i|
               warn "%s line %d: %s: orphan rule line" %
                 [@site.to_s, @rulelinenos[i], line.token] if $VERBOSE
             }
           }
       | blocks
         blanklines
         commentblock

  commentblock : commentlines

  record : opt_commentlines
           agentlines
           opt_rulelines
             {
               result = Record.new(val[1], val[2])
             }

  agentlines : agentline
                 {
                   result = [val[0]]
                 }
       | agentlines
         agentline
           {
             result << val[1]
           }
       | agentlines
         comment

  agentline : 'user-agent' ':' opt_space VALUE eol_opt_comment
                {
                  result = AgentLine.new(val[0], val[3])
                }

  opt_rulelines :
       | rulelines

  # @rulelinenos is kept index-aligned with the rule line array so the orphan
  # warning above can look up a line number for every entry. The trailing eol of
  # a rule line has already been reduced (and @lineno advanced) by the time these
  # actions run, hence `@lineno - 1`.
  rulelines : ruleline
                {
                  result = [result]
                  @rulelinenos = [@lineno - 1]
                }
       | rulelines
         ruleline
           {
             result << val[1]
             @rulelinenos << @lineno - 1
           }
       | rulelines
         comment

  ruleline : allowline
       | disallowline
       | crawldelayline
       | extension

  allowline : 'allow' ':' opt_space VALUE eol_opt_comment
                {
                  result = AllowLine.new(val[0], val[3])
                }

  disallowline : 'disallow' ':' opt_space VALUE eol_opt_comment
                   {
                     result = DisallowLine.new(val[0], val[3])
                   }

  crawldelayline : 'crawl-delay' ':' opt_space VALUE eol_opt_comment
                     {
                       result = CrawlDelayLine.new(val[0], val[3])
                     }

  # Any other "name: value" field.
  extension : TOKEN ':' opt_space VALUE eol_opt_comment
                {
                  result = ExtentionLine.new(val[0], val[3])
                }

  eol_opt_comment : eol
       | comment
149
+
150
+ ---- header
151
+
152
+ require 'strscan'
153
+ require 'uri'
154
+
155
+ class WebRobots
156
# Root of the error hierarchy declared in this file.
class Error < StandardError; end

# Parse failure; #parse re-raises Racc::ParseError as this class.
class ParseError < Error; end
161
+
162
+ class RobotsTxt
163
+ ---- inner
164
+
165
# Creates a parser.
#
# target - stored in @target (nil when omitted); the grammar's start rule
#          passes it on to RobotsTxt.new as the :target option.
def initialize(target = nil)
  # Explicit empty parentheses: the superclass initializer receives no arguments.
  super()
  @target = target
end
169
+
170
# Builds a parser for +target+ and parses +input+ in one call.
#
# input  - the robots.txt content, handed to #parse unchanged
# target - optional value given to the constructor
# site   - optional value forwarded to #parse (nil when omitted)
#
# Returns whatever #parse returns.
#
# The instance method takes both the input and the site, so the site is passed
# explicitly; calling it with the input alone raised ArgumentError.
def self.parse(input, target = nil, site = nil)
  new(target).parse(input, site)
end
173
+
174
# Field names that the lexer emits as their own (lower-cased) token instead of a
# generic TOKEN. Frozen so the list cannot drift away from the regexp built from it.
KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap].freeze
# Case-insensitive alternation of the names above; each name is quoted so that
# it is always matched literally.
RE_KNOWN_TOKENS = /#{KNOWN_TOKENS.map { |name| Regexp.quote(name) }.join('|')}/i
176
+
177
# Tokenizes +input+ into @q and runs the generated parser over it.
#
# input - a String, or any object responding to #read (its #read result is used)
# site  - stored in @site; it prefixes error messages and is handed to
#         RobotsTxt.new by the grammar. Optional so that callers which only
#         have the input (such as the class-level parse) can omit it.
#
# Returns the result of do_parse.
# Raises ParseError when the lexer or the parser reports a Racc::ParseError.
def parse(input, site = nil)
  @q = []
  @errors = []
  @lineno = 1
  @site = site

  string = input.respond_to?(:read) ? input.read : input
  s = StringScanner.new(string)
  # True between a ':' and the value that follows it on the same line.
  value_expected = false

  until s.eos?
    if t = s.scan(/[ \t]*\r?\n/)
      @q << [:EOL, t]
      value_expected = false
      # Track the line while lexing so lexer errors report where they occurred.
      @lineno += 1
    elsif t = s.scan(/[ \t]+/)
      @q << [:SPACE, t]
    elsif t = s.scan(/:/)
      @q << [t, t]
      value_expected = true
    elsif t = s.scan(/#.*/)
      @q << [:COMMENT, t]
    elsif value_expected
      # The value runs up to trailing blanks followed by a comment or the line
      # end. `$` only matches before "\n", so "\r\n" is listed explicitly;
      # otherwise the "\r" of a CRLF line ending would become part of the value.
      if t = s.scan(/.*?(?=[ \t]*(?:#|\r\n|$))/)
        @q << [:VALUE, t]
      else
        parse_error "unexpected characters: %s" % (s.scan(/.+/) || s.getch)
      end
      value_expected = false
    elsif t = s.scan(RE_KNOWN_TOKENS)
      @q << [t.downcase, t]
    elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
      @q << [:TOKEN, t]
    else
      # Consume the offending text: when parse_error only records the message
      # (lax mode) the scanner must still advance or this loop never ends.
      parse_error "unexpected characters: %s" % (s.scan(/.+/) || s.getch)
    end
  end

  # The grammar expects every line, including the last one, to end with EOL.
  @q << [:EOL, ''] if !@q.empty? && @q.last.first != :EOL

  @pos = -1
  # The grammar counts lines itself (one increment per eol), starting from 1.
  @lineno = 1

  do_parse
rescue Racc::ParseError => e
  raise ParseError, e.message
end
226
+
227
# Advances the cursor and returns the token at the new position from @q
# (nil once the queue is exhausted). Called by the generated parser.
def next_token
  @pos += 1
  @q[@pos]
end
230
+
231
# Reports an unexpected token through parse_error.
#
# token_id - internal token id, turned into a name with token_to_str
# value    - the token's value, included in the message
# stack    - unused
def on_error(token_id, value, stack)
  token_name = token_to_str(token_id)
  parse_error format("unexpected %s: %s", token_name, value)
end
234
+
235
# Reports a problem at the current line.
#
# The message is prefixed with the site and @lineno. When @lax is set the
# message is appended to @errors; otherwise Racc::ParseError is raised.
def parse_error(message)
  located = "%s line %d: %s" % [@site.to_s, @lineno, message]
  raise Racc::ParseError, located unless @lax

  @errors << located
end
243
+
244
+ ---- footer
245
# Builds the rule set for one site.
#
# site    - stored in @site
# records - list of records (may be nil or empty); each must answer
#           default? and match?(user_agent)
# options - optional Hash; :target and :sitemaps are read from it
#
# @records ends up holding the non-default records first (only those matching
# @target when a target is set), followed by all default records.
def initialize(site, records, options = nil)
  super()
  @site = site
  @options = options || {}
  @last_checked = nil

  @target = @options[:target]
  @sitemaps = @options[:sitemaps] || []

  defaults, specific = (records || []).partition { |record| record.default? }
  specific = specific.select { |record| record.match?(@target) } if @target
  @records = specific + defaults
end
268
+
269
+ attr_reader :site, :sitemaps
270
+
271
# Resolves the user-agent to check against.
#
# A targeted instance (@target set) must be called without an argument and
# yields @target; an untargeted one requires the argument and yields it back.
# Raises ArgumentError when that contract is broken either way.
def target(user_agent = nil)
  unless user_agent
    raise ArgumentError, "user_agent is mandatory for an untargeted instance" unless @target
    return @target
  end

  raise ArgumentError, "this instance is targeted for #{@target}" if @target
  user_agent
end
private :target
281
+
282
# Returns the first entry of @records whose match? accepts the resolved
# user-agent (see #target), or nil when none does.
def find_record(user_agent = nil)
  agent = target(user_agent)
  @records.find { |candidate| candidate.match?(agent) }
end
private :find_record
289
+
290
# Tells whether +request_uri+ may be fetched by the resolved user-agent.
#
# Returns true when no record applies. Otherwise the record decides, and if the
# record has a delay and a previous check was recorded, this call sleeps for
# whatever part of that delay has not yet elapsed. The time of the check is
# remembered in @last_checked.
def allow?(request_uri, user_agent = nil)
  record = find_record(user_agent)
  return true unless record

  verdict = record.allow?(request_uri)
  if @last_checked
    delay = record.delay
    if delay
      remaining = delay - (Time.now - @last_checked)
      sleep remaining if remaining > 0
    end
  end
  @last_checked = Time.now
  verdict
end
300
+
301
# Returns the options of the record applying to the resolved user-agent, or an
# empty Hash when no record applies.
def options(user_agent = nil)
  record = find_record(user_agent)
  record ? record.options : {}
end
305
+
306
# One record of a robots.txt file: a group of agent lines plus the rule lines
# that apply to the agents they name.
class Record
  # agentlines - objects answering #pattern
  # rulelines  - rule line objects, or nil. The grammar passes the value of
  #              its empty "no rule lines" production straight through, so a
  #              record made only of agent lines can arrive with nil here; it is
  #              treated as an empty list.
  def initialize(agentlines, rulelines)
    @patterns = agentlines.map { |agentline| agentline.pattern }
    @acls = []
    @delay = nil
    @options = {}
    (rulelines || []).each { |ruleline|
      case ruleline
      when AccessControlLine
        @acls << ruleline
      when CrawlDelayLine
        @delay = ruleline.delay
      else
        # Anything else is kept as an option keyed by its lower-cased field name.
        @options[ruleline.token.downcase] = ruleline.value
      end
    }
    # Longest value first; for equal lengths an Allow line sorts before the others.
    rank = lambda { |acl| [acl.value.length, acl.is_a?(AllowLine) ? 1 : 0] }
    @acls.sort! { |a, b| rank.call(b) <=> rank.call(a) }
  end

  attr_reader :delay, :options

  # True when any agent pattern matches +user_agent+.
  def match?(user_agent)
    @patterns.any? { |pattern|
      pattern.match(user_agent)
    }
  end

  # True when one of the agent patterns is the empty regexp (what AgentLine
  # compiles "*" to).
  def default?
    @patterns.include?(//)
  end

  # The first access control line matching +request_uri+ decides; with no
  # match the URI is allowed.
  def allow?(request_uri)
    acl = @acls.find { |candidate| candidate.match?(request_uri) }
    acl ? acl.allow? : true
  end
end
352
+
353
# Base class for a parsed "token: value" line.
class Line
  attr_reader :token, :value

  # Stores both parts, then lets the subclass pre-compute whatever it needs.
  def initialize(token, value)
    @token, @value = token, value
    compile
  end

  # Hook run at the end of initialize; the base implementation does nothing.
  # Returns self.
  def compile
    self
  end
end
366
+
367
# A User-agent line; exposes the regexp used to test user-agent strings.
class AgentLine < Line
  attr_reader :pattern

  # "*" becomes the empty regexp; any other value becomes a case-insensitive
  # regexp matching that text literally. Returns self.
  def compile
    @pattern =
      if @value == '*'
        //
      else
        Regexp.new(Regexp.quote(@value), Regexp::IGNORECASE)
      end
    self
  end
end
379
+
380
# Common behaviour of Allow and Disallow lines: the value is translated into a
# regexp anchored at the start of the request URI.
class AccessControlLine < Line
  # Translates @value piece by piece:
  #   plain text -> matched literally
  #   %XX        -> the character with that code, except %2F, which stays
  #                 percent-encoded and matches either hex case
  #   *          -> any run of characters
  #   $          -> end of string; anything after it in the value is ignored
  # Raises ParseError on anything else (a '%' not followed by two hex digits).
  # Returns self.
  def compile
    @empty = @value.empty?
    source = '\A'.dup
    scanner = StringScanner.new(@value)
    until scanner.eos?
      if scanner.scan(/[^%*$]+/)
        source << Regexp.quote(scanner.matched)
      elsif scanner.scan(/%([0-9a-f]{2})/i)
        code = scanner[1].to_i(16)
        source << (code == 0x2f ? '%2[fF]' : Regexp.quote('%c' % code))
      elsif scanner.scan(/\*/)
        source << '.*'
      elsif scanner.scan(/\$/)
        source << '\z'
        break
      else
        raise ParseError, 'unexpected characters: %s' % scanner.check(/.*/)
      end
    end
    @pattern = Regexp.new(source, Regexp::MULTILINE)
    self
  end

  # An empty value never matches; otherwise the compiled pattern decides.
  def match?(request_uri)
    return false if @empty

    @pattern.match(request_uri) ? true : false
  end
end
412
+
413
# An Allow line: a match grants access.
class AllowLine < AccessControlLine
  # Always true.
  def allow?
    true
  end
end
418
+
419
# A Disallow line: a match denies access.
class DisallowLine < AccessControlLine
  # Always false.
  def allow?
    false
  end
end
424
+
425
# A Crawl-delay line; exposes the delay as a number.
class CrawlDelayLine < Line
  attr_reader :delay

  # A value starting with a decimal number becomes a Float, one starting with
  # an integer becomes an Integer, and anything else leaves the delay nil.
  # Returns self.
  def compile
    @delay =
      if /\A((0|[1-9][0-9]*)\.[0-9]+)/ === @value
        @value.to_f
      elsif /\A(0|[1-9][0-9]*)/ === @value
        @value.to_i
      end
    self
  end
end
440
+
441
# A line whose field name is not one of the known ones; it adds nothing to Line.
# (The spelling of the class name is what the grammar refers to.)
class ExtentionLine < Line; end
443
+ end
444
+ end
data/test/helper.rb ADDED
@@ -0,0 +1,18 @@
1
# Shared setup for the test suite: activates the bundle, loads the test
# libraries and puts the library and test directories on the load path.
require 'rubygems'
require 'bundler'

begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  # Tell the user how to recover, then exit with Bundler's own status code.
  $stderr.puts bundler_error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end

require 'test/unit'
require 'shoulda'

# The test directory ends up first on the load path, the lib directory second.
test_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(test_dir, File.join(test_dir, '..', 'lib'))
require 'webrobots'

# Reopened with an empty body.
class Test::Unit::TestCase
end