webrobots 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,444 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ class Parser
4
+
5
+ rule
6
+ robotstxt : opt_blanklines
7
+ {
8
+ @sitemaps = []
9
+ }
10
+ body
11
+ {
12
+ body = val[2]
13
+ result = RobotsTxt.new(@site, body,
14
+ :target => @target, :sitemaps => @sitemaps)
15
+ }
16
+
17
+ body :
18
+ | blocks
19
+ opt_blanklines
20
+
21
+ opt_blanklines :
22
+ | blanklines
23
+
24
+ blanklines : blankline
25
+ | blanklines
26
+ blankline
27
+
28
+ blankline : eol
29
+
30
+ eol : EOL
31
+ {
32
+ @lineno += 1
33
+ }
34
+
35
+ opt_space :
36
+ | SPACE
37
+
38
+ opt_commentlines :
39
+ | commentlines
40
+
41
+ commentlines : comment
42
+ | commentlines
43
+ comment
44
+
45
+ comment : opt_space COMMENT eol
46
+ | 'sitemap' ':' opt_space VALUE eol_opt_comment
47
+ {
48
+ @sitemaps << val[3]
49
+ }
50
+
51
+ blocks : record
52
+ {
53
+ result = []
54
+ result << val[0]
55
+ }
56
+ | commentblock
57
+ {
58
+ result = []
59
+ }
60
+ | blocks
61
+ blanklines
62
+ record
63
+ {
64
+ result << val[2]
65
+ }
66
+ | blocks
67
+ blanklines
68
+ rulelines
69
+ {
70
+ val[2].each_with_index { |line, i|
71
+ warn "%s line %d: %s: orphan rule line" %
72
+ [@site.to_s, @rulelinenos[i], line.token] if $VERBOSE
73
+ }
74
+ }
75
+ | blocks
76
+ blanklines
77
+ commentblock
78
+
79
+ commentblock : commentlines
80
+
81
+ record : opt_commentlines
82
+ agentlines
83
+ opt_rulelines
84
+ {
85
+ result = Record.new(val[1], val[2])
86
+ }
87
+
88
+ agentlines : agentline
89
+ {
90
+ result = [val[0]]
91
+ }
92
+ | agentlines
93
+ agentline
94
+ {
95
+ result << val[1]
96
+ }
97
+ | agentlines
98
+ comment
99
+
100
+ agentline : 'user-agent' ':' opt_space VALUE eol_opt_comment
101
+ {
102
+ result = AgentLine.new(val[0], val[3])
103
+ }
104
+
105
+ opt_rulelines :
106
+ | rulelines
107
+
108
+ rulelines : ruleline
109
+ {
110
+ result = [result]
111
+ @rulelinenos = []
112
+ }
113
+ | rulelines
114
+ ruleline
115
+ {
116
+ result << val[1]
117
+ @rulelinenos << @lineno
118
+ }
119
+ | rulelines
120
+ comment
121
+
122
+ ruleline : allowline
123
+ | disallowline
124
+ | crawldelayline
125
+ | extension
126
+
127
+ allowline : 'allow' ':' opt_space VALUE eol_opt_comment
128
+ {
129
+ result = AllowLine.new(val[0], val[3])
130
+ }
131
+
132
+ disallowline : 'disallow' ':' opt_space VALUE eol_opt_comment
133
+ {
134
+ result = DisallowLine.new(val[0], val[3])
135
+ }
136
+
137
+ crawldelayline : 'crawl-delay' ':' opt_space VALUE eol_opt_comment
138
+ {
139
+ result = CrawlDelayLine.new(val[0], val[3])
140
+ }
141
+
142
+ extension : TOKEN ':' opt_space VALUE eol_opt_comment
143
+ {
144
+ result = ExtentionLine.new(val[0], val[3])
145
+ }
146
+
147
+ eol_opt_comment : eol
148
+ | comment
149
+
150
+ ---- header
151
+
152
+ require 'strscan'
153
+ require 'uri'
154
+
155
+ class WebRobots
156
# Base class for the exceptions declared in this file.
+ class Error < StandardError
157
+ end
158
+
159
# Raised by the parser's #parse in place of a rescued Racc::ParseError.
+ class ParseError < Error
160
+ end
161
+
162
+ class RobotsTxt
163
+ ---- inner
164
+
165
# Creates a parser.
# target - optional value stored in @target; the grammar's top-level action
#          passes it on to RobotsTxt.new as :target.
+ def initialize(target = nil)
166
+ super()
167
+ @target = target
168
+ end
169
+
170
# Convenience entry point: builds a parser bound to +target+ and runs #parse.
#
# input  - robots.txt content, handed to #parse unchanged.
# target - optional user agent the new parser is bound to.
# site   - optional site identifier forwarded to #parse (nil when omitted).
#
# Returns whatever #parse returns.
#
# The previous body called `parse(input)` with one argument although #parse
# takes the site as a second one, so every call raised ArgumentError.
def self.parse(input, target = nil, site = nil)
  new(target).parse(input, site)
end
173
+
174
# Field names the lexer turns into keyword tokens (compared case-insensitively).
KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap].freeze

# Matches one of KNOWN_TOKENS, but only when it is the complete field name,
# i.e. directly followed by optional blanks and the ':' separator.  Without
# the lookahead a field such as "Allowed-hosts:" was split into the keyword
# 'allow' plus a stray TOKEN.  The lookahead consumes nothing, so a scan still
# returns just the keyword text.
RE_KNOWN_TOKENS = /(?:#{KNOWN_TOKENS.map { |name| Regexp.quote(name) }.join('|')})(?=[ \t]*:)/i
176
+
177
# Tokenizes +input+ into @q and then runs the generated parser (#do_parse).
#
# input - a String, or an object responding to #read whose result is used.
# site  - value stored in @site; used as the prefix of error and warning
#         messages and passed to RobotsTxt.new by the grammar.  Optional.
#
# Returns the result of #do_parse.
# Raises ParseError (with the same message) when Racc::ParseError is raised
# during lexing or parsing.
#
# Lexer rules, in priority order:
# * line end (CR, LF or CRLF, with leading blanks) -> EOL
# * blanks -> SPACE, ':' -> ':' (a value is expected afterwards)
# * '#' up to the line end -> COMMENT
# * while a value is expected: text up to trailing blanks / '#' / line end
#   -> VALUE; an empty VALUE is emitted when the line ends or a comment
#   starts first, so "Disallow:" reaches the grammar as an empty value
# * otherwise: keyword (lower-cased token type) or generic TOKEN
def parse(input, site = nil)
  @q = []
  @errors = []
  @lineno = 1
  @site = site

  string = input.respond_to?(:read) ? input.read : input
  s = StringScanner.new(string)
  value_expected = false

  until s.eos?
    if t = s.scan(/[ \t]*(?:\r\n?|\n)/)
      @q << [:VALUE, ''] if value_expected
      @q << [:EOL, t]
      value_expected = false
      # Track the line while lexing so #parse_error reports the right one.
      @lineno += 1
    elsif t = s.scan(/[ \t]+/)
      @q << [:SPACE, t]
    elsif t = s.scan(/:/)
      @q << [t, t]
      value_expected = true
    elsif t = s.scan(/#[^\r\n]*/)
      if value_expected
        @q << [:VALUE, '']
        value_expected = false
      end
      @q << [:COMMENT, t]
    elsif value_expected
      # Lazy match: stops before trailing blanks, a comment or the line end,
      # so neither padding nor a CR ends up inside the value.
      if t = s.scan(/[^\r\n]*?(?=[ \t]*(?:#|[\r\n]|\z))/)
        @q << [:VALUE, t]
      else
        parse_error "unexpected characters: %s" % s.scan(/[^\r\n]*/)
      end
      value_expected = false
    elsif t = s.scan(RE_KNOWN_TOKENS)
      @q << [t.downcase, t]
    elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
      @q << [:TOKEN, t]
    else
      # Consume the offending rest of the line so the loop always advances,
      # even when #parse_error records the problem instead of raising.
      parse_error "unexpected characters: %s" % s.scan(/[^\r\n]*/)
    end
  end

  # Input that stops right after "field:" still needs its (empty) value.
  @q << [:VALUE, ''] if value_expected
  @q << [:EOL, ''] if !@q.empty? && @q.last.first != :EOL

  # The grammar's eol action counts lines again while parsing.
  @lineno = 1
  @pos = -1

  do_parse
rescue Racc::ParseError => e
  raise ParseError, e.message
end
226
+
227
# Advances @pos and returns the [type, value] pair queued at that index by
# #parse, or nil once the queue is exhausted.
# NOTE(review): presumably the token callback Racc's do_parse invokes; confirm.
+ def next_token
228
+ @q[@pos += 1]
229
+ end
230
+
231
# Builds an "unexpected <token name>: <value>" message and reports it via
# #parse_error. +stack+ is accepted but not used here.
# NOTE(review): presumably Racc's syntax-error hook; confirm.
+ def on_error(token_id, value, stack)
232
+ parse_error "unexpected %s: %s" % [token_to_str(token_id), value]
233
+ end
234
+
235
# Reports a problem, prefixing +message+ with @site and the current @lineno.
# When @lax is truthy the text is appended to @errors; otherwise
# Racc::ParseError is raised with it.
# NOTE(review): @lax is not assigned anywhere in the visible code, so the
# raising branch is the one taken unless it is set elsewhere.
+ def parse_error(message)
236
+ message = "%s line %d: %s" % [@site.to_s, @lineno, message]
237
+ if @lax
238
+ @errors << message
239
+ else
240
+ raise Racc::ParseError, message
241
+ end
242
+ end
243
+
244
+ ---- footer
245
# Builds the parsed representation of one robots.txt.
# site    - stored in @site (exposed through attr_reader).
# records - list of records, or nil/empty for none.
# options - optional hash; :target and :sitemaps are the keys read here.
+ def initialize(site, records, options = nil)
246
+ super()
247
+ @site = site
248
+ @options = options || {}
249
# Time of the previous allow? call that found a record; nil until then.
+ @last_checked = nil
250
+
251
+ @target = @options[:target]
252
+ @sitemaps = @options[:sitemaps] || []
253
+
254
+ if records && !records.empty?
255
+ @records, defaults = [], []
256
# Records answering default? are set aside and appended last; when a target
# is set, the remaining records are kept only if they match it.
+ records.each { |record|
257
+ if record.default?
258
+ defaults << record
259
+ elsif !@target || record.match?(@target)
260
+ @records << record
261
+ end
262
+ }
263
+ @records.concat(defaults)
264
+ else
265
+ @records = []
266
+ end
267
+ end
268
+
269
+ attr_reader :site, :sitemaps
270
+
271
# Resolves which user agent a query applies to (private helper).
# With +user_agent+ given: returns it, but raises ArgumentError when this
# instance already has a @target.
# Without it: returns @target, raising ArgumentError when none is set.
+ def target(user_agent = nil)
272
+ if user_agent
273
+ raise ArgumentError, "this instance is targeted for #{@target}" if @target
274
+ user_agent
275
+ else
276
+ raise ArgumentError, "user_agent is mandatory for an untargeted instance" if !@target
277
+ @target
278
+ end
279
+ end
280
+ private :target
281
+
282
# Returns the first entry of @records whose match? accepts the user agent
# resolved through #target, or nil when none does (private helper).
# Raises ArgumentError under the same conditions as #target.
+ def find_record(user_agent = nil)
283
+ user_agent = target(user_agent)
284
+ @records.find { |record|
285
+ record.match?(user_agent)
286
+ }
287
+ end
288
+ private :find_record
289
+
290
# Tells whether +request_uri+ may be fetched by the given (or targeted) agent.
# Returns true when no record applies; otherwise the record's own verdict.
# Side effect: when the record has a delay and a previous call was recorded,
# sleeps for whatever part of that delay has not yet elapsed, then stores the
# current time in @last_checked.
+ def allow?(request_uri, user_agent = nil)
291
+ record = find_record(user_agent) or return true
292
+ allow = record.allow?(request_uri)
293
+ if @last_checked and delay = record.delay
294
# Subtract the time already spent since the previous check.
+ delay -= Time.now - @last_checked
295
+ sleep delay if delay > 0
296
+ end
297
+ @last_checked = Time.now
298
+ return allow
299
+ end
300
+
301
# Returns the options hash of the record found for the given (or targeted)
# user agent, or an empty hash when no record applies.
+ def options(user_agent = nil)
302
+ record = find_record(user_agent) or return {}
303
+ record.options
304
+ end
305
+
306
# One robots.txt record: a group of user-agent lines plus the rule lines
# that apply to them.
class Record
  attr_reader :delay, :options

  # agentlines - objects responding to #pattern (one Regexp per agent line).
  # rulelines  - rule line objects, or nil when the record has no rule lines
  #              (the grammar's empty opt_rulelines yields nil, which used to
  #              raise NoMethodError here).
  def initialize(agentlines, rulelines)
    @patterns = agentlines.map { |agentline| agentline.pattern }
    @acls = []
    @delay = nil
    @options = {}

    (rulelines || []).each { |ruleline|
      case ruleline
      when AccessControlLine
        @acls << ruleline
      when CrawlDelayLine
        @delay = ruleline.delay
      else
        # Anything else is kept as an option keyed by its lower-cased token.
        @options[ruleline.token.downcase] = ruleline.value
      end
    }

    # Longest value first; for equal lengths an AllowLine comes before other
    # access control lines.  #allow? returns the first match in this order.
    @acls.sort! { |a, b|
      [
        b.value.length, b.is_a?(AllowLine) ? 1 : 0
      ] <=> [
        a.value.length, a.is_a?(AllowLine) ? 1 : 0
      ]
    }
  end

  # True when any agent pattern matches +user_agent+.
  def match?(user_agent)
    @patterns.any? { |pattern|
      pattern.match(user_agent)
    }
  end

  # True when one of the agent patterns is the empty regexp, which is what
  # AgentLine compiles a "*" value to.
  def default?
    @patterns.include?(//)
  end

  # Verdict of the first access control line matching +request_uri+;
  # true when none matches.
  def allow?(request_uri)
    @acls.each { |acl|
      return acl.allow? if acl.match?(request_uri)
    }
    true
  end
end
352
+
353
# Base class for one parsed "token: value" line.
class Line
  attr_reader :token, :value

  # token - the field name as written in the file.
  # value - the text after the colon.
  # Calls #compile so subclasses can precompute what they need.
  def initialize(token, value)
    @token = token
    @value = value
    compile
  end

  # Hook for subclasses; does nothing here. Returns self.
  def compile
    self
  end
end

# A user-agent line; exposes the Regexp used to test agent strings.
class AgentLine < Line
  attr_reader :pattern

  # "*" compiles to the empty regexp (matches anything); any other value
  # becomes a case-insensitive literal substring match.
  def compile
    if @value == '*'
      @pattern = //
    else
      @pattern = Regexp.new(Regexp.quote(@value), Regexp::IGNORECASE)
    end
    self
  end
end

# Common behaviour of allow/disallow lines: the value is compiled into a
# regexp anchored at the start of the request URI.
#
# Value syntax handled by #compile:
# * "*"   matches any run of characters
# * "$"   anchors the end; anything after it in the value is ignored
# * "%XX" percent escapes are normalized (see #normalize_escape)
# * a "%" not followed by two hex digits is taken literally (this used to
#   raise ParseError)
#
# #match? applies the same escape normalization to the request URI, so an
# escaped rule matches the escaped and the unescaped spelling of a request
# alike.  Previously only the rule was decoded, so a rule such as "/a%3Cb"
# could never match a request for "/a%3Cb".
class AccessControlLine < Line
  def compile
    @empty = @value.empty?
    # dup: the buffer is appended to, so do not mutate the literal itself.
    re_src = '\A'.dup
    s = StringScanner.new(@value)
    until s.eos?
      if t = s.scan(/[^%*$]+/)
        re_src << Regexp.quote(t)
      elsif s.scan(/%([0-9a-f]{2})/i)
        re_src << Regexp.quote(normalize_escape(s[1]))
      elsif s.scan(/\*/)
        re_src << '.*'
      elsif s.scan(/\$/)
        re_src << '\z'
        break
      else
        # Only a stray "%" can get here; match it as an ordinary character.
        re_src << Regexp.quote(s.getch)
      end
    end
    @pattern = Regexp.new(re_src, Regexp::MULTILINE)
    self
  end

  # True when this line applies to +request_uri+. A line with an empty value
  # never matches.
  def match?(request_uri)
    return false if @empty
    normalized = request_uri.gsub(/%[0-9a-f]{2}/i) { |escape|
      normalize_escape(escape[1, 2])
    }
    !!@pattern.match(normalized)
  end

  private

  # Canonical form of the escape "%<hex>": escapes for "/" (0x2f) and for
  # bytes >= 0x80 stay escaped, with upper-case hex digits; every other
  # escape is replaced by the character it encodes.
  def normalize_escape(hex)
    code = hex.to_i(16)
    if code == 0x2f || code >= 0x80
      '%' + hex.upcase
    else
      code.chr
    end
  end
end

class AllowLine < AccessControlLine
  def allow?
    true
  end
end

class DisallowLine < AccessControlLine
  def allow?
    false
  end
end

# A crawl-delay line; #delay is a Float for "N.M" values, an Integer for
# plain digits, and nil when the value starts with neither.
class CrawlDelayLine < Line
  attr_reader :delay

  def compile
    case @value
    when /\A((0|[1-9][0-9]*)\.[0-9]+)/
      @delay = @value.to_f
    when /\A(0|[1-9][0-9]*)/
      @delay = @value.to_i
    else
      @delay = nil
    end
    self
  end
end

# Any line with an unrecognized token (class name kept as spelled, since the
# grammar refers to it).
class ExtentionLine < Line
end
443
+ end
444
+ end
data/test/helper.rb ADDED
@@ -0,0 +1,18 @@
1
# Shared test bootstrap: sets up Bundler, loads the test libraries, puts the
# library and test directories on the load path, and requires the code under test.
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
# Activate the :default and :development dependency groups.
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
# Print the problem plus a hint and exit with the status the error carries.
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+ require 'test/unit'
11
+ require 'shoulda'
12
+
13
# Prepend ../lib and this directory to the load path before requiring the library.
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
14
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
15
+ require 'webrobots'
16
+
17
# Reopened with an empty body; nothing is added to the test case class here.
+ class Test::Unit::TestCase
18
+ end