webrobots 0.0.1

data/.document ADDED
@@ -0,0 +1,5 @@
+ lib/**/*.rb
+ bin/*
+ -
+ features/**/*.feature
+ LICENSE.txt
data/Gemfile ADDED
@@ -0,0 +1,14 @@
+ source "http://rubygems.org"
+ # Add dependencies required to use your gem here.
+ # Example:
+ # gem "activesupport", ">= 2.3.5"
+ gem "racc", ">= 0"
+
+ # Add dependencies to develop your gem here.
+ # Include everything needed to run rake, tests, features, etc.
+ group :development do
+   gem "shoulda", ">= 0"
+   gem "bundler", "~> 1.0.0"
+   gem "jeweler", "~> 1.5.1"
+   gem "rcov", ">= 0"
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,22 @@
+ GEM
+   remote: http://rubygems.org/
+   specs:
+     git (1.2.5)
+     jeweler (1.5.2)
+       bundler (~> 1.0.0)
+       git (>= 1.2.5)
+       rake
+     racc (1.4.6)
+     rake (0.8.7)
+     rcov (0.9.9)
+     shoulda (2.11.3)
+
+ PLATFORMS
+   ruby
+
+ DEPENDENCIES
+   bundler (~> 1.0.0)
+   jeweler (~> 1.5.1)
+   racc
+   rcov
+   shoulda
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
+ Copyright (c) 2011 Akinori MUSHA
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,19 @@
+ = webrobots
+
+ This is a library to help write robots.txt compliant web robots.
+
+ == Contributing to webrobots
+
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
+ * Check out the issue tracker to make sure someone hasn't already requested and/or contributed it
+ * Fork the project
+ * Start a feature/bugfix branch
+ * Commit and push until you are happy with your contribution
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate it to its own commit so I can cherry-pick around it.
+
+ == Copyright
+
+ Copyright (c) 2010, 2011 Akinori MUSHA. See LICENSE.txt for
+ further details.
+
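A minimal usage sketch based on the public API in data/lib/webrobots.rb below; the robot name and URL are illustrative.

  require 'webrobots'

  robots = WebRobots.new('MyBot/1.0')

  url = 'http://www.example.com/some/page.html'

  # allowed? fetches and caches http://www.example.com/robots.txt on first
  # use, then checks the request path against the record matching MyBot/1.0.
  crawl(url) if robots.allowed?(url)   # crawl is a stand-in for your own code

  robots.sitemaps(url)   # => array of Sitemap URLs declared in robots.txt
  robots.options(url)    # => hash of extension fields, keyed by lower-cased name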
data/Rakefile ADDED
@@ -0,0 +1,61 @@
+ require 'rubygems'
+ require 'bundler'
+ begin
+   Bundler.setup(:default, :development)
+ rescue Bundler::BundlerError => e
+   $stderr.puts e.message
+   $stderr.puts "Run `bundle install` to install missing gems"
+   exit e.status_code
+ end
+ require 'rake'
+
+ require 'jeweler'
+ Jeweler::Tasks.new do |gem|
+   # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
+   gem.name = "webrobots"
+   # gem.homepage = "http://github.com/knu/webrobots"
+   gem.license = "MIT"
+   gem.summary = %Q{A library to help write robots.txt compliant web robots}
+   gem.description = <<-'EOS'
+ This library helps write robots.txt compliant web robots.
+   EOS
+   gem.email = "knu@idaemons.org"
+   gem.authors = ["Akinori MUSHA"]
+   # Include your dependencies below. Runtime dependencies are required when using your gem,
+   # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
+   # gem.add_runtime_dependency 'jabber4r', '> 0.1'
+   # gem.add_development_dependency 'rspec', '> 1.2.3'
+ end
+ Jeweler::RubygemsDotOrgTasks.new
+
+ require 'rake/testtask'
+ Rake::TestTask.new(:test) do |test|
+   test.libs << 'lib' << 'test'
+   test.pattern = 'test/**/test_*.rb'
+   test.verbose = true
+ end
+
+ require 'rcov/rcovtask'
+ Rcov::RcovTask.new do |test|
+   test.libs << 'test'
+   test.pattern = 'test/**/test_*.rb'
+   test.verbose = true
+ end
+
+ task :default => :test
+
+ task :test => 'lib/webrobots/robotstxt.rb'
+
+ file 'lib/webrobots/robotstxt.rb' => 'lib/webrobots/robotstxt.ry' do
+   sh 'racc', '-o', 'lib/webrobots/robotstxt.rb', 'lib/webrobots/robotstxt.ry'
+ end
+
+ require 'rake/rdoctask'
+ Rake::RDocTask.new do |rdoc|
+   version = File.exist?('VERSION') ? File.read('VERSION') : ""
+
+   rdoc.rdoc_dir = 'rdoc'
+   rdoc.title = "webrobots #{version}"
+   rdoc.rdoc_files.include('README*')
+   rdoc.rdoc_files.include('lib/**/*.rb')
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.0.1
data/lib/webrobots.rb ADDED
@@ -0,0 +1,135 @@
+ require 'webrobots/robotstxt'
+ require 'uri'
+ require 'net/https'
+
+ class WebRobots
+   # Creates a WebRobots object for a robot named +user_agent+, with
+   # optional +options+.
+   #
+   # * :http_get => a custom method, proc, or anything that responds to
+   #   .call(uri), to be used for fetching robots.txt. It must return
+   #   the response body if successful, or raise Net::HTTPNotFound if
+   #   the resource is not found. Any other error is regarded as a
+   #   blanket ban.
+   def initialize(user_agent, options = nil)
+     @user_agent = user_agent
+     @parser = RobotsTxt::Parser.new(user_agent)
+
+     options ||= {}
+     @http_get = options[:http_get] || method(:http_get)
+
+     @robotstxt = {}
+   end
+
+   # Returns the robot name initially given.
+   attr_reader :user_agent
+
+   # Tests if the robot is allowed to access a resource at +url+. If a
+   # malformed URI string is given, URI::InvalidURIError is raised. If
+   # a relative URI or a non-HTTP/HTTPS URI is given, ArgumentError is
+   # raised.
+   def allowed?(url)
+     site, request_uri = split_uri(url)
+     return true if request_uri == '/robots.txt'
+     robots_txt(site).allow?(request_uri)
+   end
+
+   # Equivalent to !allowed?(url).
+   def disallowed?(url)
+     !allowed?(url)
+   end
+
+   # Returns extended option values for a resource at +url+ in a hash
+   # with each field name lower-cased. See allowed?() for a list of
+   # errors that may be raised.
+   def options(url)
+     site, = split_uri(url)
+     robots_txt(site).options
+   end
+
+   # Equivalent to options(url)[token.downcase].
+   def option(url, token)
+     options(url)[token.downcase]
+   end
+
+   # Returns an array of Sitemap URLs. See allowed?() for a list of
+   # errors that may be raised.
+   def sitemaps(url)
+     site, = split_uri(url)
+     robots_txt(site).sitemaps
+   end
+
+   private
+
+   def split_uri(url)
+     site =
+       if url.is_a?(URI)
+         url.dup
+       else
+         begin
+           URI.parse(url)
+         rescue => e
+           raise ArgumentError, e.message
+         end
+       end
+
+     site.scheme && site.host or
+       raise ArgumentError, "non-absolute URI: #{url}"
+
+     site.is_a?(URI::HTTP) or
+       raise ArgumentError, "non-HTTP/HTTPS URI: #{url}"
+
+     request_uri = site.request_uri
+     if (host = site.host).match(/[[:upper:]]/)
+       site.host = host.downcase
+     end
+     site.path = '/'
+     return site, request_uri
+   end
+
+   def robots_txt(site)
+     cache_robots_txt(site) {
+       fetch_robots_txt(site)
+     }
+   end
+
+   def fetch_robots_txt(site)
+     begin
+       body = @http_get.call(site + 'robots.txt')
+     rescue Net::HTTPNotFound
+       return ''
+     end
+     @parser.parse(body, site)
+   end
+
+   def cache_robots_txt(site, &block)
+     if @robotstxt.key?(site)
+       @robotstxt[site]
+     else
+       @robotstxt[site] = block.call(site)
+     end
+   end
+
+   def http_get(uri)
+     referer = nil
+     10.times {
+       http = Net::HTTP.new(uri.host, uri.port)
+       http.use_ssl = uri.is_a?(URI::HTTPS)
+       http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+       header = { 'User-Agent' => @user_agent }
+       header['Referer'] = referer if referer
+       # header is destroyed by this in ruby 1.9.2!
+       response = http.get(uri.request_uri, header)
+       case response
+       when Net::HTTPSuccess
+         return response.body
+       when Net::HTTPRedirection
+         referer = uri.to_s
+         uri = URI(response['location'])
+       else
+         response.value
+       end
+     }
+     raise 'too many HTTP redirects'
+   end
+ end
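The :http_get option documented in WebRobots#initialize above accepts anything that responds to call(uri). Below is an illustrative sketch of a replacement fetcher using open-uri, not part of the gem; note that this version expects Net::HTTPNotFound to be raised when robots.txt is missing, so any other error (including open-uri's OpenURI::HTTPError) ends up being treated as a blanket ban.

  require 'webrobots'
  require 'open-uri'

  # Replacement robots.txt fetcher: must return the response body on success.
  fetcher = lambda { |uri|
    uri.read('User-Agent' => 'MyBot/1.0')
  }

  robots = WebRobots.new('MyBot/1.0', :http_get => fetcher)
  robots.allowed?('http://www.example.com/path')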
data/lib/webrobots/robotstxt.rb ADDED
@@ -0,0 +1,714 @@
+ #
+ # DO NOT MODIFY!!!!
+ # This file is automatically generated by Racc 1.4.6
+ # from Racc grammer file "".
+ #
+
+ require 'racc/parser.rb'
+
+
+ require 'strscan'
+ require 'uri'
+
+ class WebRobots
+ class Error < StandardError
+ end
+
+ class ParseError < Error
+ end
+
+ class RobotsTxt
+ class Parser < Racc::Parser
+
+ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
+
+ def initialize(target = nil)
+ super()
+ @target = target
+ end
+
+ def self.parse(input, target = nil)
+ new(target).parse(input)
+ end
+
+ KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
+ RE_KNOWN_TOKENS = /#{KNOWN_TOKENS.join('|')}/i
+
+ def parse(input, site)
+ @q = []
+ @errors = []
+ @lineno = 1
+ @site = site
+
+ string = input.respond_to?(:read) ? input.read : input
+ s = StringScanner.new(string)
+ value_expected = false
+
+ until s.eos?
+ if t = s.scan(/[ \t]*\r?\n/)
+ @q << [:EOL, t]
+ value_expected = false
+ elsif t = s.scan(/[ \t]+/)
+ @q << [:SPACE, t]
+ elsif t = s.scan(/:/)
+ @q << [t, t]
+ value_expected = true
+ elsif t = s.scan(/#.*/)
+ @q << [:COMMENT, t]
+ else
+ if value_expected
+ if t = s.scan(/.*?(?=[ \t]*(?:#|$))/)
+ @q << [:VALUE, t]
+ else
+ parse_error @lineno, "unexpected characters: %s" % s.check(/.*/)
+ end
+ value_expected = false
+ else
+ if t = s.scan(RE_KNOWN_TOKENS)
+ @q << [t.downcase, t]
+ elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
+ @q << [:TOKEN, t]
+ else
+ parse_error "unexpected characters: %s" % s.check(/.*/)
+ end
+ end
+ end
+ end
+
+ @q << [:EOL, ''] if !@q.empty? && @q.last.first != :EOL
+
+ @pos = -1
+
+ do_parse
+ rescue Racc::ParseError => e
+ raise ParseError, e.message
+ end
+
+ def next_token
+ @q[@pos += 1]
+ end
+
+ def on_error(token_id, value, stack)
+ parse_error "unexpected %s: %s" % [token_to_str(token_id), value]
+ end
+
+ def parse_error(message)
+ message = "%s line %d: %s" % [@site.to_s, @lineno, message]
+ if @lax
+ @errors << message
+ else
+ raise Racc::ParseError, message
+ end
+ end
+
+ ...end robotstxt.ry/module_eval...
105
+ ##### State transition tables begin ###
106
+
107
+ racc_action_table = [
108
+ 6, 13, -11, 17, 53, 6, -13, 37, 38, 39,
109
+ 40, 13, -11, 17, 47, 28, 28, 37, 38, 39,
110
+ 40, 13, -11, 17, 50, 51, 52, 37, 38, 39,
111
+ 40, 13, -11, 17, 13, 54, 25, 37, 38, 39,
112
+ 40, 13, -11, 17, 13, 13, -13, 13, -11, 17,
113
+ 6, 13, -14, 17, 6, 13, 13, 17, 6, 13,
114
+ 13, 17, 6, 13, 13, 17, 6, 13, 24, 17,
115
+ 6, 13, 63, 17, 64, 65, 66, 67, 6, 10,
116
+ 6, 7, 6 ]
117
+
118
+ racc_action_check = [
119
+ 22, 22, 22, 22, 40, 24, 22, 22, 22, 22,
120
+ 22, 26, 26, 26, 28, 20, 26, 26, 26, 26,
121
+ 26, 46, 46, 46, 37, 38, 39, 46, 46, 46,
122
+ 46, 30, 30, 30, 25, 42, 17, 30, 30, 30,
123
+ 30, 8, 8, 8, 47, 50, 8, 14, 14, 14,
124
+ 63, 63, 14, 63, 54, 54, 51, 54, 64, 64,
125
+ 52, 64, 65, 65, 53, 65, 66, 66, 16, 66,
126
+ 67, 67, 55, 67, 56, 57, 58, 59, 12, 7,
127
+ 3, 1, 0 ]
128
+
129
+ racc_action_pointer = [
130
+ 80, 81, nil, 78, nil, nil, nil, 79, 38, nil,
131
+ nil, nil, 76, nil, 44, nil, 64, 30, nil, nil,
132
+ 7, nil, -2, nil, 3, 31, 8, nil, 8, nil,
133
+ 28, nil, nil, nil, nil, nil, nil, 18, 19, 20,
134
+ -2, nil, 28, nil, nil, nil, 18, 41, nil, nil,
135
+ 42, 53, 57, 61, 52, 65, 67, 68, 69, 70,
136
+ nil, nil, nil, 48, 56, 60, 64, 68, nil, nil,
137
+ nil, nil, nil ]
138
+
139
+ racc_action_default = [
140
+ -5, -45, -1, -6, -7, -9, -10, -45, -3, -8,
141
+ 73, -2, -5, -12, -24, -15, -45, -45, -19, -20,
142
+ -45, -4, -6, -16, -45, -11, -30, -26, -45, -21,
143
+ -22, -23, -32, -35, -36, -37, -38, -45, -45, -45,
144
+ -45, -17, -45, -25, -27, -28, -31, -11, -33, -34,
145
+ -11, -11, -11, -11, -11, -45, -45, -45, -45, -45,
146
+ -18, -43, -44, -11, -11, -11, -11, -11, -29, -39,
147
+ -40, -41, -42 ]
148
+
149
+ racc_goto_table = [
150
+ 15, 42, 9, 48, 3, 12, 23, 11, 5, 27,
151
+ 18, 5, 26, 2, 15, 44, 22, 19, 45, 48,
152
+ 5, 9, 49, 55, 29, 21, 56, 57, 58, 59,
153
+ 5, 31, 41, 60, 43, 30, 8, 1, 49, 46,
154
+ nil, nil, 68, 69, 70, 71, 72 ]
155
+
156
+ racc_goto_check = [
157
+ 12, 9, 7, 20, 6, 5, 12, 3, 8, 19,
158
+ 14, 8, 17, 2, 12, 19, 6, 15, 12, 20,
159
+ 8, 7, 12, 9, 14, 2, 9, 9, 9, 9,
160
+ 8, 15, 8, 13, 18, 16, 4, 1, 12, 16,
161
+ nil, nil, 13, 13, 13, 13, 13 ]
162
+
163
+ racc_goto_pointer = [
164
+ nil, 37, 13, -1, 34, -3, 4, -1, 8, -24,
165
+ nil, nil, -8, -21, 2, 9, 13, -8, 8, -11,
166
+ -27, nil, nil, nil, nil ]
167
+
168
+ racc_goto_default = [
169
+ nil, nil, nil, nil, nil, nil, nil, 4, 61, 16,
170
+ 20, 14, 62, nil, nil, nil, nil, nil, nil, nil,
171
+ 32, 33, 34, 35, 36 ]
172
+
173
+ racc_reduce_table = [
+ 0, 0, :racc_error,
+ 0, 17, :_reduce_1,
+ 3, 14, :_reduce_2,
+ 0, 16, :_reduce_none,
+ 2, 16, :_reduce_none,
+ 0, 15, :_reduce_none,
+ 1, 15, :_reduce_none,
+ 1, 19, :_reduce_none,
+ 2, 19, :_reduce_none,
+ 1, 20, :_reduce_none,
+ 1, 21, :_reduce_10,
+ 0, 22, :_reduce_none,
+ 1, 22, :_reduce_none,
+ 0, 23, :_reduce_none,
+ 1, 23, :_reduce_none,
+ 1, 24, :_reduce_none,
+ 2, 24, :_reduce_none,
+ 3, 25, :_reduce_none,
+ 5, 25, :_reduce_18,
+ 1, 18, :_reduce_19,
+ 1, 18, :_reduce_20,
+ 3, 18, :_reduce_21,
+ 3, 18, :_reduce_22,
+ 3, 18, :_reduce_none,
+ 1, 28, :_reduce_none,
+ 3, 27, :_reduce_25,
+ 1, 30, :_reduce_26,
+ 2, 30, :_reduce_27,
+ 2, 30, :_reduce_none,
+ 5, 32, :_reduce_29,
+ 0, 31, :_reduce_none,
+ 1, 31, :_reduce_none,
+ 1, 29, :_reduce_32,
+ 2, 29, :_reduce_33,
+ 2, 29, :_reduce_none,
+ 1, 33, :_reduce_none,
+ 1, 33, :_reduce_none,
+ 1, 33, :_reduce_none,
+ 1, 33, :_reduce_none,
+ 5, 34, :_reduce_39,
+ 5, 35, :_reduce_40,
+ 5, 36, :_reduce_41,
+ 5, 37, :_reduce_42,
+ 1, 26, :_reduce_none,
+ 1, 26, :_reduce_none ]
+
+ racc_reduce_n = 45
+
+ racc_shift_n = 73
+
+ racc_token_table = {
+ false => 0,
+ :error => 1,
+ :EOL => 2,
+ :SPACE => 3,
+ :COMMENT => 4,
+ "sitemap" => 5,
+ ":" => 6,
+ :VALUE => 7,
+ "user-agent" => 8,
+ "allow" => 9,
+ "disallow" => 10,
+ "crawl-delay" => 11,
+ :TOKEN => 12 }
+
+ racc_nt_base = 13
+
+ racc_use_result_var = true
+
+ Racc_arg = [
+ racc_action_table,
+ racc_action_check,
+ racc_action_default,
+ racc_action_pointer,
+ racc_goto_table,
+ racc_goto_check,
+ racc_goto_default,
+ racc_goto_pointer,
+ racc_nt_base,
+ racc_reduce_table,
+ racc_token_table,
+ racc_shift_n,
+ racc_reduce_n,
+ racc_use_result_var ]
+
+ Racc_token_to_s_table = [
+ "$end",
+ "error",
+ "EOL",
+ "SPACE",
+ "COMMENT",
+ "\"sitemap\"",
+ "\":\"",
+ "VALUE",
+ "\"user-agent\"",
+ "\"allow\"",
+ "\"disallow\"",
+ "\"crawl-delay\"",
+ "TOKEN",
+ "$start",
+ "robotstxt",
+ "opt_blanklines",
+ "body",
+ "@1",
+ "blocks",
+ "blanklines",
+ "blankline",
+ "eol",
+ "opt_space",
+ "opt_commentlines",
+ "commentlines",
+ "comment",
+ "eol_opt_comment",
+ "record",
+ "commentblock",
+ "rulelines",
+ "agentlines",
+ "opt_rulelines",
+ "agentline",
+ "ruleline",
+ "allowline",
+ "disallowline",
+ "crawldelayline",
+ "extension" ]
+
+ Racc_debug_parser = false
+
+ ##### State transition tables end #####
+
+ # reduce 0 omitted
+
+ module_eval(<<'.,.,', 'robotstxt.ry', 7)
+ def _reduce_1(val, _values, result)
+ @sitemaps = []
+
+ result
+ end
+ .,.,
+
+ module_eval(<<'.,.,', 'robotstxt.ry', 11)
+ def _reduce_2(val, _values, result)
+ body = val[2]
+ result = RobotsTxt.new(@site, body,
+ :target => @target, :sitemaps => @sitemaps)
+
+ result
+ end
+ .,.,
+
+ # reduce 3 omitted
+
+ # reduce 4 omitted
+
+ # reduce 5 omitted
+
+ # reduce 6 omitted
+
+ # reduce 7 omitted
+
+ # reduce 8 omitted
+
+ # reduce 9 omitted
+
+ module_eval(<<'.,.,', 'robotstxt.ry', 31)
+ def _reduce_10(val, _values, result)
+ @lineno += 1
+
+ result
+ end
+ .,.,
+
+ # reduce 11 omitted
+
+ # reduce 12 omitted
+
+ # reduce 13 omitted
+
+ # reduce 14 omitted
+
+ # reduce 15 omitted
+
+ # reduce 16 omitted
+
+ # reduce 17 omitted
+
+ module_eval(<<'.,.,', 'robotstxt.ry', 47)
+ def _reduce_18(val, _values, result)
+ @sitemaps << val[3]
+
+ result
+ end
+ .,.,
+
+ module_eval(<<'.,.,', 'robotstxt.ry', 52)
+ def _reduce_19(val, _values, result)
+ result = []
+ result << val[0]
+
+ result
+ end
+ .,.,
+
+ module_eval(<<'.,.,', 'robotstxt.ry', 57)
+ def _reduce_20(val, _values, result)
+ result = []
+
+ result
+ end
+ .,.,
+
+ module_eval(<<'.,.,', 'robotstxt.ry', 63)
+ def _reduce_21(val, _values, result)
+ result << val[2]
+
+ result
+ end
+ .,.,
+
+ module_eval(<<'.,.,', 'robotstxt.ry', 69)
+ def _reduce_22(val, _values, result)
+ val[2].each_with_index { |line, i|
+ warn "%s line %d: %s: orphan rule line" %
+ [@site.to_s, @rulelinenos[i], line.token] if $VERBOSE
+ }
+
+ result
+ end
+ .,.,
+
+ # reduce 23 omitted
+
+ # reduce 24 omitted
+
+ module_eval(<<'.,.,', 'robotstxt.ry', 84)
+ def _reduce_25(val, _values, result)
+ result = Record.new(val[1], val[2])
+
+ result
+ end
+ .,.,
+
+ module_eval(<<'.,.,', 'robotstxt.ry', 89)
+ def _reduce_26(val, _values, result)
+ result = [val[0]]
+
+ result
+ end
+ .,.,
+
+ module_eval(<<'.,.,', 'robotstxt.ry', 94)
+ def _reduce_27(val, _values, result)
+ result << val[1]
+
+ result
+ end
+ .,.,
+
+ # reduce 28 omitted
+
+ module_eval(<<'.,.,', 'robotstxt.ry', 101)
+ def _reduce_29(val, _values, result)
+ result = AgentLine.new(val[0], val[3])
+
+ result
+ end
+ .,.,
+
+ # reduce 30 omitted
+
+ # reduce 31 omitted
+
+ module_eval(<<'.,.,', 'robotstxt.ry', 109)
+ def _reduce_32(val, _values, result)
+ result = [result]
+ @rulelinenos = []
+
+ result
+ end
+ .,.,
+
+ module_eval(<<'.,.,', 'robotstxt.ry', 115)
+ def _reduce_33(val, _values, result)
+ result << val[1]
+ @rulelinenos << @lineno
+
+ result
+ end
+ .,.,
+
+ # reduce 34 omitted
+
+ # reduce 35 omitted
+
+ # reduce 36 omitted
+
+ # reduce 37 omitted
+
+ # reduce 38 omitted
+
+ module_eval(<<'.,.,', 'robotstxt.ry', 128)
+ def _reduce_39(val, _values, result)
+ result = AllowLine.new(val[0], val[3])
+
+ result
+ end
+ .,.,
+
+ module_eval(<<'.,.,', 'robotstxt.ry', 133)
+ def _reduce_40(val, _values, result)
+ result = DisallowLine.new(val[0], val[3])
+
+ result
+ end
+ .,.,
+
+ module_eval(<<'.,.,', 'robotstxt.ry', 138)
+ def _reduce_41(val, _values, result)
+ result = CrawlDelayLine.new(val[0], val[3])
+
+ result
+ end
+ .,.,
+
+ module_eval(<<'.,.,', 'robotstxt.ry', 143)
+ def _reduce_42(val, _values, result)
+ result = ExtentionLine.new(val[0], val[3])
+
+ result
+ end
+ .,.,
+
+ # reduce 43 omitted
+
+ # reduce 44 omitted
+
+ def _reduce_none(val, _values, result)
+ val[0]
+ end
+
+ end # class Parser
+
+ def initialize(site, records, options = nil)
+ super()
+ @site = site
+ @options = options || {}
+ @last_checked = nil
+
+ @target = @options[:target]
+ @sitemaps = @options[:sitemaps] || []
+
+ if records && !records.empty?
+ @records, defaults = [], []
+ records.each { |record|
+ if record.default?
+ defaults << record
+ elsif !@target || record.match?(@target)
+ @records << record
+ end
+ }
+ @records.concat(defaults)
+ else
+ @records = []
+ end
+ end
+
+ attr_reader :site, :sitemaps
+
+ def target(user_agent = nil)
+ if user_agent
+ raise ArgumentError, "this instance is targeted for #{@target}" if @target
+ user_agent
+ else
+ raise ArgumentError, "user_agent is mandatory for an untargeted instance" if !@target
+ @target
+ end
+ end
+ private :target
+
+ def find_record(user_agent = nil)
+ user_agent = target(user_agent)
+ @records.find { |record|
+ record.match?(user_agent)
+ }
+ end
+ private :find_record
+
+ def allow?(request_uri, user_agent = nil)
+ record = find_record(user_agent) or return true
+ allow = record.allow?(request_uri)
+ if @last_checked and delay = record.delay
+ delay -= Time.now - @last_checked
+ sleep delay if delay > 0
+ end
+ @last_checked = Time.now
+ return allow
+ end
+
+ def options(user_agent = nil)
+ record = find_record(user_agent) or return {}
+ record.options
+ end
+
+ class Record
+ def initialize(agentlines, rulelines)
+ @patterns = agentlines.map { |agentline| agentline.pattern }
+ @acls = []
+ @delay = nil
+ @options = {}
+ rulelines.each { |ruleline|
+ case ruleline
+ when AccessControlLine
+ @acls << ruleline
+ when CrawlDelayLine
+ @delay = ruleline.delay
+ else
+ @options[ruleline.token.downcase] = ruleline.value
+ end
+ }
+ @acls.sort! { |a, b|
+ [
+ b.value.length, b.is_a?(AllowLine) ? 1 : 0
+ ] <=> [
+ a.value.length, a.is_a?(AllowLine) ? 1 : 0
+ ]
+ }
+ end
+
+ attr_reader :delay, :options
+
+ def match?(user_agent)
+ @patterns.any? { |pattern|
+ pattern.match(user_agent)
+ }
+ end
+
+ def default?
+ @patterns.include?(//)
+ end
+
+ def allow?(request_uri)
+ @acls.each { |acl|
+ if acl.match?(request_uri)
+ return acl.allow?
+ end
+ }
+ return true
+ end
+ end
+
+ class Line
+ def initialize(token, value)
+ @token = token
+ @value = value
+ compile
+ end
+
+ attr_reader :token, :value
+
+ def compile
+ self
+ end
+ end
+
+ class AgentLine < Line
+ def compile
+ if @value == '*'
+ @pattern = //
+ else
+ @pattern = Regexp.new(Regexp.quote(@value), Regexp::IGNORECASE)
+ end
+ self
+ end
+
+ attr_reader :pattern
+ end
+
+ class AccessControlLine < Line
+ def compile
+ @empty = @value.empty?
+ re_src = '\A'
+ s = StringScanner.new(@value)
+ until s.eos?
+ if t = s.scan(/[^%*$]+/)
+ re_src << Regexp.quote(t)
+ elsif t = s.scan(/%([0-9a-f]{2})/i)
+ c = s[1].to_i(16)
+ if c == 0x2f
+ re_src << '%2[fF]'
+ else
+ re_src << Regexp.quote('%c' % c)
+ end
+ elsif t = s.scan(/\*/)
+ re_src << '.*'
+ elsif t = s.scan(/\$/)
+ re_src << '\z'
+ break
+ else
+ raise ParseError, 'unexpected characters: %s' % s.check(/.*/)
+ end
+ end
+ @pattern = Regexp.new(re_src, Regexp::MULTILINE)
+ self
+ end
+
+ def match?(request_uri)
+ !@empty && !!@pattern.match(request_uri)
+ end
+ end
+
+ class AllowLine < AccessControlLine
+ def allow?
+ true
+ end
+ end
+
+ class DisallowLine < AccessControlLine
+ def allow?
+ false
+ end
+ end
+
+ class CrawlDelayLine < Line
+ def compile
+ case @value
+ when /\A((0|[1-9][0-9]*)\.[0-9]+)/
+ @delay = @value.to_f
+ when /\A(0|[1-9][0-9]*)/
+ @delay = @value.to_i
+ else
+ @delay = nil
+ end
+ self
+ end
+
+ attr_reader :delay
+ end
+
+ class ExtentionLine < Line
+ end
+ end
+ end
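A short sketch of how the Record class above resolves rules: ACLs are sorted by pattern length with Allow winning ties, so the most specific rule decides, and Crawl-delay is enforced by sleeping inside allow?. The robots.txt text and URLs below are made up for illustration and are served through the :http_get hook so no network is needed.

  require 'webrobots'

  robots_txt = [
    'User-agent: *',
    'Disallow: /private/',
    'Allow: /private/public/',
    'Crawl-delay: 1',
    ''
  ].join("\n")

  robots = WebRobots.new('MyBot/1.0', :http_get => lambda { |uri| robots_txt })

  robots.allowed?('http://example.com/private/secret.html')       # => false
  robots.allowed?('http://example.com/private/public/page.html')  # => true (longer Allow pattern wins)
  robots.allowed?('http://example.com/other.html')                 # => true (no rule matches)
  # Because of Crawl-delay: 1, later allowed? calls for this site sleep so
  # that checks are spaced roughly a second apart.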