webrobots 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +22 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +19 -0
- data/Rakefile +61 -0
- data/VERSION +1 -0
- data/lib/webrobots.rb +135 -0
- data/lib/webrobots/robotstxt.rb +714 -0
- data/lib/webrobots/robotstxt.ry +444 -0
- data/test/helper.rb +18 -0
- data/test/test_webrobots.rb +291 -0
- metadata +155 -0
data/.document
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
source "http://rubygems.org"
|
2
|
+
# Add dependencies required to use your gem here.
|
3
|
+
# Example:
|
4
|
+
# gem "activesupport", ">= 2.3.5"
|
5
|
+
gem "racc", ">= 0"
|
6
|
+
|
7
|
+
# Add dependencies to develop your gem here.
|
8
|
+
# Include everything needed to run rake, tests, features, etc.
|
9
|
+
group :development do
|
10
|
+
gem "shoulda", ">= 0"
|
11
|
+
gem "bundler", "~> 1.0.0"
|
12
|
+
gem "jeweler", "~> 1.5.1"
|
13
|
+
gem "rcov", ">= 0"
|
14
|
+
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
git (1.2.5)
|
5
|
+
jeweler (1.5.2)
|
6
|
+
bundler (~> 1.0.0)
|
7
|
+
git (>= 1.2.5)
|
8
|
+
rake
|
9
|
+
racc (1.4.6)
|
10
|
+
rake (0.8.7)
|
11
|
+
rcov (0.9.9)
|
12
|
+
shoulda (2.11.3)
|
13
|
+
|
14
|
+
PLATFORMS
|
15
|
+
ruby
|
16
|
+
|
17
|
+
DEPENDENCIES
|
18
|
+
bundler (~> 1.0.0)
|
19
|
+
jeweler (~> 1.5.1)
|
20
|
+
racc
|
21
|
+
rcov
|
22
|
+
shoulda
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2011 Akinori MUSHA
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
= webrobots
|
2
|
+
|
3
|
+
This is a library to help write robots.txt compliant web robots.
|
4
|
+
|
5
|
+
== Contributing to webrobots
|
6
|
+
|
7
|
+
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
|
8
|
+
* Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
|
9
|
+
* Fork the project
|
10
|
+
* Start a feature/bugfix branch
|
11
|
+
* Commit and push until you are happy with your contribution
|
12
|
+
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
13
|
+
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate it to its own commit so I can cherry-pick around it.
|
14
|
+
|
15
|
+
== Copyright
|
16
|
+
|
17
|
+
Copyright (c) 2010, 2011 Akinori MUSHA. See LICENSE.txt for
|
18
|
+
further details.
|
19
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
require 'rake'
|
11
|
+
|
12
|
+
require 'jeweler'
|
13
|
+
Jeweler::Tasks.new do |gem|
|
14
|
+
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
15
|
+
gem.name = "webrobots"
|
16
|
+
# gem.homepage = "http://github.com/knu/webrobots"
|
17
|
+
gem.license = "MIT"
|
18
|
+
gem.summary = %Q{A library to help write robots.txt compliant web robots}
|
19
|
+
gem.description = <<-'EOS'
|
20
|
+
This library helps write robots.txt compliant web robots.
|
21
|
+
EOS
|
22
|
+
gem.email = "knu@idaemons.org"
|
23
|
+
gem.authors = ["Akinori MUSHA"]
|
24
|
+
# Include your dependencies below. Runtime dependencies are required when using your gem,
|
25
|
+
# and development dependencies are only needed for development (ie running rake tasks, tests, etc)
|
26
|
+
# gem.add_runtime_dependency 'jabber4r', '> 0.1'
|
27
|
+
# gem.add_development_dependency 'rspec', '> 1.2.3'
|
28
|
+
end
|
29
|
+
Jeweler::RubygemsDotOrgTasks.new
|
30
|
+
|
31
|
+
require 'rake/testtask'
|
32
|
+
Rake::TestTask.new(:test) do |test|
|
33
|
+
test.libs << 'lib' << 'test'
|
34
|
+
test.pattern = 'test/**/test_*.rb'
|
35
|
+
test.verbose = true
|
36
|
+
end
|
37
|
+
|
38
|
+
require 'rcov/rcovtask'
|
39
|
+
Rcov::RcovTask.new do |test|
|
40
|
+
test.libs << 'test'
|
41
|
+
test.pattern = 'test/**/test_*.rb'
|
42
|
+
test.verbose = true
|
43
|
+
end
|
44
|
+
|
45
|
+
task :default => :test
|
46
|
+
|
47
|
+
task :test => 'lib/webrobots/robotstxt.rb'
|
48
|
+
|
49
|
+
file 'lib/webrobots/robotstxt.rb' => 'lib/webrobots/robotstxt.ry' do
|
50
|
+
sh 'racc', '-o', 'lib/webrobots/robotstxt.rb', 'lib/webrobots/robotstxt.ry'
|
51
|
+
end
|
52
|
+
|
53
|
+
require 'rake/rdoctask'
|
54
|
+
Rake::RDocTask.new do |rdoc|
|
55
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
56
|
+
|
57
|
+
rdoc.rdoc_dir = 'rdoc'
|
58
|
+
rdoc.title = "webrobots #{version}"
|
59
|
+
rdoc.rdoc_files.include('README*')
|
60
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
61
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.1
|
data/lib/webrobots.rb
ADDED
@@ -0,0 +1,135 @@
|
|
1
|
+
require 'webrobots/robotstxt'
|
2
|
+
require 'uri'
|
3
|
+
require 'net/https'
|
4
|
+
|
5
|
+
class WebRobots
  # Creates a WebRobots object for a robot named +user_agent+, with
  # optional +options+.
  #
  # * :http_get => a custom method, proc, or anything that responds to
  #   .call(uri), to be used for fetching robots.txt.  It must return
  #   the response body if successful.  If the resource is not found,
  #   it must raise an exception that includes Net::HTTPExceptions and
  #   whose #response is a Net::HTTPNotFound (this is what
  #   Net::HTTPResponse#value raises for a 404); that case is treated
  #   as an empty robots.txt.  Any other error is propagated to the
  #   caller.
  def initialize(user_agent, options = nil)
    @user_agent = user_agent
    @parser = RobotsTxt::Parser.new(user_agent)

    options ||= {}
    @http_get = options[:http_get] || method(:http_get)

    # Parsed robots.txt objects, keyed by normalized site root URI.
    @robotstxt = {}
  end

  # Returns the robot name initially given.
  attr_reader :user_agent

  # Tests if the robot is allowed to access a resource at +url+.  If a
  # malformed URI string, a relative URI or a non-HTTP/HTTPS URI is
  # given, ArgumentError is raised.
  def allowed?(url)
    site, request_uri = split_uri(url)
    # robots.txt itself is always fetchable.
    return true if request_uri == '/robots.txt'
    robots_txt(site).allow?(request_uri)
  end

  # Equivalent to !allowed?(url).
  def disallowed?(url)
    !allowed?(url)
  end

  # Returns extended option values for a resource at +url+ in a hash
  # with each field name lower-cased.  See allowed?() for a list of
  # errors that may be raised.
  def options(url)
    site, = split_uri(url)
    robots_txt(site).options
  end

  # Equivalent to options(url)[token.downcase].
  def option(url, token)
    options(url)[token.downcase]
  end

  # Returns an array of Sitemap URLs.  See allowed?() for a list of
  # errors that may be raised.
  def sitemaps(url)
    site, = split_uri(url)
    robots_txt(site).sitemaps
  end

  private

  # Splits +url+ (a String or a URI) into a pair of the site root URI
  # (lower-cased host, path "/", no query, no fragment) and the
  # request URI string of the original resource.  Raises ArgumentError
  # for unparsable, non-absolute or non-HTTP/HTTPS input.
  def split_uri(url)
    site =
      if url.is_a?(URI)
        url.dup
      else
        begin
          URI.parse(url)
        rescue => e
          raise ArgumentError, e.message
        end
      end

    unless site.scheme && site.host
      raise ArgumentError, "non-absolute URI: #{url}"
    end

    unless site.is_a?(URI::HTTP)
      raise ArgumentError, "non-HTTP/HTTPS URI: #{url}"
    end

    # Capture path + query before the URI is reduced to the site root.
    request_uri = site.request_uri
    host = site.host
    site.host = host.downcase if host =~ /[[:upper:]]/
    site.path = '/'
    # The site URI is the cache key, so it must not carry per-resource
    # components; otherwise every distinct query string would trigger
    # a new robots.txt fetch for the same site.
    site.query = nil
    site.fragment = nil
    return site, request_uri
  end

  # Returns the parsed robots.txt for +site+, fetching it on first use.
  def robots_txt(site)
    cache_robots_txt(site) {
      fetch_robots_txt(site)
    }
  end

  # Fetches and parses robots.txt for +site+.  A "not found" response
  # is treated as an empty robots.txt; other errors propagate.
  def fetch_robots_txt(site)
    body =
      begin
        @http_get.call(site + 'robots.txt')
      rescue Net::HTTPExceptions => e
        # Net::HTTPNotFound is a response class, not an exception, so
        # it has to be detected through the exception's response.
        raise unless e.response.is_a?(Net::HTTPNotFound)
        ''
      end
    @parser.parse(body, site)
  end

  # Returns the cached value for +site+, or stores and returns the
  # result of calling the block with +site+.
  def cache_robots_txt(site, &block)
    if @robotstxt.key?(site)
      @robotstxt[site]
    else
      @robotstxt[site] = block.call(site)
    end
  end

  # Default fetcher: GETs +uri+, following up to 10 redirects, and
  # returns the response body.  Raises whatever
  # Net::HTTPResponse#value raises for a non-success, non-redirect
  # response, or RuntimeError after too many redirects.
  def http_get(uri)
    referer = nil
    10.times {
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = uri.is_a?(URI::HTTPS)
      # NOTE(review): certificate verification is disabled here, which
      # allows man-in-the-middle tampering with robots.txt over HTTPS.
      # Kept as is for backward compatibility; consider VERIFY_PEER.
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      header = { 'User-Agent' => @user_agent }
      header['Referer'] = referer if referer
      # header is destroyed by this in ruby 1.9.2!
      response = http.get(uri.request_uri, header)
      case response
      when Net::HTTPSuccess
        return response.body
      when Net::HTTPRedirection
        referer = uri.to_s
        # Location may be a relative reference; resolve it against the
        # URI just requested.  An absolute Location replaces it as is.
        uri = uri + response['location']
      else
        response.value
      end
    }
    raise 'too many HTTP redirects'
  end
end
|
@@ -0,0 +1,714 @@
|
|
1
|
+
#
|
2
|
+
# DO NOT MODIFY!!!!
|
3
|
+
# This file is automatically generated by Racc 1.4.6
|
4
|
+
# from Racc grammer file "".
|
5
|
+
#
|
6
|
+
|
7
|
+
require 'racc/parser.rb'
|
8
|
+
|
9
|
+
|
10
|
+
require 'strscan'
|
11
|
+
require 'uri'
|
12
|
+
|
13
|
+
class WebRobots
|
14
|
+
class Error < StandardError
|
15
|
+
end
|
16
|
+
|
17
|
+
class ParseError < Error
|
18
|
+
end
|
19
|
+
|
20
|
+
class RobotsTxt
|
21
|
+
class Parser < Racc::Parser
|
22
|
+
|
23
|
+
module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
|
24
|
+
|
25
|
+
def initialize(target = nil)
|
26
|
+
super()
|
27
|
+
@target = target
|
28
|
+
end
|
29
|
+
|
30
|
+
def self.parse(input, target = nil)
|
31
|
+
new(target).parse(input)
|
32
|
+
end
|
33
|
+
|
34
|
+
KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
|
35
|
+
RE_KNOWN_TOKENS = /#{KNOWN_TOKENS.join('|')}/i
|
36
|
+
|
37
|
+
# Tokenizes +input+ (a String, or anything that responds to #read)
# as the robots.txt of +site+ and runs the generated parser over the
# token queue.  Returns the result of do_parse.  Raises ParseError
# when the parser reports a Racc::ParseError.
def parse(input, site)
  @q = []
  @errors = []
  @lineno = 1
  @site = site

  string = input.respond_to?(:read) ? input.read : input
  s = StringScanner.new(string)
  # True between a ":" and the end of the field value that follows it.
  value_expected = false

  until s.eos?
    if t = s.scan(/[ \t]*\r?\n/)
      @q << [:EOL, t]
      value_expected = false
    elsif t = s.scan(/[ \t]+/)
      @q << [:SPACE, t]
    elsif t = s.scan(/:/)
      @q << [t, t]
      value_expected = true
    elsif t = s.scan(/#.*/)
      @q << [:COMMENT, t]
    else
      if value_expected
        # A value runs up to trailing blanks, a comment or end of line.
        if t = s.scan(/.*?(?=[ \t]*(?:#|$))/)
          @q << [:VALUE, t]
        else
          # parse_error takes the message only; it adds the site and
          # line number itself.
          parse_error "unexpected characters: %s" % s.check(/.*/)
        end
        value_expected = false
      else
        if t = s.scan(RE_KNOWN_TOKENS)
          # Known field names are emitted as their lower-cased name.
          @q << [t.downcase, t]
        elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
          @q << [:TOKEN, t]
        else
          parse_error "unexpected characters: %s" % s.check(/.*/)
        end
      end
    end
  end

  # Make sure the last line is terminated for the grammar.
  @q << [:EOL, ''] if !@q.empty? && @q.last.first != :EOL

  @pos = -1

  do_parse
rescue Racc::ParseError => e
  raise ParseError, e.message
end
|
86
|
+
|
87
|
+
def next_token
|
88
|
+
@q[@pos += 1]
|
89
|
+
end
|
90
|
+
|
91
|
+
def on_error(token_id, value, stack)
|
92
|
+
parse_error "unexpected %s: %s" % [token_to_str(token_id), value]
|
93
|
+
end
|
94
|
+
|
95
|
+
def parse_error(message)
|
96
|
+
message = "%s line %d: %s" % [@site.to_s, @lineno, message]
|
97
|
+
if @lax
|
98
|
+
@errors << message
|
99
|
+
else
|
100
|
+
raise Racc::ParseError, message
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
...end robotstxt.ry/module_eval...
|
105
|
+
##### State transition tables begin ###
|
106
|
+
|
107
|
+
racc_action_table = [
|
108
|
+
6, 13, -11, 17, 53, 6, -13, 37, 38, 39,
|
109
|
+
40, 13, -11, 17, 47, 28, 28, 37, 38, 39,
|
110
|
+
40, 13, -11, 17, 50, 51, 52, 37, 38, 39,
|
111
|
+
40, 13, -11, 17, 13, 54, 25, 37, 38, 39,
|
112
|
+
40, 13, -11, 17, 13, 13, -13, 13, -11, 17,
|
113
|
+
6, 13, -14, 17, 6, 13, 13, 17, 6, 13,
|
114
|
+
13, 17, 6, 13, 13, 17, 6, 13, 24, 17,
|
115
|
+
6, 13, 63, 17, 64, 65, 66, 67, 6, 10,
|
116
|
+
6, 7, 6 ]
|
117
|
+
|
118
|
+
racc_action_check = [
|
119
|
+
22, 22, 22, 22, 40, 24, 22, 22, 22, 22,
|
120
|
+
22, 26, 26, 26, 28, 20, 26, 26, 26, 26,
|
121
|
+
26, 46, 46, 46, 37, 38, 39, 46, 46, 46,
|
122
|
+
46, 30, 30, 30, 25, 42, 17, 30, 30, 30,
|
123
|
+
30, 8, 8, 8, 47, 50, 8, 14, 14, 14,
|
124
|
+
63, 63, 14, 63, 54, 54, 51, 54, 64, 64,
|
125
|
+
52, 64, 65, 65, 53, 65, 66, 66, 16, 66,
|
126
|
+
67, 67, 55, 67, 56, 57, 58, 59, 12, 7,
|
127
|
+
3, 1, 0 ]
|
128
|
+
|
129
|
+
racc_action_pointer = [
|
130
|
+
80, 81, nil, 78, nil, nil, nil, 79, 38, nil,
|
131
|
+
nil, nil, 76, nil, 44, nil, 64, 30, nil, nil,
|
132
|
+
7, nil, -2, nil, 3, 31, 8, nil, 8, nil,
|
133
|
+
28, nil, nil, nil, nil, nil, nil, 18, 19, 20,
|
134
|
+
-2, nil, 28, nil, nil, nil, 18, 41, nil, nil,
|
135
|
+
42, 53, 57, 61, 52, 65, 67, 68, 69, 70,
|
136
|
+
nil, nil, nil, 48, 56, 60, 64, 68, nil, nil,
|
137
|
+
nil, nil, nil ]
|
138
|
+
|
139
|
+
racc_action_default = [
|
140
|
+
-5, -45, -1, -6, -7, -9, -10, -45, -3, -8,
|
141
|
+
73, -2, -5, -12, -24, -15, -45, -45, -19, -20,
|
142
|
+
-45, -4, -6, -16, -45, -11, -30, -26, -45, -21,
|
143
|
+
-22, -23, -32, -35, -36, -37, -38, -45, -45, -45,
|
144
|
+
-45, -17, -45, -25, -27, -28, -31, -11, -33, -34,
|
145
|
+
-11, -11, -11, -11, -11, -45, -45, -45, -45, -45,
|
146
|
+
-18, -43, -44, -11, -11, -11, -11, -11, -29, -39,
|
147
|
+
-40, -41, -42 ]
|
148
|
+
|
149
|
+
racc_goto_table = [
|
150
|
+
15, 42, 9, 48, 3, 12, 23, 11, 5, 27,
|
151
|
+
18, 5, 26, 2, 15, 44, 22, 19, 45, 48,
|
152
|
+
5, 9, 49, 55, 29, 21, 56, 57, 58, 59,
|
153
|
+
5, 31, 41, 60, 43, 30, 8, 1, 49, 46,
|
154
|
+
nil, nil, 68, 69, 70, 71, 72 ]
|
155
|
+
|
156
|
+
racc_goto_check = [
|
157
|
+
12, 9, 7, 20, 6, 5, 12, 3, 8, 19,
|
158
|
+
14, 8, 17, 2, 12, 19, 6, 15, 12, 20,
|
159
|
+
8, 7, 12, 9, 14, 2, 9, 9, 9, 9,
|
160
|
+
8, 15, 8, 13, 18, 16, 4, 1, 12, 16,
|
161
|
+
nil, nil, 13, 13, 13, 13, 13 ]
|
162
|
+
|
163
|
+
racc_goto_pointer = [
|
164
|
+
nil, 37, 13, -1, 34, -3, 4, -1, 8, -24,
|
165
|
+
nil, nil, -8, -21, 2, 9, 13, -8, 8, -11,
|
166
|
+
-27, nil, nil, nil, nil ]
|
167
|
+
|
168
|
+
racc_goto_default = [
|
169
|
+
nil, nil, nil, nil, nil, nil, nil, 4, 61, 16,
|
170
|
+
20, 14, 62, nil, nil, nil, nil, nil, nil, nil,
|
171
|
+
32, 33, 34, 35, 36 ]
|
172
|
+
|
173
|
+
racc_reduce_table = [
|
174
|
+
0, 0, :racc_error,
|
175
|
+
0, 17, :_reduce_1,
|
176
|
+
3, 14, :_reduce_2,
|
177
|
+
0, 16, :_reduce_none,
|
178
|
+
2, 16, :_reduce_none,
|
179
|
+
0, 15, :_reduce_none,
|
180
|
+
1, 15, :_reduce_none,
|
181
|
+
1, 19, :_reduce_none,
|
182
|
+
2, 19, :_reduce_none,
|
183
|
+
1, 20, :_reduce_none,
|
184
|
+
1, 21, :_reduce_10,
|
185
|
+
0, 22, :_reduce_none,
|
186
|
+
1, 22, :_reduce_none,
|
187
|
+
0, 23, :_reduce_none,
|
188
|
+
1, 23, :_reduce_none,
|
189
|
+
1, 24, :_reduce_none,
|
190
|
+
2, 24, :_reduce_none,
|
191
|
+
3, 25, :_reduce_none,
|
192
|
+
5, 25, :_reduce_18,
|
193
|
+
1, 18, :_reduce_19,
|
194
|
+
1, 18, :_reduce_20,
|
195
|
+
3, 18, :_reduce_21,
|
196
|
+
3, 18, :_reduce_22,
|
197
|
+
3, 18, :_reduce_none,
|
198
|
+
1, 28, :_reduce_none,
|
199
|
+
3, 27, :_reduce_25,
|
200
|
+
1, 30, :_reduce_26,
|
201
|
+
2, 30, :_reduce_27,
|
202
|
+
2, 30, :_reduce_none,
|
203
|
+
5, 32, :_reduce_29,
|
204
|
+
0, 31, :_reduce_none,
|
205
|
+
1, 31, :_reduce_none,
|
206
|
+
1, 29, :_reduce_32,
|
207
|
+
2, 29, :_reduce_33,
|
208
|
+
2, 29, :_reduce_none,
|
209
|
+
1, 33, :_reduce_none,
|
210
|
+
1, 33, :_reduce_none,
|
211
|
+
1, 33, :_reduce_none,
|
212
|
+
1, 33, :_reduce_none,
|
213
|
+
5, 34, :_reduce_39,
|
214
|
+
5, 35, :_reduce_40,
|
215
|
+
5, 36, :_reduce_41,
|
216
|
+
5, 37, :_reduce_42,
|
217
|
+
1, 26, :_reduce_none,
|
218
|
+
1, 26, :_reduce_none ]
|
219
|
+
|
220
|
+
racc_reduce_n = 45
|
221
|
+
|
222
|
+
racc_shift_n = 73
|
223
|
+
|
224
|
+
racc_token_table = {
|
225
|
+
false => 0,
|
226
|
+
:error => 1,
|
227
|
+
:EOL => 2,
|
228
|
+
:SPACE => 3,
|
229
|
+
:COMMENT => 4,
|
230
|
+
"sitemap" => 5,
|
231
|
+
":" => 6,
|
232
|
+
:VALUE => 7,
|
233
|
+
"user-agent" => 8,
|
234
|
+
"allow" => 9,
|
235
|
+
"disallow" => 10,
|
236
|
+
"crawl-delay" => 11,
|
237
|
+
:TOKEN => 12 }
|
238
|
+
|
239
|
+
racc_nt_base = 13
|
240
|
+
|
241
|
+
racc_use_result_var = true
|
242
|
+
|
243
|
+
Racc_arg = [
|
244
|
+
racc_action_table,
|
245
|
+
racc_action_check,
|
246
|
+
racc_action_default,
|
247
|
+
racc_action_pointer,
|
248
|
+
racc_goto_table,
|
249
|
+
racc_goto_check,
|
250
|
+
racc_goto_default,
|
251
|
+
racc_goto_pointer,
|
252
|
+
racc_nt_base,
|
253
|
+
racc_reduce_table,
|
254
|
+
racc_token_table,
|
255
|
+
racc_shift_n,
|
256
|
+
racc_reduce_n,
|
257
|
+
racc_use_result_var ]
|
258
|
+
|
259
|
+
Racc_token_to_s_table = [
|
260
|
+
"$end",
|
261
|
+
"error",
|
262
|
+
"EOL",
|
263
|
+
"SPACE",
|
264
|
+
"COMMENT",
|
265
|
+
"\"sitemap\"",
|
266
|
+
"\":\"",
|
267
|
+
"VALUE",
|
268
|
+
"\"user-agent\"",
|
269
|
+
"\"allow\"",
|
270
|
+
"\"disallow\"",
|
271
|
+
"\"crawl-delay\"",
|
272
|
+
"TOKEN",
|
273
|
+
"$start",
|
274
|
+
"robotstxt",
|
275
|
+
"opt_blanklines",
|
276
|
+
"body",
|
277
|
+
"@1",
|
278
|
+
"blocks",
|
279
|
+
"blanklines",
|
280
|
+
"blankline",
|
281
|
+
"eol",
|
282
|
+
"opt_space",
|
283
|
+
"opt_commentlines",
|
284
|
+
"commentlines",
|
285
|
+
"comment",
|
286
|
+
"eol_opt_comment",
|
287
|
+
"record",
|
288
|
+
"commentblock",
|
289
|
+
"rulelines",
|
290
|
+
"agentlines",
|
291
|
+
"opt_rulelines",
|
292
|
+
"agentline",
|
293
|
+
"ruleline",
|
294
|
+
"allowline",
|
295
|
+
"disallowline",
|
296
|
+
"crawldelayline",
|
297
|
+
"extension" ]
|
298
|
+
|
299
|
+
Racc_debug_parser = false
|
300
|
+
|
301
|
+
##### State transition tables end #####
|
302
|
+
|
303
|
+
# reduce 0 omitted
|
304
|
+
|
305
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 7)
|
306
|
+
def _reduce_1(val, _values, result)
|
307
|
+
@sitemaps = []
|
308
|
+
|
309
|
+
result
|
310
|
+
end
|
311
|
+
.,.,
|
312
|
+
|
313
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 11)
|
314
|
+
def _reduce_2(val, _values, result)
|
315
|
+
body = val[2]
|
316
|
+
result = RobotsTxt.new(@site, body,
|
317
|
+
:target => @target, :sitemaps => @sitemaps)
|
318
|
+
|
319
|
+
result
|
320
|
+
end
|
321
|
+
.,.,
|
322
|
+
|
323
|
+
# reduce 3 omitted
|
324
|
+
|
325
|
+
# reduce 4 omitted
|
326
|
+
|
327
|
+
# reduce 5 omitted
|
328
|
+
|
329
|
+
# reduce 6 omitted
|
330
|
+
|
331
|
+
# reduce 7 omitted
|
332
|
+
|
333
|
+
# reduce 8 omitted
|
334
|
+
|
335
|
+
# reduce 9 omitted
|
336
|
+
|
337
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 31)
|
338
|
+
def _reduce_10(val, _values, result)
|
339
|
+
@lineno += 1
|
340
|
+
|
341
|
+
result
|
342
|
+
end
|
343
|
+
.,.,
|
344
|
+
|
345
|
+
# reduce 11 omitted
|
346
|
+
|
347
|
+
# reduce 12 omitted
|
348
|
+
|
349
|
+
# reduce 13 omitted
|
350
|
+
|
351
|
+
# reduce 14 omitted
|
352
|
+
|
353
|
+
# reduce 15 omitted
|
354
|
+
|
355
|
+
# reduce 16 omitted
|
356
|
+
|
357
|
+
# reduce 17 omitted
|
358
|
+
|
359
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 47)
|
360
|
+
def _reduce_18(val, _values, result)
|
361
|
+
@sitemaps << val[3]
|
362
|
+
|
363
|
+
result
|
364
|
+
end
|
365
|
+
.,.,
|
366
|
+
|
367
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 52)
|
368
|
+
def _reduce_19(val, _values, result)
|
369
|
+
result = []
|
370
|
+
result << val[0]
|
371
|
+
|
372
|
+
result
|
373
|
+
end
|
374
|
+
.,.,
|
375
|
+
|
376
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 57)
|
377
|
+
def _reduce_20(val, _values, result)
|
378
|
+
result = []
|
379
|
+
|
380
|
+
result
|
381
|
+
end
|
382
|
+
.,.,
|
383
|
+
|
384
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 63)
|
385
|
+
def _reduce_21(val, _values, result)
|
386
|
+
result << val[2]
|
387
|
+
|
388
|
+
result
|
389
|
+
end
|
390
|
+
.,.,
|
391
|
+
|
392
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 69)
|
393
|
+
def _reduce_22(val, _values, result)
|
394
|
+
val[2].each_with_index { |line, i|
|
395
|
+
warn "%s line %d: %s: orphan rule line" %
|
396
|
+
[@site.to_s, @rulelinenos[i], line.token] if $VERBOSE
|
397
|
+
}
|
398
|
+
|
399
|
+
result
|
400
|
+
end
|
401
|
+
.,.,
|
402
|
+
|
403
|
+
# reduce 23 omitted
|
404
|
+
|
405
|
+
# reduce 24 omitted
|
406
|
+
|
407
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 84)
|
408
|
+
def _reduce_25(val, _values, result)
|
409
|
+
result = Record.new(val[1], val[2])
|
410
|
+
|
411
|
+
result
|
412
|
+
end
|
413
|
+
.,.,
|
414
|
+
|
415
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 89)
|
416
|
+
def _reduce_26(val, _values, result)
|
417
|
+
result = [val[0]]
|
418
|
+
|
419
|
+
result
|
420
|
+
end
|
421
|
+
.,.,
|
422
|
+
|
423
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 94)
|
424
|
+
def _reduce_27(val, _values, result)
|
425
|
+
result << val[1]
|
426
|
+
|
427
|
+
result
|
428
|
+
end
|
429
|
+
.,.,
|
430
|
+
|
431
|
+
# reduce 28 omitted
|
432
|
+
|
433
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 101)
|
434
|
+
def _reduce_29(val, _values, result)
|
435
|
+
result = AgentLine.new(val[0], val[3])
|
436
|
+
|
437
|
+
result
|
438
|
+
end
|
439
|
+
.,.,
|
440
|
+
|
441
|
+
# reduce 30 omitted
|
442
|
+
|
443
|
+
# reduce 31 omitted
|
444
|
+
|
445
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 109)
|
446
|
+
def _reduce_32(val, _values, result)
|
447
|
+
result = [result]
|
448
|
+
@rulelinenos = []
|
449
|
+
|
450
|
+
result
|
451
|
+
end
|
452
|
+
.,.,
|
453
|
+
|
454
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 115)
|
455
|
+
def _reduce_33(val, _values, result)
|
456
|
+
result << val[1]
|
457
|
+
@rulelinenos << @lineno
|
458
|
+
|
459
|
+
result
|
460
|
+
end
|
461
|
+
.,.,
|
462
|
+
|
463
|
+
# reduce 34 omitted
|
464
|
+
|
465
|
+
# reduce 35 omitted
|
466
|
+
|
467
|
+
# reduce 36 omitted
|
468
|
+
|
469
|
+
# reduce 37 omitted
|
470
|
+
|
471
|
+
# reduce 38 omitted
|
472
|
+
|
473
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 128)
|
474
|
+
def _reduce_39(val, _values, result)
|
475
|
+
result = AllowLine.new(val[0], val[3])
|
476
|
+
|
477
|
+
result
|
478
|
+
end
|
479
|
+
.,.,
|
480
|
+
|
481
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 133)
|
482
|
+
def _reduce_40(val, _values, result)
|
483
|
+
result = DisallowLine.new(val[0], val[3])
|
484
|
+
|
485
|
+
result
|
486
|
+
end
|
487
|
+
.,.,
|
488
|
+
|
489
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 138)
|
490
|
+
def _reduce_41(val, _values, result)
|
491
|
+
result = CrawlDelayLine.new(val[0], val[3])
|
492
|
+
|
493
|
+
result
|
494
|
+
end
|
495
|
+
.,.,
|
496
|
+
|
497
|
+
module_eval(<<'.,.,', 'robotstxt.ry', 143)
|
498
|
+
def _reduce_42(val, _values, result)
|
499
|
+
result = ExtentionLine.new(val[0], val[3])
|
500
|
+
|
501
|
+
result
|
502
|
+
end
|
503
|
+
.,.,
|
504
|
+
|
505
|
+
# reduce 43 omitted
|
506
|
+
|
507
|
+
# reduce 44 omitted
|
508
|
+
|
509
|
+
def _reduce_none(val, _values, result)
|
510
|
+
val[0]
|
511
|
+
end
|
512
|
+
|
513
|
+
end # class Parser
|
514
|
+
|
515
|
+
def initialize(site, records, options = nil)
|
516
|
+
super()
|
517
|
+
@site = site
|
518
|
+
@options = options || {}
|
519
|
+
@last_checked = nil
|
520
|
+
|
521
|
+
@target = @options[:target]
|
522
|
+
@sitemaps = @options[:sitemaps] || []
|
523
|
+
|
524
|
+
if records && !records.empty?
|
525
|
+
@records, defaults = [], []
|
526
|
+
records.each { |record|
|
527
|
+
if record.default?
|
528
|
+
defaults << record
|
529
|
+
elsif !@target || record.match?(@target)
|
530
|
+
@records << record
|
531
|
+
end
|
532
|
+
}
|
533
|
+
@records.concat(defaults)
|
534
|
+
else
|
535
|
+
@records = []
|
536
|
+
end
|
537
|
+
end
|
538
|
+
|
539
|
+
attr_reader :site, :sitemaps
|
540
|
+
|
541
|
+
def target(user_agent = nil)
|
542
|
+
if user_agent
|
543
|
+
raise ArgumentError, "this instance is targeted for #{@target}" if @target
|
544
|
+
user_agent
|
545
|
+
else
|
546
|
+
raise ArgumentError, "user_agent is mandatory for an untargeted instance" if !@target
|
547
|
+
@target
|
548
|
+
end
|
549
|
+
end
|
550
|
+
private :target
|
551
|
+
|
552
|
+
def find_record(user_agent = nil)
|
553
|
+
user_agent = target(user_agent)
|
554
|
+
@records.find { |record|
|
555
|
+
record.match?(user_agent)
|
556
|
+
}
|
557
|
+
end
|
558
|
+
private :find_record
|
559
|
+
|
560
|
+
def allow?(request_uri, user_agent = nil)
|
561
|
+
record = find_record(user_agent) or return true
|
562
|
+
allow = record.allow?(request_uri)
|
563
|
+
if @last_checked and delay = record.delay
|
564
|
+
delay -= Time.now - @last_checked
|
565
|
+
sleep delay if delay > 0
|
566
|
+
end
|
567
|
+
@last_checked = Time.now
|
568
|
+
return allow
|
569
|
+
end
|
570
|
+
|
571
|
+
def options(user_agent = nil)
|
572
|
+
record = find_record(user_agent) or return {}
|
573
|
+
record.options
|
574
|
+
end
|
575
|
+
|
576
|
+
# A single robots.txt record: the user-agent patterns it applies to
# plus the rules (access control, crawl delay, extension fields)
# listed under them.
class Record
  # +agentlines+ is a list of objects responding to #pattern.
  # +rulelines+ is a list of rule line objects, or nil when the
  # record has agent lines only (the grammar yields nil for an empty
  # rule list).
  def initialize(agentlines, rulelines)
    @patterns = agentlines.map { |agentline| agentline.pattern }
    @acls = []
    @delay = nil
    @options = {}
    (rulelines || []).each { |ruleline|
      case ruleline
      when AccessControlLine
        @acls << ruleline
      when CrawlDelayLine
        @delay = ruleline.delay
      else
        # Any other field is kept as an option under its lower-cased
        # name; a later line with the same name overrides.
        @options[ruleline.token.downcase] = ruleline.value
      end
    }
    # Longest rule value first; for equal lengths Allow lines come
    # before Disallow lines, so the first match in #allow? wins.
    @acls = @acls.sort_by { |acl|
      [-acl.value.length, acl.is_a?(AllowLine) ? 0 : 1]
    }
  end

  attr_reader :delay, :options

  # Tests if any of the record's agent patterns matches +user_agent+.
  def match?(user_agent)
    @patterns.any? { |pattern|
      pattern.match(user_agent)
    }
  end

  # Tests if this record has an empty (match-all) agent pattern.
  def default?
    @patterns.include?(//)
  end

  # Returns the verdict of the first access control line matching
  # +request_uri+, or true if none matches.
  def allow?(request_uri)
    @acls.each { |acl|
      return acl.allow? if acl.match?(request_uri)
    }
    true
  end
end
|
622
|
+
|
623
|
+
class Line
|
624
|
+
def initialize(token, value)
|
625
|
+
@token = token
|
626
|
+
@value = value
|
627
|
+
compile
|
628
|
+
end
|
629
|
+
|
630
|
+
attr_reader :token, :value
|
631
|
+
|
632
|
+
def compile
|
633
|
+
self
|
634
|
+
end
|
635
|
+
end
|
636
|
+
|
637
|
+
class AgentLine < Line
|
638
|
+
def compile
|
639
|
+
if @value == '*'
|
640
|
+
@pattern = //
|
641
|
+
else
|
642
|
+
@pattern = Regexp.new(Regexp.quote(@value), Regexp::IGNORECASE)
|
643
|
+
end
|
644
|
+
self
|
645
|
+
end
|
646
|
+
|
647
|
+
attr_reader :pattern
|
648
|
+
end
|
649
|
+
|
650
|
+
class AccessControlLine < Line
|
651
|
+
def compile
|
652
|
+
@empty = @value.empty?
|
653
|
+
re_src = '\A'
|
654
|
+
s = StringScanner.new(@value)
|
655
|
+
until s.eos?
|
656
|
+
if t = s.scan(/[^%*$]+/)
|
657
|
+
re_src << Regexp.quote(t)
|
658
|
+
elsif t = s.scan(/%([0-9a-f]{2})/i)
|
659
|
+
c = s[1].to_i(16)
|
660
|
+
if c == 0x2f
|
661
|
+
re_src << '%2[fF]'
|
662
|
+
else
|
663
|
+
re_src << Regexp.quote('%c' % c)
|
664
|
+
end
|
665
|
+
elsif t = s.scan(/\*/)
|
666
|
+
re_src << '.*'
|
667
|
+
elsif t = s.scan(/\$/)
|
668
|
+
re_src << '\z'
|
669
|
+
break
|
670
|
+
else
|
671
|
+
raise ParseError, 'unexpected characters: %s' % s.check(/.*/)
|
672
|
+
end
|
673
|
+
end
|
674
|
+
@pattern = Regexp.new(re_src, Regexp::MULTILINE)
|
675
|
+
self
|
676
|
+
end
|
677
|
+
|
678
|
+
def match?(request_uri)
|
679
|
+
!@empty && !!@pattern.match(request_uri)
|
680
|
+
end
|
681
|
+
end
|
682
|
+
|
683
|
+
class AllowLine < AccessControlLine
|
684
|
+
def allow?
|
685
|
+
true
|
686
|
+
end
|
687
|
+
end
|
688
|
+
|
689
|
+
class DisallowLine < AccessControlLine
|
690
|
+
def allow?
|
691
|
+
false
|
692
|
+
end
|
693
|
+
end
|
694
|
+
|
695
|
+
class CrawlDelayLine < Line
|
696
|
+
def compile
|
697
|
+
case @value
|
698
|
+
when /\A((0|[1-9][0-9]*)\.[0-9]+)/
|
699
|
+
@delay = @value.to_f
|
700
|
+
when /\A(0|[1-9][0-9]*)/
|
701
|
+
@delay = @value.to_i
|
702
|
+
else
|
703
|
+
@delay = nil
|
704
|
+
end
|
705
|
+
self
|
706
|
+
end
|
707
|
+
|
708
|
+
attr_reader :delay
|
709
|
+
end
|
710
|
+
|
711
|
+
class ExtentionLine < Line
|
712
|
+
end
|
713
|
+
end
|
714
|
+
end
|