webrobots 0.0.6 → 0.0.7
- data/VERSION +1 -1
- data/lib/webrobots/robotstxt.rb +14 -16
- data/lib/webrobots/robotstxt.ry +17 -19
- data/lib/webrobots.rb +5 -1
- data/test/test_webrobots.rb +4 -1
- data/webrobots.gemspec +3 -3
- metadata +19 -19
data/VERSION
CHANGED
@@ -1 +1 @@
-0.0.6
+0.0.7
data/lib/webrobots/robotstxt.rb
CHANGED
@@ -8,7 +8,6 @@ require 'racc/parser.rb'
 
 
 require 'strscan'
-require 'uri'
 
 class WebRobots
   class Error < StandardError
@@ -20,7 +19,7 @@ class WebRobots
   class RobotsTxt
     class Parser < Racc::Parser
 
-module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
+module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 163)
 
   def initialize(target = nil)
     super()
@@ -34,10 +33,10 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
   end
 
   KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
-  RE_KNOWN_TOKENS =
+  RE_KNOWN_TOKENS = /\A(#{KNOWN_TOKENS.map { |t| Regexp.quote(t) }.join('|')})\z/i
 
   def parse(input, site)
-    @q
+    @q ||= []
     @errors = []
     @lineno = 1
     @site = site
@@ -71,14 +70,15 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
           parse_error @lineno, "unexpected characters: %s" % s.check(/.*/)
         end
         value_expected = false
-
-
+      elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
+        case t
+        when RE_KNOWN_TOKENS
           @q << [t.downcase, t]
-      elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
-        @q << [:TOKEN, t]
         else
-
+          @q << [:TOKEN, t]
         end
+      else
+        parse_error "unexpected characters: %s" % s.check(/.*/)
       end
     end
   end
@@ -90,6 +90,8 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
     do_parse
   rescue Racc::ParseError => e
     raise ParseError, e.message
+  ensure
+    @q.clear
   end
 
   def next_token
@@ -283,7 +285,7 @@ Racc_token_to_s_table = [
   "opt_blanklines",
   "body",
   "@1",
-  "
+  "records",
   "blanklines",
   "blankline",
   "eol",
@@ -614,12 +616,8 @@ Disallow: /
         @options[ruleline.token.downcase] = ruleline.value
       end
     }
-    @acls.
-      [
-        b.value.length, b.is_a?(AllowLine) ? 1 : 0
-      ] <=> [
-        a.value.length, a.is_a?(AllowLine) ? 1 : 0
-      ]
+    @acls.replace @acls.sort_by { |x|
+      [-x.value.length, x.is_a?(AllowLine) ? -1 : 0]
     }
   end
 
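The ACL ordering rewrite in the last hunk is the interesting part: sort_by keys each line once by [-path length, Allow-first] instead of re-evaluating a hand-built <=> block on every comparison, so longer (more specific) paths are consulted first and an Allow entry is checked before a Disallow entry of equal length. A minimal sketch of that ordering, using simplified stand-in classes rather than the library's real line classes:

# Stand-ins for illustration only; the real classes live in robotstxt.ry.
AclLine   = Struct.new(:value)   # value is the path prefix
AllowLine = Class.new(AclLine)

acls = [AclLine.new('/'), AllowLine.new('/private/ok'), AclLine.new('/private')]
sorted = acls.sort_by { |x| [-x.value.length, x.is_a?(AllowLine) ? -1 : 0] }
sorted.map(&:value)   # => ["/private/ok", "/private", "/"]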
data/lib/webrobots/robotstxt.ry
CHANGED
@@ -15,7 +15,7 @@ rule
             }
 
   body :
-
+          | records
             opt_blanklines
 
   opt_blanklines :
@@ -48,7 +48,7 @@ rule
               @sitemaps << val[3]
             }
 
-
+  records : record
             {
              result = []
              result << val[0]
@@ -57,13 +57,13 @@ rule
             {
              result = []
             }
-
+          | records
             blanklines
             record
             {
              result << val[2]
             }
-
+          | records
             blanklines
             rulelines
             {
@@ -72,7 +72,7 @@ rule
                [@site.to_s, @rulelinenos[i], line.token] if $VERBOSE
              }
             }
-
+          | records
             blanklines
             commentblock
 
@@ -150,7 +150,6 @@ rule
 ---- header
 
 require 'strscan'
-require 'uri'
 
 class WebRobots
   class Error < StandardError
@@ -174,10 +173,10 @@ class WebRobots
   end
 
   KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
-  RE_KNOWN_TOKENS =
+  RE_KNOWN_TOKENS = /\A(#{KNOWN_TOKENS.map { |t| Regexp.quote(t) }.join('|')})\z/i
 
   def parse(input, site)
-    @q
+    @q ||= []
     @errors = []
     @lineno = 1
     @site = site
@@ -211,14 +210,15 @@ class WebRobots
           parse_error @lineno, "unexpected characters: %s" % s.check(/.*/)
         end
         value_expected = false
-
-
+      elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
+        case t
+        when RE_KNOWN_TOKENS
           @q << [t.downcase, t]
-      elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
-        @q << [:TOKEN, t]
         else
-
+          @q << [:TOKEN, t]
         end
+      else
+        parse_error "unexpected characters: %s" % s.check(/.*/)
       end
     end
   end
@@ -230,6 +230,8 @@ class WebRobots
     do_parse
   rescue Racc::ParseError => e
     raise ParseError, e.message
+  ensure
+    @q.clear
   end
 
   def next_token
@@ -344,12 +346,8 @@ Disallow: /
         @options[ruleline.token.downcase] = ruleline.value
       end
     }
-    @acls.
-      [
-        b.value.length, b.is_a?(AllowLine) ? 1 : 0
-      ] <=> [
-        a.value.length, a.is_a?(AllowLine) ? 1 : 0
-      ]
+    @acls.replace @acls.sort_by { |x|
+      [-x.value.length, x.is_a?(AllowLine) ? -1 : 0]
     }
  end
 
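Both the grammar source and its generated parser receive the same scanner change: a field name is scanned once and then classified against the anchored, case-insensitive RE_KNOWN_TOKENS, so an unrecognized directive becomes a generic TOKEN instead of derailing the lexer. A rough, self-contained illustration of that classification step (not the parser itself):

KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
RE_KNOWN_TOKENS = /\A(#{KNOWN_TOKENS.map { |t| Regexp.quote(t) }.join('|')})\z/i

def classify(field)
  case field
  when RE_KNOWN_TOKENS then [field.downcase, field]  # known directive token
  else [:TOKEN, field]                               # unknown extension field
  end
end

classify('DISALLOW')      # => ["disallow", "DISALLOW"]  (case-insensitive match)
classify('Disallow-Not')  # => [:TOKEN, "Disallow-Not"]  (\A...\z keeps this from matching Disallow)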
data/lib/webrobots.rb
CHANGED
@@ -1,6 +1,7 @@
 require 'webrobots/robotstxt'
 require 'uri'
 require 'net/https'
+require 'thread'
 if defined?(Nokogiri)
   require 'webrobots/nokogiri'
 else
@@ -19,6 +20,7 @@ class WebRobots
   def initialize(user_agent, options = nil)
     @user_agent = user_agent
     @parser = RobotsTxt::Parser.new(user_agent)
+    @parser_mutex = Mutex.new
 
     options ||= {}
     @http_get = options[:http_get] || method(:http_get)
@@ -133,7 +135,9 @@ class WebRobots
     rescue => e
       return RobotsTxt.unfetchable(site, e, @user_agent)
     end
-    @
+    @parser_mutex.synchronize {
+      @parser.parse!(body, site)
+    }
   end
 
   def http_get(uri)
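With the shared Racc parser now entered only inside @parser_mutex.synchronize, one WebRobots instance can be queried from several threads. A minimal usage sketch, assuming the robots.txt body is stubbed through the :http_get option (the user agent string, URLs, and stub body below are made up for illustration):

require 'webrobots'

robots = WebRobots.new('MyBot/1.0',
  :http_get => lambda { |uri| "User-Agent: *\nDisallow: /private/\n" })

threads = %w[http://example.com/ http://example.org/private/x].map { |url|
  Thread.new { [url, robots.allowed?(url)] }
}
threads.map(&:value)
# => [["http://example.com/", true], ["http://example.org/private/x", false]]

Note that only the @parser.parse! call is serialized; the @http_get call above it still runs outside the lock.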
data/test/test_webrobots.rb
CHANGED
@@ -89,6 +89,7 @@ class TestWebRobots < Test::Unit::TestCase
 # Punish evil bots
 User-Agent: evil
 Disallow: /
+Disallow-Not: / # parser teaser
 
 User-Agent: good
 # Be generous to good bots
@@ -172,7 +173,9 @@ Disallow: /~joe/index.html
     end
 
     should "properly restrict access" do
-
+      assert_nothing_raised {
+        assert @robots_good.allowed?('http://www.example.org/index.html')
+      }
       assert !@robots_good.allowed?('http://www.example.org/2heavy/index.php')
      assert @robots_good.allowed?('http://www.example.org/2HEAVY/index.php')
      assert !@robots_good.allowed?(URI('http://www.example.org/2heavy/index.php'))
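The new Disallow-Not line in the fixture is a deliberate parser teaser: with the anchored RE_KNOWN_TOKENS it is lexed as a plain TOKEN and treated as an ordinary extension field, so allowed? keeps answering rather than raising. A quick way to reproduce that outside the test suite (again stubbing :http_get; the robots.txt body and expected results below are a trimmed-down, hypothetical variant of the fixture):

require 'webrobots'

robotstxt = <<-TXT
User-Agent: *
Disallow: /2heavy/
Disallow-Not: / # parser teaser
TXT

robots = WebRobots.new('good bot', :http_get => lambda { |uri| robotstxt })
robots.allowed?('http://www.example.org/index.html')        # => true
robots.allowed?('http://www.example.org/2heavy/index.php')  # => false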
data/webrobots.gemspec
CHANGED
@@ -5,11 +5,11 @@
 
 Gem::Specification.new do |s|
   s.name = %q{webrobots}
-  s.version = "0.0.6"
+  s.version = "0.0.7"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Akinori MUSHA"]
-  s.date = %q{2011-01
+  s.date = %q{2011-02-01}
   s.description = %q{This library helps write robots.txt compliant web robots in Ruby.
 }
   s.email = %q{knu@idaemons.org}
@@ -35,7 +35,7 @@ Gem::Specification.new do |s|
   ]
   s.licenses = ["2-clause BSDL"]
   s.require_paths = ["lib"]
-  s.rubygems_version = %q{1.4.
+  s.rubygems_version = %q{1.4.2}
   s.summary = %q{A Ruby library to help write robots.txt compliant web robots}
   s.test_files = [
     "test/helper.rb",
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: webrobots
 version: !ruby/object:Gem::Version
-  hash:
+  hash: 17
   prerelease:
   segments:
   - 0
   - 0
-  - 6
-  version: 0.0.6
+  - 7
+  version: 0.0.7
 platform: ruby
 authors:
 - Akinori MUSHA
@@ -15,10 +15,12 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2011-01
+date: 2011-02-01 00:00:00 +09:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: racc
   type: :runtime
   version_requirements: &id001 !ruby/object:Gem::Requirement
     none: false
@@ -30,9 +32,9 @@ dependencies:
        - 0
        version: "0"
   requirement: *id001
-  prerelease: false
-  name: racc
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: nokogiri
   type: :runtime
   version_requirements: &id002 !ruby/object:Gem::Requirement
     none: false
@@ -46,9 +48,9 @@ dependencies:
        - 4
        version: 1.4.4
   requirement: *id002
-  prerelease: false
-  name: nokogiri
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: shoulda
   type: :development
   version_requirements: &id003 !ruby/object:Gem::Requirement
     none: false
@@ -60,9 +62,9 @@ dependencies:
        - 0
        version: "0"
   requirement: *id003
-  prerelease: false
-  name: shoulda
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: bundler
   type: :development
   version_requirements: &id004 !ruby/object:Gem::Requirement
     none: false
@@ -76,9 +78,9 @@ dependencies:
        - 0
        version: 1.0.0
   requirement: *id004
-  prerelease: false
-  name: bundler
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: jeweler
   type: :development
   version_requirements: &id005 !ruby/object:Gem::Requirement
     none: false
@@ -92,9 +94,9 @@ dependencies:
        - 1
        version: 1.5.1
   requirement: *id005
-  prerelease: false
-  name: jeweler
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: rcov
   type: :development
   version_requirements: &id006 !ruby/object:Gem::Requirement
     none: false
@@ -106,9 +108,9 @@ dependencies:
        - 0
        version: "0"
   requirement: *id006
-  prerelease: false
-  name: rcov
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: racc
   type: :development
   version_requirements: &id007 !ruby/object:Gem::Requirement
     none: false
@@ -120,8 +122,6 @@ dependencies:
        - 0
        version: "0"
   requirement: *id007
-  prerelease: false
-  name: racc
 description: |
   This library helps write robots.txt compliant web robots in Ruby.
 
@@ -178,7 +178,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 
 rubyforge_project:
-rubygems_version: 1.4.
+rubygems_version: 1.4.2
 signing_key:
 specification_version: 3
 summary: A Ruby library to help write robots.txt compliant web robots