webrobots 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/webrobots/robotstxt.rb +14 -16
- data/lib/webrobots/robotstxt.ry +17 -19
- data/lib/webrobots.rb +5 -1
- data/test/test_webrobots.rb +4 -1
- data/webrobots.gemspec +3 -3
- metadata +19 -19
data/VERSION
CHANGED
@@ -1 +1 @@
-0.0.6
+0.0.7
data/lib/webrobots/robotstxt.rb
CHANGED
@@ -8,7 +8,6 @@ require 'racc/parser.rb'
 
 
 require 'strscan'
-require 'uri'
 
 class WebRobots
   class Error < StandardError
@@ -20,7 +19,7 @@ class WebRobots
   class RobotsTxt
     class Parser < Racc::Parser
 
-module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
+module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 163)
 
   def initialize(target = nil)
     super()
@@ -34,10 +33,10 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
   end
 
   KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
-  RE_KNOWN_TOKENS =
+  RE_KNOWN_TOKENS = /\A(#{KNOWN_TOKENS.map { |t| Regexp.quote(t) }.join('|')})\z/i
 
   def parse(input, site)
-    @q
+    @q ||= []
     @errors = []
     @lineno = 1
     @site = site
@@ -71,14 +70,15 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
           parse_error @lineno, "unexpected characters: %s" % s.check(/.*/)
         end
         value_expected = false
-
-
+      elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
+        case t
+        when RE_KNOWN_TOKENS
           @q << [t.downcase, t]
-      elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
-        @q << [:TOKEN, t]
         else
-
+          @q << [:TOKEN, t]
         end
+      else
+        parse_error "unexpected characters: %s" % s.check(/.*/)
       end
     end
   end
@@ -90,6 +90,8 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
     do_parse
   rescue Racc::ParseError => e
     raise ParseError, e.message
+  ensure
+    @q.clear
   end
 
   def next_token
@@ -283,7 +285,7 @@ Racc_token_to_s_table = [
   "opt_blanklines",
   "body",
   "@1",
-  "
+  "records",
   "blanklines",
   "blankline",
   "eol",
@@ -614,12 +616,8 @@ Disallow: /
         @options[ruleline.token.downcase] = ruleline.value
       end
     }
-    @acls.
-      [
-        b.value.length, b.is_a?(AllowLine) ? 1 : 0
-      ] <=> [
-        a.value.length, a.is_a?(AllowLine) ? 1 : 0
-      ]
+    @acls.replace @acls.sort_by { |x|
+      [-x.value.length, x.is_a?(AllowLine) ? -1 : 0]
     }
   end
 
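Two changes in this generated parser are worth calling out. RE_KNOWN_TOKENS is now built from KNOWN_TOKENS with Regexp.quote and anchored as /\A(...)\z/i, and the Allow/Disallow list is rebuilt with sort_by instead of the previous comparator-based ordering: rules are sorted longest path first, with Allow lines placed ahead of Disallow lines of the same length. A minimal sketch of that ordering, using stand-in structs rather than the gem's real AllowLine/DisallowLine classes:

# Stand-ins for the parser's AllowLine/DisallowLine; only the sort key is taken from the diff.
AllowLine    = Struct.new(:value)
DisallowLine = Struct.new(:value)

acls = [
  DisallowLine.new('/2heavy/'),
  AllowLine.new('/2heavy/'),
  DisallowLine.new('/'),
]

# Longest path first; Allow before Disallow when lengths tie.
sorted = acls.sort_by { |x| [-x.value.length, x.is_a?(AllowLine) ? -1 : 0] }
sorted.each { |x| puts "#{x.class} #{x.value}" }
# AllowLine /2heavy/
# DisallowLine /2heavy/
# DisallowLine /

With this key the most specific rule comes first in the list, which is what the old two-array <=> comparison expressed less directly.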
data/lib/webrobots/robotstxt.ry
CHANGED
@@ -15,7 +15,7 @@ rule
     }
 
   body :
-
+     | records
       opt_blanklines
 
   opt_blanklines :
@@ -48,7 +48,7 @@ rule
       @sitemaps << val[3]
     }
 
-
+  records : record
     {
       result = []
       result << val[0]
@@ -57,13 +57,13 @@ rule
     {
       result = []
     }
-
+     | records
       blanklines
       record
     {
       result << val[2]
     }
-
+     | records
       blanklines
       rulelines
     {
@@ -72,7 +72,7 @@ rule
         [@site.to_s, @rulelinenos[i], line.token] if $VERBOSE
       }
     }
-
+     | records
       blanklines
       commentblock
 
@@ -150,7 +150,6 @@ rule
 ---- header
 
 require 'strscan'
-require 'uri'
 
 class WebRobots
   class Error < StandardError
@@ -174,10 +173,10 @@ class WebRobots
     end
 
     KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
-    RE_KNOWN_TOKENS =
+    RE_KNOWN_TOKENS = /\A(#{KNOWN_TOKENS.map { |t| Regexp.quote(t) }.join('|')})\z/i
 
     def parse(input, site)
-      @q
+      @q ||= []
       @errors = []
       @lineno = 1
       @site = site
@@ -211,14 +210,15 @@ class WebRobots
             parse_error @lineno, "unexpected characters: %s" % s.check(/.*/)
           end
           value_expected = false
-
-
+        elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
+          case t
+          when RE_KNOWN_TOKENS
             @q << [t.downcase, t]
-        elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
-          @q << [:TOKEN, t]
           else
-
+            @q << [:TOKEN, t]
           end
+        else
+          parse_error "unexpected characters: %s" % s.check(/.*/)
         end
       end
     end
@@ -230,6 +230,8 @@ class WebRobots
       do_parse
     rescue Racc::ParseError => e
       raise ParseError, e.message
+    ensure
+      @q.clear
     end
 
     def next_token
@@ -344,12 +346,8 @@ Disallow: /
         @options[ruleline.token.downcase] = ruleline.value
       end
     }
-    @acls.
-      [
-        b.value.length, b.is_a?(AllowLine) ? 1 : 0
-      ] <=> [
-        a.value.length, a.is_a?(AllowLine) ? 1 : 0
-      ]
+    @acls.replace @acls.sort_by { |x|
+      [-x.value.length, x.is_a?(AllowLine) ? -1 : 0]
     }
   end
 
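In the grammar source the repeated record list is now named records, and the lexer classifies fields in two steps: it first scans any run of token characters and only then checks the result against RE_KNOWN_TOKENS, falling back to a generic :TOKEN for anything else. A small self-contained sketch of that classification; only RE_KNOWN_TOKENS is taken verbatim from the diff, the field names fed in are made up:

# Rebuild the anchored, case-insensitive known-field regex as in the diff.
KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
RE_KNOWN_TOKENS = /\A(#{KNOWN_TOKENS.map { |t| Regexp.quote(t) }.join('|')})\z/i

%w[User-Agent disallow Disallow-Not X-Custom-Field].each do |t|
  case t
  when RE_KNOWN_TOKENS
    puts "#{t} -> [#{t.downcase.inspect}, #{t.inspect}]"  # known robots.txt field
  else
    puts "#{t} -> [:TOKEN, #{t.inspect}]"                 # generic token
  end
end

Because the whole run of characters is scanned before it is matched, a field like the test suite's Disallow-Not: teaser ends up as an ordinary :TOKEN rather than a stray fragment.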
data/lib/webrobots.rb
CHANGED
@@ -1,6 +1,7 @@
 require 'webrobots/robotstxt'
 require 'uri'
 require 'net/https'
+require 'thread'
 if defined?(Nokogiri)
   require 'webrobots/nokogiri'
 else
@@ -19,6 +20,7 @@ class WebRobots
   def initialize(user_agent, options = nil)
     @user_agent = user_agent
     @parser = RobotsTxt::Parser.new(user_agent)
+    @parser_mutex = Mutex.new
 
     options ||= {}
     @http_get = options[:http_get] || method(:http_get)
@@ -133,7 +135,9 @@ class WebRobots
     rescue => e
       return RobotsTxt.unfetchable(site, e, @user_agent)
     end
-    @
+    @parser_mutex.synchronize {
+      @parser.parse!(body, site)
+    }
   end
 
   def http_get(uri)
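webrobots.rb now requires 'thread' and guards the single shared RobotsTxt::Parser with a Mutex, so concurrent robots.txt fetches from different threads serialize their calls into the one Racc parser, which keeps per-parse state such as the @q token queue seen above. A rough sketch of the same guarded-parser pattern; SharedParser and its parse! are stand-ins, not the gem's classes:

require 'thread' # Mutex; built in on Ruby 1.9+, loaded explicitly for 1.8

# Stand-in for a parser object that keeps per-parse state.
class SharedParser
  def initialize
    @state = []
  end

  def parse!(body, site)
    @state << site          # interleaved calls from two threads would mix this up
    result = "rules for #{site}"
    @state.clear
    result
  end
end

parser = SharedParser.new
mutex  = Mutex.new

threads = 3.times.map do |i|
  Thread.new do
    # Serialize access, as the new @parser_mutex does around @parser.parse!.
    mutex.synchronize { parser.parse!("User-agent: *\n", "http://example#{i}.org/") }
  end
end
threads.each(&:join)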
data/test/test_webrobots.rb
CHANGED
@@ -89,6 +89,7 @@ class TestWebRobots < Test::Unit::TestCase
 # Punish evil bots
 User-Agent: evil
 Disallow: /
+Disallow-Not: / # parser teaser
 
 User-Agent: good
 # Be generous to good bots
@@ -172,7 +173,9 @@ Disallow: /~joe/index.html
     end
 
     should "properly restrict access" do
-
+      assert_nothing_raised {
+        assert @robots_good.allowed?('http://www.example.org/index.html')
+      }
       assert !@robots_good.allowed?('http://www.example.org/2heavy/index.php')
       assert @robots_good.allowed?('http://www.example.org/2HEAVY/index.php')
       assert !@robots_good.allowed?(URI('http://www.example.org/2heavy/index.php'))
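The updated test feeds the parser an unknown Disallow-Not: field and wraps the first allowed? call in assert_nothing_raised, i.e. a robots.txt containing an unrecognized field is expected to parse cleanly. A hedged usage sketch along the same lines, assuming the :http_get option seen in webrobots.rb is a callable that receives the robots.txt URL and returns its body; the URL and rules here are made up:

require 'webrobots'

robots_txt = <<-TXT
User-Agent: *
Disallow: /private/
Disallow-Not: / # unknown field, lexed as a plain TOKEN
TXT

# :http_get lets the test (and this sketch) supply a canned robots.txt body.
robots = WebRobots.new('mybot/1.0', :http_get => lambda { |uri| robots_txt })
p robots.allowed?('http://www.example.org/index.html')          # expected: true
p robots.allowed?('http://www.example.org/private/secret.html') # expected: false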
data/webrobots.gemspec
CHANGED
@@ -5,11 +5,11 @@
 
 Gem::Specification.new do |s|
   s.name = %q{webrobots}
-  s.version = "0.0.6"
+  s.version = "0.0.7"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Akinori MUSHA"]
-  s.date = %q{2011-01
+  s.date = %q{2011-02-01}
   s.description = %q{This library helps write robots.txt compliant web robots in Ruby.
 }
   s.email = %q{knu@idaemons.org}
@@ -35,7 +35,7 @@ Gem::Specification.new do |s|
   ]
   s.licenses = ["2-clause BSDL"]
   s.require_paths = ["lib"]
-  s.rubygems_version = %q{1.4.
+  s.rubygems_version = %q{1.4.2}
   s.summary = %q{A Ruby library to help write robots.txt compliant web robots}
   s.test_files = [
     "test/helper.rb",
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: webrobots
 version: !ruby/object:Gem::Version
-  hash:
+  hash: 17
   prerelease:
   segments:
   - 0
   - 0
-  - 6
-  version: 0.0.6
+  - 7
+  version: 0.0.7
 platform: ruby
 authors:
 - Akinori MUSHA
@@ -15,10 +15,12 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2011-01
+date: 2011-02-01 00:00:00 +09:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: racc
   type: :runtime
   version_requirements: &id001 !ruby/object:Gem::Requirement
     none: false
@@ -30,9 +32,9 @@ dependencies:
     - 0
     version: "0"
   requirement: *id001
-  prerelease: false
-  name: racc
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: nokogiri
   type: :runtime
   version_requirements: &id002 !ruby/object:Gem::Requirement
     none: false
@@ -46,9 +48,9 @@ dependencies:
     - 4
     version: 1.4.4
   requirement: *id002
-  prerelease: false
-  name: nokogiri
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: shoulda
   type: :development
   version_requirements: &id003 !ruby/object:Gem::Requirement
     none: false
@@ -60,9 +62,9 @@ dependencies:
     - 0
     version: "0"
   requirement: *id003
-  prerelease: false
-  name: shoulda
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: bundler
   type: :development
   version_requirements: &id004 !ruby/object:Gem::Requirement
     none: false
@@ -76,9 +78,9 @@ dependencies:
     - 0
     version: 1.0.0
   requirement: *id004
-  prerelease: false
-  name: bundler
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: jeweler
   type: :development
   version_requirements: &id005 !ruby/object:Gem::Requirement
     none: false
@@ -92,9 +94,9 @@ dependencies:
     - 1
     version: 1.5.1
   requirement: *id005
-  prerelease: false
-  name: jeweler
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: rcov
   type: :development
   version_requirements: &id006 !ruby/object:Gem::Requirement
     none: false
@@ -106,9 +108,9 @@ dependencies:
     - 0
     version: "0"
   requirement: *id006
-  prerelease: false
-  name: rcov
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: racc
   type: :development
   version_requirements: &id007 !ruby/object:Gem::Requirement
     none: false
@@ -120,8 +122,6 @@ dependencies:
     - 0
     version: "0"
   requirement: *id007
-  prerelease: false
-  name: racc
 description: |
   This library helps write robots.txt compliant web robots in Ruby.
 
@@ -178,7 +178,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 
 rubyforge_project:
-rubygems_version: 1.4.
+rubygems_version: 1.4.2
 signing_key:
 specification_version: 3
 summary: A Ruby library to help write robots.txt compliant web robots