webrobots 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
-0.0.6
+0.0.7
data/lib/webrobots/robotstxt.rb CHANGED
@@ -8,7 +8,6 @@ require 'racc/parser.rb'
 
 
 require 'strscan'
-require 'uri'
 
 class WebRobots
   class Error < StandardError
@@ -20,7 +19,7 @@ class WebRobots
   class RobotsTxt
     class Parser < Racc::Parser
 
-module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
+module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 163)
 
   def initialize(target = nil)
     super()
@@ -34,10 +33,10 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
   end
 
   KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
-  RE_KNOWN_TOKENS = /#{KNOWN_TOKENS.join('|')}/i
+  RE_KNOWN_TOKENS = /\A(#{KNOWN_TOKENS.map { |t| Regexp.quote(t) }.join('|')})\z/i
 
   def parse(input, site)
-    @q = []
+    @q ||= []
     @errors = []
     @lineno = 1
     @site = site
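The regex rewrite above is the behavioral core of this hunk: the 0.0.6 pattern was unanchored, so a known directive could match as a mere prefix of a longer field name, and it interpolated token names without escaping. A minimal sketch of the difference (plain Ruby, not the gem's code):

    KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]

    old_re = /#{KNOWN_TOKENS.join('|')}/i                                   # 0.0.6
    new_re = /\A(#{KNOWN_TOKENS.map { |t| Regexp.quote(t) }.join('|')})\z/i # 0.0.7

    "Disallow-Not" =~ old_re #=> 0   (false positive on the "Disallow" prefix)
    "Disallow-Not" =~ new_re #=> nil (a whole-token match is now required)
    "Crawl-delay"  =~ new_re #=> 0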
@@ -71,14 +70,15 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
           parse_error @lineno, "unexpected characters: %s" % s.check(/.*/)
         end
         value_expected = false
-      else
-        if t = s.scan(RE_KNOWN_TOKENS)
+      elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
+        case t
+        when RE_KNOWN_TOKENS
           @q << [t.downcase, t]
-        elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
-          @q << [:TOKEN, t]
         else
-          parse_error "unexpected characters: %s" % s.check(/.*/)
+          @q << [:TOKEN, t]
         end
+      else
+        parse_error "unexpected characters: %s" % s.check(/.*/)
       end
     end
   end
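The scanner restructuring complements the anchored regex: instead of letting the known-token pattern nibble a prefix off a longer field name, 0.0.7 scans one complete token first and only then classifies it. A self-contained sketch of that scan-then-classify order (a simplification, not the gem's full tokenizer):

    require 'strscan'

    KNOWN_TOKENS    = %w[User-agent Allow Disallow Crawl-delay Sitemap]
    RE_KNOWN_TOKENS = /\A(#{KNOWN_TOKENS.map { |t| Regexp.quote(t) }.join('|')})\z/i

    def classify(field)
      s = StringScanner.new(field)
      return nil unless t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
      case t
      when RE_KNOWN_TOKENS then [t.downcase, t] # known directive
      else                      [:TOKEN, t]     # unknown field, left for the grammar to skip
      end
    end

    classify('Disallow')     #=> ["disallow", "Disallow"]
    classify('Disallow-Not') #=> [:TOKEN, "Disallow-Not"]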
@@ -90,6 +90,8 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
     do_parse
   rescue Racc::ParseError => e
     raise ParseError, e.message
+  ensure
+    @q.clear
   end
 
   def next_token
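The two queue changes act as a pair: @q ||= [] lets a single parser instance be reused for many robots.txt files, and the new ensure clause guarantees that tokens left over from a failed parse never leak into the next one. A toy model of that lifecycle (ParserLike is a stand-in, not the gem's class):

    class ParserLike
      class ParseError < StandardError; end

      def parse(tokens)
        @q ||= []        # one queue, reused across calls
        @q.concat(tokens)
        @q.each { |t| raise ParseError, "bad token #{t}" if t == :bad }
      ensure
        @q.clear         # stale tokens are drained even when parsing fails
      end
    end

    p = ParserLike.new
    begin
      p.parse([:ok, :bad])   # raises, but ensure still clears the queue
    rescue ParserLike::ParseError
    end
    p.parse([:ok])           # starts from an empty queue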
@@ -283,7 +285,7 @@ Racc_token_to_s_table = [
   "opt_blanklines",
   "body",
   "@1",
-  "blocks",
+  "records",
   "blanklines",
   "blankline",
   "eol",
@@ -614,12 +616,8 @@ Disallow: /
         @options[ruleline.token.downcase] = ruleline.value
       end
     }
-    @acls.sort! { |a, b|
-      [
-        b.value.length, b.is_a?(AllowLine) ? 1 : 0
-      ] <=> [
-        a.value.length, a.is_a?(AllowLine) ? 1 : 0
-      ]
+    @acls.replace @acls.sort_by { |x|
+      [-x.value.length, x.is_a?(AllowLine) ? -1 : 0]
     }
   end
 
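The rewritten sort is equivalent to the old pairwise comparator but computes each key once: ACL lines are ordered by descending path length, with Allow ranking ahead of Disallow on ties, so the most specific rule wins. Using replace keeps the update in place, as sort! did. An illustration with simplified stand-ins for the gem's line classes:

    AclLine      = Struct.new(:value)   # value holds the rule's path prefix
    AllowLine    = Class.new(AclLine)
    DisallowLine = Class.new(AclLine)

    acls = [DisallowLine.new("/2heavy"),
            AllowLine.new("/2heavy"),
            AllowLine.new("/2heavy/pub")]

    acls.sort_by { |x| [-x.value.length, x.is_a?(AllowLine) ? -1 : 0] }
        .map { |x| [x.class.name, x.value] }
    #=> [["AllowLine", "/2heavy/pub"], ["AllowLine", "/2heavy"], ["DisallowLine", "/2heavy"]]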
data/lib/webrobots/robotstxt.ry CHANGED
@@ -15,7 +15,7 @@ rule
       }
 
   body :
-       | blocks
+       | records
          opt_blanklines
 
   opt_blanklines :
@@ -48,7 +48,7 @@ rule
          @sitemaps << val[3]
        }
 
-  blocks : record
+  records : record
           {
             result = []
             result << val[0]
@@ -57,13 +57,13 @@ rule
           {
             result = []
           }
-       | blocks
+       | records
           blanklines
           record
           {
             result << val[2]
           }
-       | blocks
+       | records
           blanklines
           rulelines
           {
@@ -72,7 +72,7 @@ rule
               [@site.to_s, @rulelinenos[i], line.token] if $VERBOSE
             }
           }
-       | blocks
+       | records
           blanklines
           commentblock
 
@@ -150,7 +150,6 @@ rule
 ---- header
 
 require 'strscan'
-require 'uri'
 
 class WebRobots
   class Error < StandardError
@@ -174,10 +173,10 @@ class WebRobots
   end
 
   KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
-  RE_KNOWN_TOKENS = /#{KNOWN_TOKENS.join('|')}/i
+  RE_KNOWN_TOKENS = /\A(#{KNOWN_TOKENS.map { |t| Regexp.quote(t) }.join('|')})\z/i
 
   def parse(input, site)
-    @q = []
+    @q ||= []
     @errors = []
     @lineno = 1
     @site = site
@@ -211,14 +210,15 @@ class WebRobots
           parse_error @lineno, "unexpected characters: %s" % s.check(/.*/)
         end
         value_expected = false
-      else
-        if t = s.scan(RE_KNOWN_TOKENS)
+      elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
+        case t
+        when RE_KNOWN_TOKENS
           @q << [t.downcase, t]
-        elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
-          @q << [:TOKEN, t]
         else
-          parse_error "unexpected characters: %s" % s.check(/.*/)
+          @q << [:TOKEN, t]
         end
+      else
+        parse_error "unexpected characters: %s" % s.check(/.*/)
       end
     end
   end
@@ -230,6 +230,8 @@ class WebRobots
     do_parse
   rescue Racc::ParseError => e
     raise ParseError, e.message
+  ensure
+    @q.clear
   end
 
   def next_token
@@ -344,12 +346,8 @@ Disallow: /
         @options[ruleline.token.downcase] = ruleline.value
       end
     }
-    @acls.sort! { |a, b|
-      [
-        b.value.length, b.is_a?(AllowLine) ? 1 : 0
-      ] <=> [
-        a.value.length, a.is_a?(AllowLine) ? 1 : 0
-      ]
+    @acls.replace @acls.sort_by { |x|
+      [-x.value.length, x.is_a?(AllowLine) ? -1 : 0]
     }
   end
 
data/lib/webrobots.rb CHANGED
@@ -1,6 +1,7 @@
 require 'webrobots/robotstxt'
 require 'uri'
 require 'net/https'
+require 'thread'
 if defined?(Nokogiri)
   require 'webrobots/nokogiri'
 else
@@ -19,6 +20,7 @@ class WebRobots
   def initialize(user_agent, options = nil)
     @user_agent = user_agent
     @parser = RobotsTxt::Parser.new(user_agent)
+    @parser_mutex = Mutex.new
 
     options ||= {}
     @http_get = options[:http_get] || method(:http_get)
@@ -133,7 +135,9 @@ class WebRobots
     rescue => e
       return RobotsTxt.unfetchable(site, e, @user_agent)
     end
-    @parser.parse!(body, site)
+    @parser_mutex.synchronize {
+      @parser.parse!(body, site)
+    }
   end
 
   def http_get(uri)
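This is the thread-safety half of the release: each WebRobots instance owns a single stateful RobotsTxt::Parser, so concurrent robots.txt fetches must not drive it simultaneously. A hypothetical multi-threaded caller that 0.0.7 makes safe (bot name and URLs are illustrative):

    require 'webrobots'

    robots = WebRobots.new('MyBot/1.0')

    # Each thread may trigger a robots.txt fetch and parse for its site;
    # inside WebRobots, parse! now runs under @parser_mutex, one thread at a time.
    %w[http://www.example.org/ http://www.example.com/].map { |url|
      Thread.new { robots.allowed?(url) }
    }.each(&:join)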
data/test/test_webrobots.rb CHANGED
@@ -89,6 +89,7 @@ class TestWebRobots < Test::Unit::TestCase
 # Punish evil bots
 User-Agent: evil
 Disallow: /
+Disallow-Not: / # parser teaser
 
 User-Agent: good
 # Be generous to good bots
@@ -172,7 +173,9 @@ Disallow: /~joe/index.html
     end
 
     should "properly restrict access" do
-      assert @robots_good.allowed?('http://www.example.org/index.html')
+      assert_nothing_raised {
+        assert @robots_good.allowed?('http://www.example.org/index.html')
+      }
       assert !@robots_good.allowed?('http://www.example.org/2heavy/index.php')
       assert @robots_good.allowed?('http://www.example.org/2HEAVY/index.php')
      assert !@robots_good.allowed?(URI('http://www.example.org/2heavy/index.php'))
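The new fixture line is the regression test for the tokenizer fix: an unknown field sharing a prefix with a known directive must be skipped rather than abort the parse, which is what assert_nothing_raised now checks. A sketch of feeding that fixture to the parser directly (presumably this raised under 0.0.6):

    require 'webrobots'
    require 'uri'

    txt = "User-Agent: evil\nDisallow: /\nDisallow-Not: / # parser teaser\n"

    parser = WebRobots::RobotsTxt::Parser.new('evil')
    parser.parse!(txt, URI('http://www.example.org/'))  # 0.0.7 parses; the unknown field is ignored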
data/webrobots.gemspec CHANGED
@@ -5,11 +5,11 @@
 
 Gem::Specification.new do |s|
   s.name = %q{webrobots}
-  s.version = "0.0.6"
+  s.version = "0.0.7"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Akinori MUSHA"]
-  s.date = %q{2011-01-09}
+  s.date = %q{2011-02-01}
   s.description = %q{This library helps write robots.txt compliant web robots in Ruby.
 }
   s.email = %q{knu@idaemons.org}
@@ -35,7 +35,7 @@ Gem::Specification.new do |s|
   ]
   s.licenses = ["2-clause BSDL"]
   s.require_paths = ["lib"]
-  s.rubygems_version = %q{1.4.1}
+  s.rubygems_version = %q{1.4.2}
   s.summary = %q{A Ruby library to help write robots.txt compliant web robots}
   s.test_files = [
     "test/helper.rb",
metadata CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: webrobots
 version: !ruby/object:Gem::Version
-  hash: 19
+  hash: 17
   prerelease:
   segments:
   - 0
   - 0
-  - 6
-  version: 0.0.6
+  - 7
+  version: 0.0.7
 platform: ruby
 authors:
 - Akinori MUSHA
@@ -15,10 +15,12 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2011-01-09 00:00:00 +09:00
+date: 2011-02-01 00:00:00 +09:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: racc
   type: :runtime
   version_requirements: &id001 !ruby/object:Gem::Requirement
     none: false
@@ -30,9 +32,9 @@ dependencies:
         - 0
         version: "0"
   requirement: *id001
-  prerelease: false
-  name: racc
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: nokogiri
   type: :runtime
   version_requirements: &id002 !ruby/object:Gem::Requirement
     none: false
@@ -46,9 +48,9 @@ dependencies:
         - 4
         version: 1.4.4
   requirement: *id002
-  prerelease: false
-  name: nokogiri
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: shoulda
   type: :development
   version_requirements: &id003 !ruby/object:Gem::Requirement
     none: false
@@ -60,9 +62,9 @@ dependencies:
         - 0
         version: "0"
   requirement: *id003
-  prerelease: false
-  name: shoulda
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: bundler
   type: :development
   version_requirements: &id004 !ruby/object:Gem::Requirement
     none: false
@@ -76,9 +78,9 @@ dependencies:
         - 0
         version: 1.0.0
   requirement: *id004
-  prerelease: false
-  name: bundler
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: jeweler
   type: :development
   version_requirements: &id005 !ruby/object:Gem::Requirement
     none: false
@@ -92,9 +94,9 @@ dependencies:
         - 1
         version: 1.5.1
   requirement: *id005
-  prerelease: false
-  name: jeweler
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: rcov
   type: :development
   version_requirements: &id006 !ruby/object:Gem::Requirement
     none: false
@@ -106,9 +108,9 @@ dependencies:
         - 0
         version: "0"
   requirement: *id006
-  prerelease: false
-  name: rcov
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: racc
   type: :development
   version_requirements: &id007 !ruby/object:Gem::Requirement
     none: false
@@ -120,8 +122,6 @@ dependencies:
         - 0
         version: "0"
   requirement: *id007
-  prerelease: false
-  name: racc
 description: |
   This library helps write robots.txt compliant web robots in Ruby.
 
@@ -178,7 +178,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 
 rubyforge_project:
-rubygems_version: 1.4.1
+rubygems_version: 1.4.2
 signing_key:
 specification_version: 3
 summary: A Ruby library to help write robots.txt compliant web robots