webrobots 0.0.6 → 0.0.7

data/VERSION CHANGED
@@ -1 +1 @@
-0.0.6
+0.0.7
data/lib/webrobots/robotstxt.rb CHANGED
@@ -8,7 +8,6 @@ require 'racc/parser.rb'
 
 
 require 'strscan'
-require 'uri'
 
 class WebRobots
   class Error < StandardError
@@ -20,7 +19,7 @@ class WebRobots
   class RobotsTxt
     class Parser < Racc::Parser
 
-module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
+module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 163)
 
   def initialize(target = nil)
     super()
@@ -34,10 +33,10 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
   end
 
   KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
-  RE_KNOWN_TOKENS = /#{KNOWN_TOKENS.join('|')}/i
+  RE_KNOWN_TOKENS = /\A(#{KNOWN_TOKENS.map { |t| Regexp.quote(t) }.join('|')})\z/i
 
   def parse(input, site)
-    @q = []
+    @q ||= []
     @errors = []
     @lineno = 1
     @site = site
@@ -71,14 +70,15 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
           parse_error @lineno, "unexpected characters: %s" % s.check(/.*/)
         end
         value_expected = false
-      else
-        if t = s.scan(RE_KNOWN_TOKENS)
+      elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
+        case t
+        when RE_KNOWN_TOKENS
           @q << [t.downcase, t]
-        elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
-          @q << [:TOKEN, t]
         else
-          parse_error "unexpected characters: %s" % s.check(/.*/)
+          @q << [:TOKEN, t]
         end
+      else
+        parse_error "unexpected characters: %s" % s.check(/.*/)
       end
     end
   end
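A note on the tokenizer change above: the old `RE_KNOWN_TOKENS` was unanchored, so `StringScanner#scan` could consume a known field name that was merely the prefix of a longer token (for example `Disallow-Not`), leaving stray characters behind. The scanner now grabs the whole token first and classifies it with the anchored regex, so unknown field names fall through cleanly to `:TOKEN`; the `Disallow-Not: /` line added to the test fixture below exercises exactly this case. A minimal sketch of the difference, using the constants from the diff:

```ruby
require 'strscan'

KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
OLD_RE = /#{KNOWN_TOKENS.join('|')}/i
NEW_RE = /\A(#{KNOWN_TOKENS.map { |t| Regexp.quote(t) }.join('|')})\z/i

s = StringScanner.new('Disallow-Not')
s.scan(OLD_RE)               #=> "Disallow" -- only a prefix is consumed; "-Not" is left over
'Disallow-Not'.match(NEW_RE) #=> nil -- the whole token must be a known field name
'crawl-delay'.match(NEW_RE)  #=> MatchData -- still case-insensitive
```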
@@ -90,6 +90,8 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 164)
     do_parse
   rescue Racc::ParseError => e
     raise ParseError, e.message
+  ensure
+    @q.clear
   end
 
   def next_token
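The `@q ||= []` and `ensure @q.clear` changes go together: the token queue now survives across calls on the reused parser instance, and the `ensure` guarantees that a parse which raises cannot leave stale tokens behind for the next call. A toy illustration of the failure mode this guards against (the class and names here are illustrative, not the gem's API):

```ruby
class ToyParser
  def parse(tokens)
    @q ||= []                  # reuse the queue across calls
    @q.concat(tokens)
    raise 'parse error' if @q.include?(:bad)
    @q.map(&:to_s)
  ensure
    @q.clear                   # without this, a failed parse leaks tokens into the next one
  end
end

parser = ToyParser.new
parser.parse([:a, :bad]) rescue nil   # raises, but the queue is still cleared
parser.parse([:b])                    #=> ["b"], not ["a", "bad", "b"]
```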
@@ -283,7 +285,7 @@ Racc_token_to_s_table = [
   "opt_blanklines",
   "body",
   "@1",
-  "blocks",
+  "records",
   "blanklines",
   "blankline",
   "eol",
@@ -614,12 +616,8 @@ Disallow: /
         @options[ruleline.token.downcase] = ruleline.value
       end
     }
-    @acls.sort! { |a, b|
-      [
-        b.value.length, b.is_a?(AllowLine) ? 1 : 0
-      ] <=> [
-        a.value.length, a.is_a?(AllowLine) ? 1 : 0
-      ]
+    @acls.replace @acls.sort_by { |x|
+      [-x.value.length, x.is_a?(AllowLine) ? -1 : 0]
     }
   end
 
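The `sort_by` rewrite keeps the same ordering as the old comparison block but states it directly as a sort key: longer (more specific) paths come first, and at equal length an Allow line beats a Disallow line. `replace` mutates the array in place, which also works on Ruby 1.8, where `Array#sort_by!` is unavailable. A sketch with stand-in structs (`AllowLine`/`DisallowLine` here mirror the gem's rule-line classes):

```ruby
AllowLine    = Struct.new(:value)
DisallowLine = Struct.new(:value)

acls = [DisallowLine.new('/'), AllowLine.new('/'), DisallowLine.new('/private')]
acls.replace acls.sort_by { |x| [-x.value.length, x.is_a?(AllowLine) ? -1 : 0] }
acls.map { |x| [x.class.name, x.value] }
#=> [["DisallowLine", "/private"], ["AllowLine", "/"], ["DisallowLine", "/"]]
```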
data/lib/webrobots/robotstxt.ry CHANGED
@@ -15,7 +15,7 @@ rule
       }
 
   body :
-    | blocks
+    | records
       opt_blanklines
 
   opt_blanklines :
@@ -48,7 +48,7 @@ rule
        @sitemaps << val[3]
      }
 
-  blocks : record
+  records : record
      {
        result = []
        result << val[0]
@@ -57,13 +57,13 @@ rule
      {
        result = []
      }
-    | blocks
+    | records
       blanklines
       record
      {
        result << val[2]
      }
-    | blocks
+    | records
       blanklines
       rulelines
      {
@@ -72,7 +72,7 @@ rule
          [@site.to_s, @rulelinenos[i], line.token] if $VERBOSE
        }
      }
-    | blocks
+    | records
       blanklines
       commentblock
 
@@ -150,7 +150,6 @@ rule
 ---- header
 
 require 'strscan'
-require 'uri'
 
 class WebRobots
   class Error < StandardError
@@ -174,10 +173,10 @@ class WebRobots
   end
 
   KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
-  RE_KNOWN_TOKENS = /#{KNOWN_TOKENS.join('|')}/i
+  RE_KNOWN_TOKENS = /\A(#{KNOWN_TOKENS.map { |t| Regexp.quote(t) }.join('|')})\z/i
 
   def parse(input, site)
-    @q = []
+    @q ||= []
     @errors = []
     @lineno = 1
     @site = site
@@ -211,14 +210,15 @@ class WebRobots
           parse_error @lineno, "unexpected characters: %s" % s.check(/.*/)
         end
         value_expected = false
-      else
-        if t = s.scan(RE_KNOWN_TOKENS)
+      elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
+        case t
+        when RE_KNOWN_TOKENS
           @q << [t.downcase, t]
-        elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
-          @q << [:TOKEN, t]
         else
-          parse_error "unexpected characters: %s" % s.check(/.*/)
+          @q << [:TOKEN, t]
         end
+      else
+        parse_error "unexpected characters: %s" % s.check(/.*/)
       end
     end
   end
@@ -230,6 +230,8 @@ class WebRobots
     do_parse
   rescue Racc::ParseError => e
     raise ParseError, e.message
+  ensure
+    @q.clear
   end
 
   def next_token
@@ -344,12 +346,8 @@ Disallow: /
         @options[ruleline.token.downcase] = ruleline.value
       end
     }
-    @acls.sort! { |a, b|
-      [
-        b.value.length, b.is_a?(AllowLine) ? 1 : 0
-      ] <=> [
-        a.value.length, a.is_a?(AllowLine) ? 1 : 0
-      ]
+    @acls.replace @acls.sort_by { |x|
+      [-x.value.length, x.is_a?(AllowLine) ? -1 : 0]
     }
   end
 
data/lib/webrobots.rb CHANGED
@@ -1,6 +1,7 @@
 require 'webrobots/robotstxt'
 require 'uri'
 require 'net/https'
+require 'thread'
 if defined?(Nokogiri)
   require 'webrobots/nokogiri'
 else
@@ -19,6 +20,7 @@ class WebRobots
   def initialize(user_agent, options = nil)
     @user_agent = user_agent
     @parser = RobotsTxt::Parser.new(user_agent)
+    @parser_mutex = Mutex.new
 
     options ||= {}
     @http_get = options[:http_get] || method(:http_get)
@@ -133,7 +135,9 @@ class WebRobots
     rescue => e
       return RobotsTxt.unfetchable(site, e, @user_agent)
     end
-    @parser.parse!(body, site)
+    @parser_mutex.synchronize {
+      @parser.parse!(body, site)
+    }
   end
 
   def http_get(uri)
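The new mutex closes a thread-safety hole: a single `RobotsTxt::Parser` instance is shared by every caller, and a Racc-generated parser keeps per-parse state in instance variables (`@q`, `@lineno`, `@site`, ...), so two threads parsing concurrently could interleave each other's state. `require 'thread'` is what provided `Mutex` on the Rubies of the day. A minimal sketch of the pattern, with an illustrative shared object standing in for the parser:

```ruby
require 'thread'   # provides Mutex (a no-op require on modern Rubies)

shared = Object.new
lock   = Mutex.new

10.times.map { |i|
  Thread.new {
    lock.synchronize {
      # Only one thread at a time may drive the shared, stateful object;
      # without the lock the ivar could change under us mid-"parse".
      shared.instance_variable_set(:@lineno, i)
      sleep 0.001
      raise 'state interleaved!' unless shared.instance_variable_get(:@lineno) == i
    }
  }
}.each(&:join)
```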
data/test/test_webrobots.rb CHANGED
@@ -89,6 +89,7 @@ class TestWebRobots < Test::Unit::TestCase
 # Punish evil bots
 User-Agent: evil
 Disallow: /
+Disallow-Not: / # parser teaser
 
 User-Agent: good
 # Be generous to good bots
@@ -172,7 +173,9 @@ Disallow: /~joe/index.html
     end
 
     should "properly restrict access" do
-      assert @robots_good.allowed?('http://www.example.org/index.html')
+      assert_nothing_raised {
+        assert @robots_good.allowed?('http://www.example.org/index.html')
+      }
       assert !@robots_good.allowed?('http://www.example.org/2heavy/index.php')
       assert @robots_good.allowed?('http://www.example.org/2HEAVY/index.php')
       assert !@robots_good.allowed?(URI('http://www.example.org/2heavy/index.php'))
data/webrobots.gemspec CHANGED
@@ -5,11 +5,11 @@
 
 Gem::Specification.new do |s|
   s.name = %q{webrobots}
-  s.version = "0.0.6"
+  s.version = "0.0.7"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Akinori MUSHA"]
-  s.date = %q{2011-01-09}
+  s.date = %q{2011-02-01}
   s.description = %q{This library helps write robots.txt compliant web robots in Ruby.
 }
   s.email = %q{knu@idaemons.org}
@@ -35,7 +35,7 @@ Gem::Specification.new do |s|
   ]
   s.licenses = ["2-clause BSDL"]
   s.require_paths = ["lib"]
-  s.rubygems_version = %q{1.4.1}
+  s.rubygems_version = %q{1.4.2}
   s.summary = %q{A Ruby library to help write robots.txt compliant web robots}
   s.test_files = [
     "test/helper.rb",
metadata CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: webrobots
 version: !ruby/object:Gem::Version
-  hash: 19
+  hash: 17
   prerelease:
   segments:
   - 0
   - 0
-  - 6
-  version: 0.0.6
+  - 7
+  version: 0.0.7
 platform: ruby
 authors:
 - Akinori MUSHA
@@ -15,10 +15,12 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2011-01-09 00:00:00 +09:00
+date: 2011-02-01 00:00:00 +09:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: racc
   type: :runtime
   version_requirements: &id001 !ruby/object:Gem::Requirement
     none: false
@@ -30,9 +32,9 @@ dependencies:
     - 0
     version: "0"
   requirement: *id001
-  prerelease: false
-  name: racc
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: nokogiri
   type: :runtime
   version_requirements: &id002 !ruby/object:Gem::Requirement
     none: false
@@ -46,9 +48,9 @@ dependencies:
     - 4
     version: 1.4.4
   requirement: *id002
-  prerelease: false
-  name: nokogiri
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: shoulda
   type: :development
   version_requirements: &id003 !ruby/object:Gem::Requirement
     none: false
@@ -60,9 +62,9 @@ dependencies:
     - 0
     version: "0"
   requirement: *id003
-  prerelease: false
-  name: shoulda
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: bundler
   type: :development
   version_requirements: &id004 !ruby/object:Gem::Requirement
     none: false
@@ -76,9 +78,9 @@ dependencies:
     - 0
     version: 1.0.0
   requirement: *id004
-  prerelease: false
-  name: bundler
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: jeweler
   type: :development
   version_requirements: &id005 !ruby/object:Gem::Requirement
     none: false
@@ -92,9 +94,9 @@ dependencies:
     - 1
     version: 1.5.1
   requirement: *id005
-  prerelease: false
-  name: jeweler
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: rcov
   type: :development
   version_requirements: &id006 !ruby/object:Gem::Requirement
     none: false
@@ -106,9 +108,9 @@ dependencies:
     - 0
     version: "0"
   requirement: *id006
-  prerelease: false
-  name: rcov
 - !ruby/object:Gem::Dependency
+  prerelease: false
+  name: racc
   type: :development
   version_requirements: &id007 !ruby/object:Gem::Requirement
     none: false
@@ -120,8 +122,6 @@ dependencies:
     - 0
     version: "0"
   requirement: *id007
-  prerelease: false
-  name: racc
 description: |
   This library helps write robots.txt compliant web robots in Ruby.
 
@@ -178,7 +178,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 
 rubyforge_project:
-rubygems_version: 1.4.1
+rubygems_version: 1.4.2
 signing_key:
 specification_version: 3
 summary: A Ruby library to help write robots.txt compliant web robots