webrobots 0.0.12 → 0.0.13

data/Gemfile CHANGED
@@ -2,14 +2,16 @@ source "http://rubygems.org"
 # Add dependencies required to use your gem here.
 # Example:
 #   gem "activesupport", ">= 2.3.5"
-gem "nokogiri", ">= 1.4.4"
 
 # Add dependencies to develop your gem here.
 # Include everything needed to run rake, tests, features, etc.
 group :development do
   gem "racc", ">= 0"
   gem "shoulda", ">= 0"
-  gem "bundler", "~> 1.0.0"
+  gem "bundler", ">= 1.0.0"
   gem "jeweler", "~> 1.6.4"
-  gem "rcov", ">= 0"
+  gem "rcov", "~> 0.9.11"
+
+  # To test the webrobots/nokogiri module.
+  gem "nokogiri", ">= 1.4.4"
 end
data/Gemfile.lock CHANGED
@@ -7,18 +7,18 @@ GEM
       git (>= 1.2.5)
       rake
     nokogiri (1.5.0)
-    racc (1.4.6)
-    rake (0.9.2)
-    rcov (0.9.10)
+    racc (1.4.7)
+    rake (0.9.2.2)
+    rcov (0.9.11)
     shoulda (2.11.3)
 
 PLATFORMS
   ruby
 
 DEPENDENCIES
-  bundler (~> 1.0.0)
+  bundler (>= 1.0.0)
   jeweler (~> 1.6.4)
   nokogiri (>= 1.4.4)
   racc
-  rcov
+  rcov (~> 0.9.11)
   shoulda
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
-Copyright (c) 2010, 2011 Akinori MUSHA
+Copyright (c) 2010, 2011, 2012 Akinori MUSHA
 
 All rights reserved.
 
data/README.rdoc CHANGED
@@ -21,7 +21,6 @@ This is a library to help write robots.txt compliant web robots.
 == Requirements
 
 - Ruby 1.8.7 or 1.9.2+
-- Nokogiri 1.4.4+
 
 == Contributing to webrobots
 
@@ -35,6 +34,5 @@ This is a library to help write robots.txt compliant web robots.
 
 == Copyright
 
-Copyright (c) 2010, 2011 Akinori MUSHA. See LICENSE.txt for
+Copyright (c) 2010, 2011, 2012 Akinori MUSHA. See LICENSE.txt for
 further details.
-
data/VERSION CHANGED
@@ -1 +1 @@
-0.0.12
+0.0.13
data/lib/webrobots.rb CHANGED
@@ -145,8 +145,12 @@ class WebRobots
     referer = nil
     10.times {
       http = Net::HTTP.new(uri.host, uri.port)
-      http.use_ssl = uri.is_a?(URI::HTTPS)
-      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+      if http.use_ssl = uri.is_a?(URI::HTTPS)
+        http.verify_mode = OpenSSL::SSL::VERIFY_PEER
+        http.cert_store = OpenSSL::X509::Store.new.tap { |store|
+          store.set_default_paths
+        }
+      end
       header = { 'User-Agent' => @user_agent }
       header['Referer'] = referer if referer
       # header is destroyed by this in ruby 1.9.2!
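With this change the fetcher verifies TLS certificates against the system's trusted CA store instead of disabling verification with VERIFY_NONE. A minimal standalone sketch of the same pattern (the URL and User-Agent below are hypothetical):

    require 'net/http'
    require 'uri'
    require 'openssl'

    uri = URI('https://www.example.org/robots.txt')  # hypothetical URL
    http = Net::HTTP.new(uri.host, uri.port)
    if http.use_ssl = uri.is_a?(URI::HTTPS)
      # Reject invalid certificates instead of accepting anything.
      http.verify_mode = OpenSSL::SSL::VERIFY_PEER
      # Trust the CA certificates installed on the system.
      http.cert_store = OpenSSL::X509::Store.new.tap { |store|
        store.set_default_paths
      }
    end
    response = http.start { |h| h.get(uri.request_uri, 'User-Agent' => 'MyBot/1.0') }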
data/lib/webrobots/robotstxt.rb CHANGED
@@ -1,6 +1,6 @@
 #
 # DO NOT MODIFY!!!!
-# This file is automatically generated by Racc 1.4.6
+# This file is automatically generated by Racc 1.4.7
 # from Racc grammer file "".
 #
 
@@ -14,12 +14,23 @@ class WebRobots
   end
 
   class ParseError < Error
+    # The site's root URI
+    attr_reader :site
+
+    def initialize(message, site)
+      @message = message
+      @site = site
+    end
+
+    def to_s
+      @message
+    end
   end
 
   class RobotsTxt
     class Parser < Racc::Parser
 
-module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 158)
+module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 169)
 
 def initialize(target = nil)
   super()
@@ -90,7 +101,7 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 158)
 
   do_parse
 rescue Racc::ParseError => e
-  raise ParseError, e.message
+  raise ParseError.new(e.message, @site)
 ensure
   @q.clear
 end
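ParseError now carries the root URI of the offending site, so callers can report which robots.txt failed to parse. A minimal usage sketch (the bot name and site are hypothetical; a malformed robots.txt is assumed):

    require 'webrobots'

    robots = WebRobots.new('MyBot/1.0')
    url = 'http://www.example.org/'
    robots.allowed?(url)        # a broken robots.txt does not block; access defaults to allowed
    if (e = robots.error(url))  # any ParseError is recorded per site
      warn "robots.txt for #{e.site} could not be parsed: #{e}"
    end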
@@ -606,7 +617,7 @@ Disallow: /
       else
         @options[ruleline.token.downcase] = ruleline.value
       end
-    }
+    } if rulelines
     @acls.replace @acls.sort_by { |x|
       [-x.value.length, x.is_a?(AllowLine) ? -1 : 0]
     }
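The `if rulelines` guard appears to cover records that declare a User-agent but contain no rule lines at all, in which case `rulelines` is nil. One such robots.txt, taken from the new test added below:

    User-agent: *
    Sitemap: http://site1.example.com/text/sitemap.xml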
data/lib/webrobots/robotstxt.ry CHANGED
@@ -151,6 +151,17 @@ class WebRobots
   end
 
   class ParseError < Error
+    # The site's root URI
+    attr_reader :site
+
+    def initialize(message, site)
+      @message = message
+      @site = site
+    end
+
+    def to_s
+      @message
+    end
   end
 
   class RobotsTxt
@@ -225,7 +236,7 @@ class WebRobots
 
   do_parse
 rescue Racc::ParseError => e
-  raise ParseError, e.message
+  raise ParseError.new(e.message, @site)
 ensure
   @q.clear
 end
@@ -341,7 +352,7 @@ Disallow: /
       else
         @options[ruleline.token.downcase] = ruleline.value
       end
-    }
+    } if rulelines
     @acls.replace @acls.sort_by { |x|
       [-x.value.length, x.is_a?(AllowLine) ? -1 : 0]
     }
data/test/test_webrobots.rb CHANGED
@@ -327,7 +327,9 @@ Allow: /2heavy/*.html
       robots.reset(url)
 
       assert robots.allowed?(url)
-      assert_instance_of WebRobots::ParseError, robots.error(url)
+      error = robots.error(url)
+      assert_instance_of WebRobots::ParseError, error
+      assert_equal URI('http://www.example.org/'), error.site
       assert_raise(WebRobots::ParseError) {
         robots.error!(url)
       }
@@ -472,6 +474,7 @@ Disallow: /
     should "be parsed for major sites" do
       assert_nothing_raised {
         assert !@testbot.allowed?("http://www.google.com/search")
+        assert !@testbot.allowed?("https://www.google.com/search")
         assert !@testbot.allowed?("http://www.google.com/news/section?pz=1&cf=all&ned=jp&topic=y&ict=ln")
         assert @testbot.allowed?("http://www.google.com/news/directory?pz=1&cf=all&ned=us&hl=en&sort=users&category=6")
       }
@@ -655,4 +658,25 @@ TXT
       assert !@fetched
     end
   end
+
+  context "robots.txt with just user-agent & sitemap and no blank line between them" do
+    setup do
+      @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri|
+          res = case uri.to_s
+                when 'http://site1.example.com/robots.txt'
+                  <<-'TXT'
+User-agent: *
+Sitemap: http://site1.example.com/text/sitemap.xml
+                  TXT
+                else
+                  raise "#{uri} is not supposed to be fetched"
+                end
+        })
+    end
+
+    should "be properly parsed" do
+      assert @robots.allowed?("http://site1.example.com/foo")
+      assert_equal(["http://site1.example.com/text/sitemap.xml"], @robots.sitemaps("http://site1.example.com/"))
+    end
+  end
 end
data/webrobots.gemspec CHANGED
@@ -5,11 +5,11 @@
 
 Gem::Specification.new do |s|
   s.name = "webrobots"
-  s.version = "0.0.12"
+  s.version = "0.0.13"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Akinori MUSHA"]
-  s.date = "2011-09-26"
+  s.date = "2012-01-24"
   s.description = "This library helps write robots.txt compliant web robots in Ruby.\n"
   s.email = "knu@idaemons.org"
   s.extra_rdoc_files = [
@@ -35,34 +35,34 @@ Gem::Specification.new do |s|
   s.homepage = "https://github.com/knu/webrobots"
   s.licenses = ["2-clause BSDL"]
   s.require_paths = ["lib"]
-  s.rubygems_version = "1.8.10"
+  s.rubygems_version = "1.8.15"
   s.summary = "A Ruby library to help write robots.txt compliant web robots"
 
   if s.respond_to? :specification_version then
     s.specification_version = 3
 
     if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
-      s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.4"])
       s.add_development_dependency(%q<racc>, [">= 0"])
       s.add_development_dependency(%q<shoulda>, [">= 0"])
-      s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
+      s.add_development_dependency(%q<bundler>, [">= 1.0.0"])
       s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
-      s.add_development_dependency(%q<rcov>, [">= 0"])
+      s.add_development_dependency(%q<rcov>, ["~> 0.9.11"])
+      s.add_development_dependency(%q<nokogiri>, [">= 1.4.4"])
     else
-      s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
       s.add_dependency(%q<racc>, [">= 0"])
       s.add_dependency(%q<shoulda>, [">= 0"])
-      s.add_dependency(%q<bundler>, ["~> 1.0.0"])
+      s.add_dependency(%q<bundler>, [">= 1.0.0"])
       s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
-      s.add_dependency(%q<rcov>, [">= 0"])
+      s.add_dependency(%q<rcov>, ["~> 0.9.11"])
+      s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
     end
   else
-    s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
    s.add_dependency(%q<racc>, [">= 0"])
    s.add_dependency(%q<shoulda>, [">= 0"])
-    s.add_dependency(%q<bundler>, ["~> 1.0.0"])
+    s.add_dependency(%q<bundler>, [">= 1.0.0"])
    s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
-    s.add_dependency(%q<rcov>, [">= 0"])
+    s.add_dependency(%q<rcov>, ["~> 0.9.11"])
+    s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
   end
 end
 
metadata CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: webrobots
 version: !ruby/object:Gem::Version
-  hash: 7
+  hash: 5
   prerelease:
   segments:
   - 0
   - 0
-  - 12
-  version: 0.0.12
+  - 13
+  version: 0.0.13
 platform: ruby
 authors:
 - Akinori MUSHA
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2011-09-26 00:00:00 Z
+date: 2012-01-24 00:00:00 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: &id001 !ruby/object:Gem::Requirement
@@ -23,16 +23,14 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        hash: 15
+        hash: 3
         segments:
-        - 1
-        - 4
-        - 4
-        version: 1.4.4
+        - 0
+        version: "0"
   version_requirements: *id001
-  name: nokogiri
+  name: racc
   prerelease: false
-  type: :runtime
+  type: :development
 - !ruby/object:Gem::Dependency
   requirement: &id002 !ruby/object:Gem::Requirement
     none: false
@@ -44,7 +42,7 @@ dependencies:
        - 0
        version: "0"
   version_requirements: *id002
-  name: racc
+  name: shoulda
   prerelease: false
   type: :development
 - !ruby/object:Gem::Dependency
@@ -53,12 +51,14 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        hash: 3
+        hash: 23
         segments:
+        - 1
         - 0
-        version: "0"
+        - 0
+        version: 1.0.0
   version_requirements: *id003
-  name: shoulda
+  name: bundler
   prerelease: false
   type: :development
 - !ruby/object:Gem::Dependency
@@ -67,14 +67,14 @@ dependencies:
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        hash: 23
+        hash: 7
         segments:
         - 1
-        - 0
-        - 0
-        version: 1.0.0
+        - 6
+        - 4
+        version: 1.6.4
   version_requirements: *id004
-  name: bundler
+  name: jeweler
   prerelease: false
   type: :development
 - !ruby/object:Gem::Dependency
@@ -83,14 +83,14 @@ dependencies:
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        hash: 7
+        hash: 45
         segments:
-        - 1
-        - 6
-        - 4
-        version: 1.6.4
+        - 0
+        - 9
+        - 11
+        version: 0.9.11
   version_requirements: *id005
-  name: jeweler
+  name: rcov
   prerelease: false
   type: :development
 - !ruby/object:Gem::Dependency
@@ -99,12 +99,14 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        hash: 3
+        hash: 15
         segments:
-        - 0
-        version: "0"
+        - 1
+        - 4
+        - 4
+        version: 1.4.4
   version_requirements: *id006
-  name: rcov
+  name: nokogiri
   prerelease: false
   type: :development
 description: |
@@ -162,7 +164,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 
 rubyforge_project:
-rubygems_version: 1.8.10
+rubygems_version: 1.8.15
 signing_key:
 specification_version: 3
 summary: A Ruby library to help write robots.txt compliant web robots