webrobots 0.0.12 → 0.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +5 -3
- data/Gemfile.lock +5 -5
- data/LICENSE.txt +1 -1
- data/README.rdoc +1 -3
- data/VERSION +1 -1
- data/lib/webrobots.rb +6 -2
- data/lib/webrobots/robotstxt.rb +15 -4
- data/lib/webrobots/robotstxt.ry +13 -2
- data/test/test_webrobots.rb +25 -1
- data/webrobots.gemspec +12 -12
- metadata +33 -31
data/Gemfile
CHANGED
@@ -2,14 +2,16 @@ source "http://rubygems.org"
|
|
2
2
|
# Add dependencies required to use your gem here.
|
3
3
|
# Example:
|
4
4
|
# gem "activesupport", ">= 2.3.5"
|
5
|
-
gem "nokogiri", ">= 1.4.4"
|
6
5
|
|
7
6
|
# Add dependencies to develop your gem here.
|
8
7
|
# Include everything needed to run rake, tests, features, etc.
|
9
8
|
group :development do
|
10
9
|
gem "racc", ">= 0"
|
11
10
|
gem "shoulda", ">= 0"
|
12
|
-
gem "bundler", "
|
11
|
+
gem "bundler", ">= 1.0.0"
|
13
12
|
gem "jeweler", "~> 1.6.4"
|
14
|
-
gem "rcov", "
|
13
|
+
gem "rcov", "~> 0.9.11"
|
14
|
+
|
15
|
+
# To test the webrobots/nokogiri module.
|
16
|
+
gem "nokogiri", ">= 1.4.4"
|
15
17
|
end
|
data/Gemfile.lock
CHANGED
@@ -7,18 +7,18 @@ GEM
|
|
7
7
|
git (>= 1.2.5)
|
8
8
|
rake
|
9
9
|
nokogiri (1.5.0)
|
10
|
-
racc (1.4.
|
11
|
-
rake (0.9.2)
|
12
|
-
rcov (0.9.
|
10
|
+
racc (1.4.7)
|
11
|
+
rake (0.9.2.2)
|
12
|
+
rcov (0.9.11)
|
13
13
|
shoulda (2.11.3)
|
14
14
|
|
15
15
|
PLATFORMS
|
16
16
|
ruby
|
17
17
|
|
18
18
|
DEPENDENCIES
|
19
|
-
bundler (
|
19
|
+
bundler (>= 1.0.0)
|
20
20
|
jeweler (~> 1.6.4)
|
21
21
|
nokogiri (>= 1.4.4)
|
22
22
|
racc
|
23
|
-
rcov
|
23
|
+
rcov (~> 0.9.11)
|
24
24
|
shoulda
|
data/LICENSE.txt
CHANGED
data/README.rdoc
CHANGED
@@ -21,7 +21,6 @@ This is a library to help write robots.txt compliant web robots.
|
|
21
21
|
== Requirements
|
22
22
|
|
23
23
|
- Ruby 1.8.7 or 1.9.2+
|
24
|
-
- Nokogiri 1.4.4+
|
25
24
|
|
26
25
|
== Contributing to webrobots
|
27
26
|
|
@@ -35,6 +34,5 @@ This is a library to help write robots.txt compliant web robots.
|
|
35
34
|
|
36
35
|
== Copyright
|
37
36
|
|
38
|
-
Copyright (c) 2010, 2011 Akinori MUSHA. See LICENSE.txt for
|
37
|
+
Copyright (c) 2010, 2011, 2012 Akinori MUSHA. See LICENSE.txt for
|
39
38
|
further details.
|
40
|
-
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.
|
1
|
+
0.0.13
|
data/lib/webrobots.rb
CHANGED
@@ -145,8 +145,12 @@ class WebRobots
|
|
145
145
|
referer = nil
|
146
146
|
10.times {
|
147
147
|
http = Net::HTTP.new(uri.host, uri.port)
|
148
|
-
http.use_ssl = uri.is_a?(URI::HTTPS)
|
149
|
-
|
148
|
+
if http.use_ssl = uri.is_a?(URI::HTTPS)
|
149
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_PEER
|
150
|
+
http.cert_store = OpenSSL::X509::Store.new.tap { |store|
|
151
|
+
store.set_default_paths
|
152
|
+
}
|
153
|
+
end
|
150
154
|
header = { 'User-Agent' => @user_agent }
|
151
155
|
header['Referer'] = referer if referer
|
152
156
|
# header is destroyed by this in ruby 1.9.2!
|
data/lib/webrobots/robotstxt.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
#
|
2
2
|
# DO NOT MODIFY!!!!
|
3
|
-
# This file is automatically generated by Racc 1.4.
|
3
|
+
# This file is automatically generated by Racc 1.4.7
|
4
4
|
# from Racc grammer file "".
|
5
5
|
#
|
6
6
|
|
@@ -14,12 +14,23 @@ class WebRobots
|
|
14
14
|
end
|
15
15
|
|
16
16
|
class ParseError < Error
|
17
|
+
# The site's root URI
|
18
|
+
attr_reader :site
|
19
|
+
|
20
|
+
def initialize(message, site)
|
21
|
+
@message = message
|
22
|
+
@site = site
|
23
|
+
end
|
24
|
+
|
25
|
+
def to_s
|
26
|
+
@message
|
27
|
+
end
|
17
28
|
end
|
18
29
|
|
19
30
|
class RobotsTxt
|
20
31
|
class Parser < Racc::Parser
|
21
32
|
|
22
|
-
module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry',
|
33
|
+
module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 169)
|
23
34
|
|
24
35
|
def initialize(target = nil)
|
25
36
|
super()
|
@@ -90,7 +101,7 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 158)
|
|
90
101
|
|
91
102
|
do_parse
|
92
103
|
rescue Racc::ParseError => e
|
93
|
-
raise ParseError
|
104
|
+
raise ParseError.new(e.message, @site)
|
94
105
|
ensure
|
95
106
|
@q.clear
|
96
107
|
end
|
@@ -606,7 +617,7 @@ Disallow: /
|
|
606
617
|
else
|
607
618
|
@options[ruleline.token.downcase] = ruleline.value
|
608
619
|
end
|
609
|
-
}
|
620
|
+
} if rulelines
|
610
621
|
@acls.replace @acls.sort_by { |x|
|
611
622
|
[-x.value.length, x.is_a?(AllowLine) ? -1 : 0]
|
612
623
|
}
|
data/lib/webrobots/robotstxt.ry
CHANGED
@@ -151,6 +151,17 @@ class WebRobots
|
|
151
151
|
end
|
152
152
|
|
153
153
|
class ParseError < Error
|
154
|
+
# The site's root URI
|
155
|
+
attr_reader :site
|
156
|
+
|
157
|
+
def initialize(message, site)
|
158
|
+
@message = message
|
159
|
+
@site = site
|
160
|
+
end
|
161
|
+
|
162
|
+
def to_s
|
163
|
+
@message
|
164
|
+
end
|
154
165
|
end
|
155
166
|
|
156
167
|
class RobotsTxt
|
@@ -225,7 +236,7 @@ class WebRobots
|
|
225
236
|
|
226
237
|
do_parse
|
227
238
|
rescue Racc::ParseError => e
|
228
|
-
raise ParseError
|
239
|
+
raise ParseError.new(e.message, @site)
|
229
240
|
ensure
|
230
241
|
@q.clear
|
231
242
|
end
|
@@ -341,7 +352,7 @@ Disallow: /
|
|
341
352
|
else
|
342
353
|
@options[ruleline.token.downcase] = ruleline.value
|
343
354
|
end
|
344
|
-
}
|
355
|
+
} if rulelines
|
345
356
|
@acls.replace @acls.sort_by { |x|
|
346
357
|
[-x.value.length, x.is_a?(AllowLine) ? -1 : 0]
|
347
358
|
}
|
data/test/test_webrobots.rb
CHANGED
@@ -327,7 +327,9 @@ Allow: /2heavy/*.html
|
|
327
327
|
robots.reset(url)
|
328
328
|
|
329
329
|
assert robots.allowed?(url)
|
330
|
-
|
330
|
+
error = robots.error(url)
|
331
|
+
assert_instance_of WebRobots::ParseError, error
|
332
|
+
assert_equal URI('http://www.example.org/'), error.site
|
331
333
|
assert_raise(WebRobots::ParseError) {
|
332
334
|
robots.error!(url)
|
333
335
|
}
|
@@ -472,6 +474,7 @@ Disallow: /
|
|
472
474
|
should "be parsed for major sites" do
|
473
475
|
assert_nothing_raised {
|
474
476
|
assert !@testbot.allowed?("http://www.google.com/search")
|
477
|
+
assert !@testbot.allowed?("https://www.google.com/search")
|
475
478
|
assert !@testbot.allowed?("http://www.google.com/news/section?pz=1&cf=all&ned=jp&topic=y&ict=ln")
|
476
479
|
assert @testbot.allowed?("http://www.google.com/news/directory?pz=1&cf=all&ned=us&hl=en&sort=users&category=6")
|
477
480
|
}
|
@@ -655,4 +658,25 @@ TXT
|
|
655
658
|
assert !@fetched
|
656
659
|
end
|
657
660
|
end
|
661
|
+
|
662
|
+
context "robots.txt with just user-agent & sitemap and no blank line between them" do
|
663
|
+
setup do
|
664
|
+
@robots = WebRobots.new('RandomBot', :http_get => lambda { |uri|
|
665
|
+
res = case uri.to_s
|
666
|
+
when 'http://site1.example.com/robots.txt'
|
667
|
+
<<-'TXT'
|
668
|
+
User-agent: *
|
669
|
+
Sitemap: http://site1.example.com/text/sitemap.xml
|
670
|
+
TXT
|
671
|
+
else
|
672
|
+
raise "#{uri} is not supposed to be fetched"
|
673
|
+
end
|
674
|
+
})
|
675
|
+
end
|
676
|
+
|
677
|
+
should "be properly parsed" do
|
678
|
+
assert @robots.allowed?("http://site1.example.com/foo")
|
679
|
+
assert_equal(["http://site1.example.com/text/sitemap.xml"], @robots.sitemaps("http://site1.example.com/"))
|
680
|
+
end
|
681
|
+
end
|
658
682
|
end
|
data/webrobots.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "webrobots"
|
8
|
-
s.version = "0.0.
|
8
|
+
s.version = "0.0.13"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Akinori MUSHA"]
|
12
|
-
s.date = "
|
12
|
+
s.date = "2012-01-24"
|
13
13
|
s.description = "This library helps write robots.txt compliant web robots in Ruby.\n"
|
14
14
|
s.email = "knu@idaemons.org"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -35,34 +35,34 @@ Gem::Specification.new do |s|
|
|
35
35
|
s.homepage = "https://github.com/knu/webrobots"
|
36
36
|
s.licenses = ["2-clause BSDL"]
|
37
37
|
s.require_paths = ["lib"]
|
38
|
-
s.rubygems_version = "1.8.
|
38
|
+
s.rubygems_version = "1.8.15"
|
39
39
|
s.summary = "A Ruby library to help write robots.txt compliant web robots"
|
40
40
|
|
41
41
|
if s.respond_to? :specification_version then
|
42
42
|
s.specification_version = 3
|
43
43
|
|
44
44
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
45
|
-
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.4"])
|
46
45
|
s.add_development_dependency(%q<racc>, [">= 0"])
|
47
46
|
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
48
|
-
s.add_development_dependency(%q<bundler>, ["
|
47
|
+
s.add_development_dependency(%q<bundler>, [">= 1.0.0"])
|
49
48
|
s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
|
50
|
-
s.add_development_dependency(%q<rcov>, ["
|
49
|
+
s.add_development_dependency(%q<rcov>, ["~> 0.9.11"])
|
50
|
+
s.add_development_dependency(%q<nokogiri>, [">= 1.4.4"])
|
51
51
|
else
|
52
|
-
s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
|
53
52
|
s.add_dependency(%q<racc>, [">= 0"])
|
54
53
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
55
|
-
s.add_dependency(%q<bundler>, ["
|
54
|
+
s.add_dependency(%q<bundler>, [">= 1.0.0"])
|
56
55
|
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
57
|
-
s.add_dependency(%q<rcov>, ["
|
56
|
+
s.add_dependency(%q<rcov>, ["~> 0.9.11"])
|
57
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
|
58
58
|
end
|
59
59
|
else
|
60
|
-
s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
|
61
60
|
s.add_dependency(%q<racc>, [">= 0"])
|
62
61
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
63
|
-
s.add_dependency(%q<bundler>, ["
|
62
|
+
s.add_dependency(%q<bundler>, [">= 1.0.0"])
|
64
63
|
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
65
|
-
s.add_dependency(%q<rcov>, ["
|
64
|
+
s.add_dependency(%q<rcov>, ["~> 0.9.11"])
|
65
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
|
66
66
|
end
|
67
67
|
end
|
68
68
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webrobots
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 5
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
9
|
+
- 13
|
10
|
+
version: 0.0.13
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Akinori MUSHA
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2012-01-24 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
requirement: &id001 !ruby/object:Gem::Requirement
|
@@ -23,16 +23,14 @@ dependencies:
|
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
hash:
|
26
|
+
hash: 3
|
27
27
|
segments:
|
28
|
-
-
|
29
|
-
|
30
|
-
- 4
|
31
|
-
version: 1.4.4
|
28
|
+
- 0
|
29
|
+
version: "0"
|
32
30
|
version_requirements: *id001
|
33
|
-
name:
|
31
|
+
name: racc
|
34
32
|
prerelease: false
|
35
|
-
type: :
|
33
|
+
type: :development
|
36
34
|
- !ruby/object:Gem::Dependency
|
37
35
|
requirement: &id002 !ruby/object:Gem::Requirement
|
38
36
|
none: false
|
@@ -44,7 +42,7 @@ dependencies:
|
|
44
42
|
- 0
|
45
43
|
version: "0"
|
46
44
|
version_requirements: *id002
|
47
|
-
name:
|
45
|
+
name: shoulda
|
48
46
|
prerelease: false
|
49
47
|
type: :development
|
50
48
|
- !ruby/object:Gem::Dependency
|
@@ -53,12 +51,14 @@ dependencies:
|
|
53
51
|
requirements:
|
54
52
|
- - ">="
|
55
53
|
- !ruby/object:Gem::Version
|
56
|
-
hash:
|
54
|
+
hash: 23
|
57
55
|
segments:
|
56
|
+
- 1
|
58
57
|
- 0
|
59
|
-
|
58
|
+
- 0
|
59
|
+
version: 1.0.0
|
60
60
|
version_requirements: *id003
|
61
|
-
name:
|
61
|
+
name: bundler
|
62
62
|
prerelease: false
|
63
63
|
type: :development
|
64
64
|
- !ruby/object:Gem::Dependency
|
@@ -67,14 +67,14 @@ dependencies:
|
|
67
67
|
requirements:
|
68
68
|
- - ~>
|
69
69
|
- !ruby/object:Gem::Version
|
70
|
-
hash:
|
70
|
+
hash: 7
|
71
71
|
segments:
|
72
72
|
- 1
|
73
|
-
-
|
74
|
-
-
|
75
|
-
version: 1.
|
73
|
+
- 6
|
74
|
+
- 4
|
75
|
+
version: 1.6.4
|
76
76
|
version_requirements: *id004
|
77
|
-
name:
|
77
|
+
name: jeweler
|
78
78
|
prerelease: false
|
79
79
|
type: :development
|
80
80
|
- !ruby/object:Gem::Dependency
|
@@ -83,14 +83,14 @@ dependencies:
|
|
83
83
|
requirements:
|
84
84
|
- - ~>
|
85
85
|
- !ruby/object:Gem::Version
|
86
|
-
hash:
|
86
|
+
hash: 45
|
87
87
|
segments:
|
88
|
-
-
|
89
|
-
-
|
90
|
-
-
|
91
|
-
version:
|
88
|
+
- 0
|
89
|
+
- 9
|
90
|
+
- 11
|
91
|
+
version: 0.9.11
|
92
92
|
version_requirements: *id005
|
93
|
-
name:
|
93
|
+
name: rcov
|
94
94
|
prerelease: false
|
95
95
|
type: :development
|
96
96
|
- !ruby/object:Gem::Dependency
|
@@ -99,12 +99,14 @@ dependencies:
|
|
99
99
|
requirements:
|
100
100
|
- - ">="
|
101
101
|
- !ruby/object:Gem::Version
|
102
|
-
hash:
|
102
|
+
hash: 15
|
103
103
|
segments:
|
104
|
-
-
|
105
|
-
|
104
|
+
- 1
|
105
|
+
- 4
|
106
|
+
- 4
|
107
|
+
version: 1.4.4
|
106
108
|
version_requirements: *id006
|
107
|
-
name:
|
109
|
+
name: nokogiri
|
108
110
|
prerelease: false
|
109
111
|
type: :development
|
110
112
|
description: |
|
@@ -162,7 +164,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
162
164
|
requirements: []
|
163
165
|
|
164
166
|
rubyforge_project:
|
165
|
-
rubygems_version: 1.8.
|
167
|
+
rubygems_version: 1.8.15
|
166
168
|
signing_key:
|
167
169
|
specification_version: 3
|
168
170
|
summary: A Ruby library to help write robots.txt compliant web robots
|