webrobots 0.0.12 → 0.0.13
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +5 -3
- data/Gemfile.lock +5 -5
- data/LICENSE.txt +1 -1
- data/README.rdoc +1 -3
- data/VERSION +1 -1
- data/lib/webrobots.rb +6 -2
- data/lib/webrobots/robotstxt.rb +15 -4
- data/lib/webrobots/robotstxt.ry +13 -2
- data/test/test_webrobots.rb +25 -1
- data/webrobots.gemspec +12 -12
- metadata +33 -31
data/Gemfile
CHANGED
@@ -2,14 +2,16 @@ source "http://rubygems.org"
|
|
2
2
|
# Add dependencies required to use your gem here.
|
3
3
|
# Example:
|
4
4
|
# gem "activesupport", ">= 2.3.5"
|
5
|
-
gem "nokogiri", ">= 1.4.4"
|
6
5
|
|
7
6
|
# Add dependencies to develop your gem here.
|
8
7
|
# Include everything needed to run rake, tests, features, etc.
|
9
8
|
group :development do
|
10
9
|
gem "racc", ">= 0"
|
11
10
|
gem "shoulda", ">= 0"
|
12
|
-
gem "bundler", "
|
11
|
+
gem "bundler", ">= 1.0.0"
|
13
12
|
gem "jeweler", "~> 1.6.4"
|
14
|
-
gem "rcov", "
|
13
|
+
gem "rcov", "~> 0.9.11"
|
14
|
+
|
15
|
+
# To test the webrobots/nokogiri module.
|
16
|
+
gem "nokogiri", ">= 1.4.4"
|
15
17
|
end
|
data/Gemfile.lock
CHANGED
@@ -7,18 +7,18 @@ GEM
|
|
7
7
|
git (>= 1.2.5)
|
8
8
|
rake
|
9
9
|
nokogiri (1.5.0)
|
10
|
-
racc (1.4.
|
11
|
-
rake (0.9.2)
|
12
|
-
rcov (0.9.
|
10
|
+
racc (1.4.7)
|
11
|
+
rake (0.9.2.2)
|
12
|
+
rcov (0.9.11)
|
13
13
|
shoulda (2.11.3)
|
14
14
|
|
15
15
|
PLATFORMS
|
16
16
|
ruby
|
17
17
|
|
18
18
|
DEPENDENCIES
|
19
|
-
bundler (
|
19
|
+
bundler (>= 1.0.0)
|
20
20
|
jeweler (~> 1.6.4)
|
21
21
|
nokogiri (>= 1.4.4)
|
22
22
|
racc
|
23
|
-
rcov
|
23
|
+
rcov (~> 0.9.11)
|
24
24
|
shoulda
|
data/LICENSE.txt
CHANGED
data/README.rdoc
CHANGED
@@ -21,7 +21,6 @@ This is a library to help write robots.txt compliant web robots.
|
|
21
21
|
== Requirements
|
22
22
|
|
23
23
|
- Ruby 1.8.7 or 1.9.2+
|
24
|
-
- Nokogiri 1.4.4+
|
25
24
|
|
26
25
|
== Contributing to webrobots
|
27
26
|
|
@@ -35,6 +34,5 @@ This is a library to help write robots.txt compliant web robots.
|
|
35
34
|
|
36
35
|
== Copyright
|
37
36
|
|
38
|
-
Copyright (c) 2010, 2011 Akinori MUSHA. See LICENSE.txt for
|
37
|
+
Copyright (c) 2010, 2011, 2012 Akinori MUSHA. See LICENSE.txt for
|
39
38
|
further details.
|
40
|
-
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.12
|
1
|
+
0.0.13
|
data/lib/webrobots.rb
CHANGED
@@ -145,8 +145,12 @@ class WebRobots
|
|
145
145
|
referer = nil
|
146
146
|
10.times {
|
147
147
|
http = Net::HTTP.new(uri.host, uri.port)
|
148
|
-
http.use_ssl = uri.is_a?(URI::HTTPS)
|
149
|
-
|
148
|
+
if http.use_ssl = uri.is_a?(URI::HTTPS)
|
149
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_PEER
|
150
|
+
http.cert_store = OpenSSL::X509::Store.new.tap { |store|
|
151
|
+
store.set_default_paths
|
152
|
+
}
|
153
|
+
end
|
150
154
|
header = { 'User-Agent' => @user_agent }
|
151
155
|
header['Referer'] = referer if referer
|
152
156
|
# header is destroyed by this in ruby 1.9.2!
|
data/lib/webrobots/robotstxt.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
#
|
2
2
|
# DO NOT MODIFY!!!!
|
3
|
-
# This file is automatically generated by Racc 1.4.
|
3
|
+
# This file is automatically generated by Racc 1.4.7
|
4
4
|
# from Racc grammer file "".
|
5
5
|
#
|
6
6
|
|
@@ -14,12 +14,23 @@ class WebRobots
|
|
14
14
|
end
|
15
15
|
|
16
16
|
class ParseError < Error
|
17
|
+
# The site's root URI
|
18
|
+
attr_reader :site
|
19
|
+
|
20
|
+
def initialize(message, site)
|
21
|
+
@message = message
|
22
|
+
@site = site
|
23
|
+
end
|
24
|
+
|
25
|
+
def to_s
|
26
|
+
@message
|
27
|
+
end
|
17
28
|
end
|
18
29
|
|
19
30
|
class RobotsTxt
|
20
31
|
class Parser < Racc::Parser
|
21
32
|
|
22
|
-
module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 158)
|
33
|
+
module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 169)
|
23
34
|
|
24
35
|
def initialize(target = nil)
|
25
36
|
super()
|
@@ -90,7 +101,7 @@ module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 158)
|
|
90
101
|
|
91
102
|
do_parse
|
92
103
|
rescue Racc::ParseError => e
|
93
|
-
raise ParseError
|
104
|
+
raise ParseError.new(e.message, @site)
|
94
105
|
ensure
|
95
106
|
@q.clear
|
96
107
|
end
|
@@ -606,7 +617,7 @@ Disallow: /
|
|
606
617
|
else
|
607
618
|
@options[ruleline.token.downcase] = ruleline.value
|
608
619
|
end
|
609
|
-
}
|
620
|
+
} if rulelines
|
610
621
|
@acls.replace @acls.sort_by { |x|
|
611
622
|
[-x.value.length, x.is_a?(AllowLine) ? -1 : 0]
|
612
623
|
}
|
data/lib/webrobots/robotstxt.ry
CHANGED
@@ -151,6 +151,17 @@ class WebRobots
|
|
151
151
|
end
|
152
152
|
|
153
153
|
class ParseError < Error
|
154
|
+
# The site's root URI
|
155
|
+
attr_reader :site
|
156
|
+
|
157
|
+
def initialize(message, site)
|
158
|
+
@message = message
|
159
|
+
@site = site
|
160
|
+
end
|
161
|
+
|
162
|
+
def to_s
|
163
|
+
@message
|
164
|
+
end
|
154
165
|
end
|
155
166
|
|
156
167
|
class RobotsTxt
|
@@ -225,7 +236,7 @@ class WebRobots
|
|
225
236
|
|
226
237
|
do_parse
|
227
238
|
rescue Racc::ParseError => e
|
228
|
-
raise ParseError
|
239
|
+
raise ParseError.new(e.message, @site)
|
229
240
|
ensure
|
230
241
|
@q.clear
|
231
242
|
end
|
@@ -341,7 +352,7 @@ Disallow: /
|
|
341
352
|
else
|
342
353
|
@options[ruleline.token.downcase] = ruleline.value
|
343
354
|
end
|
344
|
-
}
|
355
|
+
} if rulelines
|
345
356
|
@acls.replace @acls.sort_by { |x|
|
346
357
|
[-x.value.length, x.is_a?(AllowLine) ? -1 : 0]
|
347
358
|
}
|
data/test/test_webrobots.rb
CHANGED
@@ -327,7 +327,9 @@ Allow: /2heavy/*.html
|
|
327
327
|
robots.reset(url)
|
328
328
|
|
329
329
|
assert robots.allowed?(url)
|
330
|
-
|
330
|
+
error = robots.error(url)
|
331
|
+
assert_instance_of WebRobots::ParseError, error
|
332
|
+
assert_equal URI('http://www.example.org/'), error.site
|
331
333
|
assert_raise(WebRobots::ParseError) {
|
332
334
|
robots.error!(url)
|
333
335
|
}
|
@@ -472,6 +474,7 @@ Disallow: /
|
|
472
474
|
should "be parsed for major sites" do
|
473
475
|
assert_nothing_raised {
|
474
476
|
assert !@testbot.allowed?("http://www.google.com/search")
|
477
|
+
assert !@testbot.allowed?("https://www.google.com/search")
|
475
478
|
assert !@testbot.allowed?("http://www.google.com/news/section?pz=1&cf=all&ned=jp&topic=y&ict=ln")
|
476
479
|
assert @testbot.allowed?("http://www.google.com/news/directory?pz=1&cf=all&ned=us&hl=en&sort=users&category=6")
|
477
480
|
}
|
@@ -655,4 +658,25 @@ TXT
|
|
655
658
|
assert !@fetched
|
656
659
|
end
|
657
660
|
end
|
661
|
+
|
662
|
+
context "robots.txt with just user-agent & sitemap and no blank line between them" do
|
663
|
+
setup do
|
664
|
+
@robots = WebRobots.new('RandomBot', :http_get => lambda { |uri|
|
665
|
+
res = case uri.to_s
|
666
|
+
when 'http://site1.example.com/robots.txt'
|
667
|
+
<<-'TXT'
|
668
|
+
User-agent: *
|
669
|
+
Sitemap: http://site1.example.com/text/sitemap.xml
|
670
|
+
TXT
|
671
|
+
else
|
672
|
+
raise "#{uri} is not supposed to be fetched"
|
673
|
+
end
|
674
|
+
})
|
675
|
+
end
|
676
|
+
|
677
|
+
should "be properly parsed" do
|
678
|
+
assert @robots.allowed?("http://site1.example.com/foo")
|
679
|
+
assert_equal(["http://site1.example.com/text/sitemap.xml"], @robots.sitemaps("http://site1.example.com/"))
|
680
|
+
end
|
681
|
+
end
|
658
682
|
end
|
data/webrobots.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "webrobots"
|
8
|
-
s.version = "0.0.12"
|
8
|
+
s.version = "0.0.13"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Akinori MUSHA"]
|
12
|
-
s.date = "
|
12
|
+
s.date = "2012-01-24"
|
13
13
|
s.description = "This library helps write robots.txt compliant web robots in Ruby.\n"
|
14
14
|
s.email = "knu@idaemons.org"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -35,34 +35,34 @@ Gem::Specification.new do |s|
|
|
35
35
|
s.homepage = "https://github.com/knu/webrobots"
|
36
36
|
s.licenses = ["2-clause BSDL"]
|
37
37
|
s.require_paths = ["lib"]
|
38
|
-
s.rubygems_version = "1.8.
|
38
|
+
s.rubygems_version = "1.8.15"
|
39
39
|
s.summary = "A Ruby library to help write robots.txt compliant web robots"
|
40
40
|
|
41
41
|
if s.respond_to? :specification_version then
|
42
42
|
s.specification_version = 3
|
43
43
|
|
44
44
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
45
|
-
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.4"])
|
46
45
|
s.add_development_dependency(%q<racc>, [">= 0"])
|
47
46
|
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
48
|
-
s.add_development_dependency(%q<bundler>, ["
|
47
|
+
s.add_development_dependency(%q<bundler>, [">= 1.0.0"])
|
49
48
|
s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
|
50
|
-
s.add_development_dependency(%q<rcov>, ["
|
49
|
+
s.add_development_dependency(%q<rcov>, ["~> 0.9.11"])
|
50
|
+
s.add_development_dependency(%q<nokogiri>, [">= 1.4.4"])
|
51
51
|
else
|
52
|
-
s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
|
53
52
|
s.add_dependency(%q<racc>, [">= 0"])
|
54
53
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
55
|
-
s.add_dependency(%q<bundler>, ["
|
54
|
+
s.add_dependency(%q<bundler>, [">= 1.0.0"])
|
56
55
|
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
57
|
-
s.add_dependency(%q<rcov>, ["
|
56
|
+
s.add_dependency(%q<rcov>, ["~> 0.9.11"])
|
57
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
|
58
58
|
end
|
59
59
|
else
|
60
|
-
s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
|
61
60
|
s.add_dependency(%q<racc>, [">= 0"])
|
62
61
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
63
|
-
s.add_dependency(%q<bundler>, ["
|
62
|
+
s.add_dependency(%q<bundler>, [">= 1.0.0"])
|
64
63
|
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
65
|
-
s.add_dependency(%q<rcov>, ["
|
64
|
+
s.add_dependency(%q<rcov>, ["~> 0.9.11"])
|
65
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
|
66
66
|
end
|
67
67
|
end
|
68
68
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webrobots
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 5
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
- 12
|
10
|
-
version: 0.0.12
|
9
|
+
- 13
|
10
|
+
version: 0.0.13
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Akinori MUSHA
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2012-01-24 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
requirement: &id001 !ruby/object:Gem::Requirement
|
@@ -23,16 +23,14 @@ dependencies:
|
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
hash:
|
26
|
+
hash: 3
|
27
27
|
segments:
|
28
|
-
-
|
29
|
-
|
30
|
-
- 4
|
31
|
-
version: 1.4.4
|
28
|
+
- 0
|
29
|
+
version: "0"
|
32
30
|
version_requirements: *id001
|
33
|
-
name:
|
31
|
+
name: racc
|
34
32
|
prerelease: false
|
35
|
-
type: :
|
33
|
+
type: :development
|
36
34
|
- !ruby/object:Gem::Dependency
|
37
35
|
requirement: &id002 !ruby/object:Gem::Requirement
|
38
36
|
none: false
|
@@ -44,7 +42,7 @@ dependencies:
|
|
44
42
|
- 0
|
45
43
|
version: "0"
|
46
44
|
version_requirements: *id002
|
47
|
-
name:
|
45
|
+
name: shoulda
|
48
46
|
prerelease: false
|
49
47
|
type: :development
|
50
48
|
- !ruby/object:Gem::Dependency
|
@@ -53,12 +51,14 @@ dependencies:
|
|
53
51
|
requirements:
|
54
52
|
- - ">="
|
55
53
|
- !ruby/object:Gem::Version
|
56
|
-
hash:
|
54
|
+
hash: 23
|
57
55
|
segments:
|
56
|
+
- 1
|
58
57
|
- 0
|
59
|
-
|
58
|
+
- 0
|
59
|
+
version: 1.0.0
|
60
60
|
version_requirements: *id003
|
61
|
-
name:
|
61
|
+
name: bundler
|
62
62
|
prerelease: false
|
63
63
|
type: :development
|
64
64
|
- !ruby/object:Gem::Dependency
|
@@ -67,14 +67,14 @@ dependencies:
|
|
67
67
|
requirements:
|
68
68
|
- - ~>
|
69
69
|
- !ruby/object:Gem::Version
|
70
|
-
hash:
|
70
|
+
hash: 7
|
71
71
|
segments:
|
72
72
|
- 1
|
73
|
-
-
|
74
|
-
-
|
75
|
-
version: 1.
|
73
|
+
- 6
|
74
|
+
- 4
|
75
|
+
version: 1.6.4
|
76
76
|
version_requirements: *id004
|
77
|
-
name:
|
77
|
+
name: jeweler
|
78
78
|
prerelease: false
|
79
79
|
type: :development
|
80
80
|
- !ruby/object:Gem::Dependency
|
@@ -83,14 +83,14 @@ dependencies:
|
|
83
83
|
requirements:
|
84
84
|
- - ~>
|
85
85
|
- !ruby/object:Gem::Version
|
86
|
-
hash:
|
86
|
+
hash: 45
|
87
87
|
segments:
|
88
|
-
-
|
89
|
-
-
|
90
|
-
-
|
91
|
-
version:
|
88
|
+
- 0
|
89
|
+
- 9
|
90
|
+
- 11
|
91
|
+
version: 0.9.11
|
92
92
|
version_requirements: *id005
|
93
|
-
name:
|
93
|
+
name: rcov
|
94
94
|
prerelease: false
|
95
95
|
type: :development
|
96
96
|
- !ruby/object:Gem::Dependency
|
@@ -99,12 +99,14 @@ dependencies:
|
|
99
99
|
requirements:
|
100
100
|
- - ">="
|
101
101
|
- !ruby/object:Gem::Version
|
102
|
-
hash:
|
102
|
+
hash: 15
|
103
103
|
segments:
|
104
|
-
-
|
105
|
-
|
104
|
+
- 1
|
105
|
+
- 4
|
106
|
+
- 4
|
107
|
+
version: 1.4.4
|
106
108
|
version_requirements: *id006
|
107
|
-
name:
|
109
|
+
name: nokogiri
|
108
110
|
prerelease: false
|
109
111
|
type: :development
|
110
112
|
description: |
|
@@ -162,7 +164,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
162
164
|
requirements: []
|
163
165
|
|
164
166
|
rubyforge_project:
|
165
|
-
rubygems_version: 1.8.
|
167
|
+
rubygems_version: 1.8.15
|
166
168
|
signing_key:
|
167
169
|
specification_version: 3
|
168
170
|
summary: A Ruby library to help write robots.txt compliant web robots
|