webrobots 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +1 -0
- data/Gemfile.lock +2 -0
- data/README.rdoc +16 -0
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/lib/webrobots.rb +5 -0
- data/lib/webrobots/nokogiri.rb +32 -0
- data/test/test_webrobots.rb +31 -2
- data/webrobots.gemspec +9 -2
- metadata +43 -12
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -6,6 +6,7 @@ GEM
|
|
6
6
|
bundler (~> 1.0.0)
|
7
7
|
git (>= 1.2.5)
|
8
8
|
rake
|
9
|
+
nokogiri (1.4.4)
|
9
10
|
racc (1.4.6)
|
10
11
|
rake (0.8.7)
|
11
12
|
rcov (0.9.9)
|
@@ -17,6 +18,7 @@ PLATFORMS
|
|
17
18
|
DEPENDENCIES
|
18
19
|
bundler (~> 1.0.0)
|
19
20
|
jeweler (~> 1.5.1)
|
21
|
+
nokogiri (>= 1.4.4)
|
20
22
|
racc
|
21
23
|
rcov
|
22
24
|
shoulda
|
data/README.rdoc
CHANGED
@@ -2,6 +2,22 @@
|
|
2
2
|
|
3
3
|
This is a library to help write robots.txt compliant web robots.
|
4
4
|
|
5
|
+
== Usage
|
6
|
+
|
7
|
+
require 'webrobots'
|
8
|
+
require 'uri'
|
9
|
+
require 'net/http'
|
10
|
+
|
11
|
+
robots = WebRobots.new('MyBot/1.0')
|
12
|
+
|
13
|
+
uri = URI('http://digg.com/news/24hr')
|
14
|
+
if robots.disallowed?(uri)
|
15
|
+
STDERR.puts "Access disallowed: #{uri}"
|
16
|
+
exit 1
|
17
|
+
end
|
18
|
+
body = Net::HTTP.get(uri)
|
19
|
+
# ...
|
20
|
+
|
5
21
|
== Contributing to webrobots
|
6
22
|
|
7
23
|
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
|
data/Rakefile
CHANGED
@@ -24,7 +24,7 @@ This library helps write robots.txt compliant web robots in Ruby.
|
|
24
24
|
# Include your dependencies below. Runtime dependencies are required when using your gem,
|
25
25
|
# and development dependencies are only needed for development (ie running rake tasks, tests, etc)
|
26
26
|
# gem.add_runtime_dependency 'jabber4r', '> 0.1'
|
27
|
-
|
27
|
+
gem.add_development_dependency 'racc'
|
28
28
|
end
|
29
29
|
Jeweler::RubygemsDotOrgTasks.new
|
30
30
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.2
|
1
|
+
0.0.3
|
data/lib/webrobots.rb
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
require 'webrobots/robotstxt'
|
2
2
|
require 'uri'
|
3
3
|
require 'net/https'
|
4
|
+
if defined?(Nokogiri)
|
5
|
+
require 'webrobots/nokogiri'
|
6
|
+
else
|
7
|
+
autoload :Nokogiri, 'webrobots/nokogiri'
|
8
|
+
end
|
4
9
|
|
5
10
|
class WebRobots
|
6
11
|
# Creates a WebRobots object for a robot named +user_agent+, with
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
class Nokogiri::HTML::Document
|
4
|
+
# Returns an array of lower-cased <meta name="ROBOTS"> tokens. If
|
5
|
+
# no tag is found, returns an empty array. An optional
|
6
|
+
# +custom_name+ specifies the name of a meta tag to look for ahead
|
7
|
+
# of "ROBOTS". Names are compared in a case-insensitive manner.
|
8
|
+
def meta_robots(custom_name = nil)
|
9
|
+
(@meta_robots ||= {})[custom_name] =
|
10
|
+
(custom_name && parse_meta_robots(custom_name)) || parse_meta_robots('robots')
|
11
|
+
end
|
12
|
+
|
13
|
+
# Equivalent to meta_robots(custom_name).include?('noindex').
|
14
|
+
def noindex?(custom_name = nil)
|
15
|
+
meta_robots(custom_name).include?('noindex')
|
16
|
+
end
|
17
|
+
|
18
|
+
# Equivalent to meta_robots(custom_name).include?('nofollow').
|
19
|
+
def nofollow?(custom_name = nil)
|
20
|
+
meta_robots(custom_name).include?('nofollow')
|
21
|
+
end
|
22
|
+
|
23
|
+
private
|
24
|
+
|
25
|
+
def parse_meta_robots(custom_name)
|
26
|
+
pattern = /\A#{Regexp.quote(custom_name)}\z/i
|
27
|
+
meta = css('meta[@name]').find { |meta|
|
28
|
+
meta['name'].match(pattern)
|
29
|
+
} and content = meta['content'] or return []
|
30
|
+
content.downcase.split(/[,\s]+/)
|
31
|
+
end
|
32
|
+
end
|
data/test/test_webrobots.rb
CHANGED
@@ -283,9 +283,38 @@ Disallow: /
|
|
283
283
|
assert !@testbot.allowed?("http://store.apple.com/vieworder")
|
284
284
|
assert @msnbot.allowed?("http://store.apple.com/vieworder")
|
285
285
|
}
|
286
|
-
|
286
|
+
assert_nothing_raised {
|
287
287
|
assert !@testbot.allowed?("http://github.com/login")
|
288
|
-
|
288
|
+
}
|
289
|
+
end
|
290
|
+
end
|
291
|
+
|
292
|
+
context "meta robots tag" do
|
293
|
+
setup do
|
294
|
+
@doc = Nokogiri::HTML(<<-HTML)
|
295
|
+
<html>
|
296
|
+
<head>
|
297
|
+
<meta name="ROBOTS" content="NOFOLLOW">
|
298
|
+
<meta name="Slurp" content="noindex,nofollow">
|
299
|
+
<meta name="googlebot" content="noarchive, noindex">
|
300
|
+
</head>
|
301
|
+
<body>
|
302
|
+
test
|
303
|
+
</body>
|
304
|
+
</html>
|
305
|
+
HTML
|
306
|
+
end
|
307
|
+
|
308
|
+
should "be properly parsed when given in HTML string" do
|
309
|
+
assert !@doc.noindex?
|
310
|
+
assert @doc.nofollow?
|
311
|
+
|
312
|
+
assert @doc.noindex?('slurp')
|
313
|
+
assert @doc.nofollow?('slurp')
|
314
|
+
|
315
|
+
assert @doc.noindex?('googlebot')
|
316
|
+
assert !@doc.nofollow?('googlebot')
|
317
|
+
assert @doc.meta_robots('googlebot').include?('noarchive')
|
289
318
|
end
|
290
319
|
end
|
291
320
|
end
|
data/webrobots.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{webrobots}
|
8
|
-
s.version = "0.0.2"
|
8
|
+
s.version = "0.0.3"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Akinori MUSHA"]
|
12
|
-
s.date = %q{2011-01-
|
12
|
+
s.date = %q{2011-01-05}
|
13
13
|
s.description = %q{This library helps write robots.txt compliant web robots in Ruby.
|
14
14
|
}
|
15
15
|
s.email = %q{knu@idaemons.org}
|
@@ -26,6 +26,7 @@ Gem::Specification.new do |s|
|
|
26
26
|
"Rakefile",
|
27
27
|
"VERSION",
|
28
28
|
"lib/webrobots.rb",
|
29
|
+
"lib/webrobots/nokogiri.rb",
|
29
30
|
"lib/webrobots/robotstxt.rb",
|
30
31
|
"lib/webrobots/robotstxt.ry",
|
31
32
|
"test/helper.rb",
|
@@ -46,23 +47,29 @@ Gem::Specification.new do |s|
|
|
46
47
|
|
47
48
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
48
49
|
s.add_runtime_dependency(%q<racc>, [">= 0"])
|
50
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.4"])
|
49
51
|
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
50
52
|
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
51
53
|
s.add_development_dependency(%q<jeweler>, ["~> 1.5.1"])
|
52
54
|
s.add_development_dependency(%q<rcov>, [">= 0"])
|
55
|
+
s.add_development_dependency(%q<racc>, [">= 0"])
|
53
56
|
else
|
54
57
|
s.add_dependency(%q<racc>, [">= 0"])
|
58
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
|
55
59
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
56
60
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
57
61
|
s.add_dependency(%q<jeweler>, ["~> 1.5.1"])
|
58
62
|
s.add_dependency(%q<rcov>, [">= 0"])
|
63
|
+
s.add_dependency(%q<racc>, [">= 0"])
|
59
64
|
end
|
60
65
|
else
|
61
66
|
s.add_dependency(%q<racc>, [">= 0"])
|
67
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
|
62
68
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
63
69
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
64
70
|
s.add_dependency(%q<jeweler>, ["~> 1.5.1"])
|
65
71
|
s.add_dependency(%q<rcov>, [">= 0"])
|
72
|
+
s.add_dependency(%q<racc>, [">= 0"])
|
66
73
|
end
|
67
74
|
end
|
68
75
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webrobots
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
- 2
|
10
|
-
version: 0.0.2
|
9
|
+
- 3
|
10
|
+
version: 0.0.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Akinori MUSHA
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-01-
|
18
|
+
date: 2011-01-05 00:00:00 +09:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -33,8 +33,24 @@ dependencies:
|
|
33
33
|
prerelease: false
|
34
34
|
name: racc
|
35
35
|
- !ruby/object:Gem::Dependency
|
36
|
-
type: :development
|
36
|
+
type: :runtime
|
37
37
|
version_requirements: &id002 !ruby/object:Gem::Requirement
|
38
|
+
none: false
|
39
|
+
requirements:
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
hash: 15
|
43
|
+
segments:
|
44
|
+
- 1
|
45
|
+
- 4
|
46
|
+
- 4
|
47
|
+
version: 1.4.4
|
48
|
+
requirement: *id002
|
49
|
+
prerelease: false
|
50
|
+
name: nokogiri
|
51
|
+
- !ruby/object:Gem::Dependency
|
52
|
+
type: :development
|
53
|
+
version_requirements: &id003 !ruby/object:Gem::Requirement
|
38
54
|
none: false
|
39
55
|
requirements:
|
40
56
|
- - ">="
|
@@ -43,12 +59,12 @@ dependencies:
|
|
43
59
|
segments:
|
44
60
|
- 0
|
45
61
|
version: "0"
|
46
|
-
requirement: *id002
|
62
|
+
requirement: *id003
|
47
63
|
prerelease: false
|
48
64
|
name: shoulda
|
49
65
|
- !ruby/object:Gem::Dependency
|
50
66
|
type: :development
|
51
|
-
version_requirements: &id003 !ruby/object:Gem::Requirement
|
67
|
+
version_requirements: &id004 !ruby/object:Gem::Requirement
|
52
68
|
none: false
|
53
69
|
requirements:
|
54
70
|
- - ~>
|
@@ -59,12 +75,12 @@ dependencies:
|
|
59
75
|
- 0
|
60
76
|
- 0
|
61
77
|
version: 1.0.0
|
62
|
-
requirement: *id003
|
78
|
+
requirement: *id004
|
63
79
|
prerelease: false
|
64
80
|
name: bundler
|
65
81
|
- !ruby/object:Gem::Dependency
|
66
82
|
type: :development
|
67
|
-
version_requirements: &id004 !ruby/object:Gem::Requirement
|
83
|
+
version_requirements: &id005 !ruby/object:Gem::Requirement
|
68
84
|
none: false
|
69
85
|
requirements:
|
70
86
|
- - ~>
|
@@ -75,12 +91,12 @@ dependencies:
|
|
75
91
|
- 5
|
76
92
|
- 1
|
77
93
|
version: 1.5.1
|
78
|
-
requirement: *id004
|
94
|
+
requirement: *id005
|
79
95
|
prerelease: false
|
80
96
|
name: jeweler
|
81
97
|
- !ruby/object:Gem::Dependency
|
82
98
|
type: :development
|
83
|
-
version_requirements: &id005 !ruby/object:Gem::Requirement
|
99
|
+
version_requirements: &id006 !ruby/object:Gem::Requirement
|
84
100
|
none: false
|
85
101
|
requirements:
|
86
102
|
- - ">="
|
@@ -89,9 +105,23 @@ dependencies:
|
|
89
105
|
segments:
|
90
106
|
- 0
|
91
107
|
version: "0"
|
92
|
-
requirement: *id005
|
108
|
+
requirement: *id006
|
93
109
|
prerelease: false
|
94
110
|
name: rcov
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
type: :development
|
113
|
+
version_requirements: &id007 !ruby/object:Gem::Requirement
|
114
|
+
none: false
|
115
|
+
requirements:
|
116
|
+
- - ">="
|
117
|
+
- !ruby/object:Gem::Version
|
118
|
+
hash: 3
|
119
|
+
segments:
|
120
|
+
- 0
|
121
|
+
version: "0"
|
122
|
+
requirement: *id007
|
123
|
+
prerelease: false
|
124
|
+
name: racc
|
95
125
|
description: |
|
96
126
|
This library helps write robots.txt compliant web robots in Ruby.
|
97
127
|
|
@@ -112,6 +142,7 @@ files:
|
|
112
142
|
- Rakefile
|
113
143
|
- VERSION
|
114
144
|
- lib/webrobots.rb
|
145
|
+
- lib/webrobots/nokogiri.rb
|
115
146
|
- lib/webrobots/robotstxt.rb
|
116
147
|
- lib/webrobots/robotstxt.ry
|
117
148
|
- test/helper.rb
|