webrobots 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +1 -0
- data/Gemfile.lock +2 -0
- data/README.rdoc +16 -0
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/lib/webrobots.rb +5 -0
- data/lib/webrobots/nokogiri.rb +32 -0
- data/test/test_webrobots.rb +31 -2
- data/webrobots.gemspec +9 -2
- metadata +43 -12
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -6,6 +6,7 @@ GEM
|
|
6
6
|
bundler (~> 1.0.0)
|
7
7
|
git (>= 1.2.5)
|
8
8
|
rake
|
9
|
+
nokogiri (1.4.4)
|
9
10
|
racc (1.4.6)
|
10
11
|
rake (0.8.7)
|
11
12
|
rcov (0.9.9)
|
@@ -17,6 +18,7 @@ PLATFORMS
|
|
17
18
|
DEPENDENCIES
|
18
19
|
bundler (~> 1.0.0)
|
19
20
|
jeweler (~> 1.5.1)
|
21
|
+
nokogiri (>= 1.4.4)
|
20
22
|
racc
|
21
23
|
rcov
|
22
24
|
shoulda
|
data/README.rdoc
CHANGED
@@ -2,6 +2,22 @@
|
|
2
2
|
|
3
3
|
This is a library to help write robots.txt compliant web robots.
|
4
4
|
|
5
|
+
== Usage
|
6
|
+
|
7
|
+
require 'webrobots'
|
8
|
+
require 'uri'
|
9
|
+
require 'net/http'
|
10
|
+
|
11
|
+
robots = WebRobots.new('MyBot/1.0')
|
12
|
+
|
13
|
+
uri = URI('http://digg.com/news/24hr')
|
14
|
+
if robots.disallowed?(uri)
|
15
|
+
STDERR.puts "Access disallowed: #{uri}"
|
16
|
+
exit 1
|
17
|
+
end
|
18
|
+
body = Net::HTTP.get(uri)
|
19
|
+
# ...
|
20
|
+
|
5
21
|
== Contributing to webrobots
|
6
22
|
|
7
23
|
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
|
data/Rakefile
CHANGED
@@ -24,7 +24,7 @@ This library helps write robots.txt compliant web robots in Ruby.
|
|
24
24
|
# Include your dependencies below. Runtime dependencies are required when using your gem,
|
25
25
|
# and development dependencies are only needed for development (ie running rake tasks, tests, etc)
|
26
26
|
# gem.add_runtime_dependency 'jabber4r', '> 0.1'
|
27
|
-
|
27
|
+
gem.add_development_dependency 'racc'
|
28
28
|
end
|
29
29
|
Jeweler::RubygemsDotOrgTasks.new
|
30
30
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.2
|
1
|
+
0.0.3
|
data/lib/webrobots.rb
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
require 'webrobots/robotstxt'
|
2
2
|
require 'uri'
|
3
3
|
require 'net/https'
|
4
|
+
if defined?(Nokogiri)
|
5
|
+
require 'webrobots/nokogiri'
|
6
|
+
else
|
7
|
+
autoload :Nokogiri, 'webrobots/nokogiri'
|
8
|
+
end
|
4
9
|
|
5
10
|
class WebRobots
|
6
11
|
# Creates a WebRobots object for a robot named +user_agent+, with
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
class Nokogiri::HTML::Document
|
4
|
+
# Returns an array of lower-cased <meta name="ROBOTS"> tokens. If
|
5
|
+
# no tag is found, returns an empty array. An optional
|
6
|
+
# +custom_name+ specifies the name of a meta tag to look for ahead
|
7
|
+
# of "ROBOTS". Names are compared in a case-insensitive manner.
|
8
|
+
def meta_robots(custom_name = nil)
|
9
|
+
(@meta_robots ||= {})[custom_name] =
|
10
|
+
(custom_name && parse_meta_robots(custom_name)) || parse_meta_robots('robots')
|
11
|
+
end
|
12
|
+
|
13
|
+
# Equivalent to meta_robots(custom_name).include?('noindex').
|
14
|
+
def noindex?(custom_name = nil)
|
15
|
+
meta_robots(custom_name).include?('noindex')
|
16
|
+
end
|
17
|
+
|
18
|
+
# Equivalent to meta_robots(custom_name).include?('nofollow').
|
19
|
+
def nofollow?(custom_name = nil)
|
20
|
+
meta_robots(custom_name).include?('nofollow')
|
21
|
+
end
|
22
|
+
|
23
|
+
private
|
24
|
+
|
25
|
+
def parse_meta_robots(custom_name)
|
26
|
+
pattern = /\A#{Regexp.quote(custom_name)}\z/i
|
27
|
+
meta = css('meta[@name]').find { |meta|
|
28
|
+
meta['name'].match(pattern)
|
29
|
+
} and content = meta['content'] or return []
|
30
|
+
content.downcase.split(/[,\s]+/)
|
31
|
+
end
|
32
|
+
end
|
data/test/test_webrobots.rb
CHANGED
@@ -283,9 +283,38 @@ Disallow: /
|
|
283
283
|
assert !@testbot.allowed?("http://store.apple.com/vieworder")
|
284
284
|
assert @msnbot.allowed?("http://store.apple.com/vieworder")
|
285
285
|
}
|
286
|
-
|
286
|
+
assert_nothing_raised {
|
287
287
|
assert !@testbot.allowed?("http://github.com/login")
|
288
|
-
|
288
|
+
}
|
289
|
+
end
|
290
|
+
end
|
291
|
+
|
292
|
+
context "meta robots tag" do
|
293
|
+
setup do
|
294
|
+
@doc = Nokogiri::HTML(<<-HTML)
|
295
|
+
<html>
|
296
|
+
<head>
|
297
|
+
<meta name="ROBOTS" content="NOFOLLOW">
|
298
|
+
<meta name="Slurp" content="noindex,nofollow">
|
299
|
+
<meta name="googlebot" content="noarchive, noindex">
|
300
|
+
</head>
|
301
|
+
<body>
|
302
|
+
test
|
303
|
+
</body>
|
304
|
+
</html>
|
305
|
+
HTML
|
306
|
+
end
|
307
|
+
|
308
|
+
should "be properly parsed when given in HTML string" do
|
309
|
+
assert !@doc.noindex?
|
310
|
+
assert @doc.nofollow?
|
311
|
+
|
312
|
+
assert @doc.noindex?('slurp')
|
313
|
+
assert @doc.nofollow?('slurp')
|
314
|
+
|
315
|
+
assert @doc.noindex?('googlebot')
|
316
|
+
assert !@doc.nofollow?('googlebot')
|
317
|
+
assert @doc.meta_robots('googlebot').include?('noarchive')
|
289
318
|
end
|
290
319
|
end
|
291
320
|
end
|
data/webrobots.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{webrobots}
|
8
|
-
s.version = "0.0.2"
|
8
|
+
s.version = "0.0.3"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Akinori MUSHA"]
|
12
|
-
s.date = %q{2011-01-
|
12
|
+
s.date = %q{2011-01-05}
|
13
13
|
s.description = %q{This library helps write robots.txt compliant web robots in Ruby.
|
14
14
|
}
|
15
15
|
s.email = %q{knu@idaemons.org}
|
@@ -26,6 +26,7 @@ Gem::Specification.new do |s|
|
|
26
26
|
"Rakefile",
|
27
27
|
"VERSION",
|
28
28
|
"lib/webrobots.rb",
|
29
|
+
"lib/webrobots/nokogiri.rb",
|
29
30
|
"lib/webrobots/robotstxt.rb",
|
30
31
|
"lib/webrobots/robotstxt.ry",
|
31
32
|
"test/helper.rb",
|
@@ -46,23 +47,29 @@ Gem::Specification.new do |s|
|
|
46
47
|
|
47
48
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
48
49
|
s.add_runtime_dependency(%q<racc>, [">= 0"])
|
50
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.4"])
|
49
51
|
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
50
52
|
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
51
53
|
s.add_development_dependency(%q<jeweler>, ["~> 1.5.1"])
|
52
54
|
s.add_development_dependency(%q<rcov>, [">= 0"])
|
55
|
+
s.add_development_dependency(%q<racc>, [">= 0"])
|
53
56
|
else
|
54
57
|
s.add_dependency(%q<racc>, [">= 0"])
|
58
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
|
55
59
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
56
60
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
57
61
|
s.add_dependency(%q<jeweler>, ["~> 1.5.1"])
|
58
62
|
s.add_dependency(%q<rcov>, [">= 0"])
|
63
|
+
s.add_dependency(%q<racc>, [">= 0"])
|
59
64
|
end
|
60
65
|
else
|
61
66
|
s.add_dependency(%q<racc>, [">= 0"])
|
67
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.4"])
|
62
68
|
s.add_dependency(%q<shoulda>, [">= 0"])
|
63
69
|
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
64
70
|
s.add_dependency(%q<jeweler>, ["~> 1.5.1"])
|
65
71
|
s.add_dependency(%q<rcov>, [">= 0"])
|
72
|
+
s.add_dependency(%q<racc>, [">= 0"])
|
66
73
|
end
|
67
74
|
end
|
68
75
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webrobots
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
- 2
|
10
|
-
version: 0.0.2
|
9
|
+
- 3
|
10
|
+
version: 0.0.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Akinori MUSHA
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-01-
|
18
|
+
date: 2011-01-05 00:00:00 +09:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -33,8 +33,24 @@ dependencies:
|
|
33
33
|
prerelease: false
|
34
34
|
name: racc
|
35
35
|
- !ruby/object:Gem::Dependency
|
36
|
-
type: :development
|
36
|
+
type: :runtime
|
37
37
|
version_requirements: &id002 !ruby/object:Gem::Requirement
|
38
|
+
none: false
|
39
|
+
requirements:
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
hash: 15
|
43
|
+
segments:
|
44
|
+
- 1
|
45
|
+
- 4
|
46
|
+
- 4
|
47
|
+
version: 1.4.4
|
48
|
+
requirement: *id002
|
49
|
+
prerelease: false
|
50
|
+
name: nokogiri
|
51
|
+
- !ruby/object:Gem::Dependency
|
52
|
+
type: :development
|
53
|
+
version_requirements: &id003 !ruby/object:Gem::Requirement
|
38
54
|
none: false
|
39
55
|
requirements:
|
40
56
|
- - ">="
|
@@ -43,12 +59,12 @@ dependencies:
|
|
43
59
|
segments:
|
44
60
|
- 0
|
45
61
|
version: "0"
|
46
|
-
requirement: *id002
|
62
|
+
requirement: *id003
|
47
63
|
prerelease: false
|
48
64
|
name: shoulda
|
49
65
|
- !ruby/object:Gem::Dependency
|
50
66
|
type: :development
|
51
|
-
version_requirements: &id003 !ruby/object:Gem::Requirement
|
67
|
+
version_requirements: &id004 !ruby/object:Gem::Requirement
|
52
68
|
none: false
|
53
69
|
requirements:
|
54
70
|
- - ~>
|
@@ -59,12 +75,12 @@ dependencies:
|
|
59
75
|
- 0
|
60
76
|
- 0
|
61
77
|
version: 1.0.0
|
62
|
-
requirement: *id003
|
78
|
+
requirement: *id004
|
63
79
|
prerelease: false
|
64
80
|
name: bundler
|
65
81
|
- !ruby/object:Gem::Dependency
|
66
82
|
type: :development
|
67
|
-
version_requirements: &id004 !ruby/object:Gem::Requirement
|
83
|
+
version_requirements: &id005 !ruby/object:Gem::Requirement
|
68
84
|
none: false
|
69
85
|
requirements:
|
70
86
|
- - ~>
|
@@ -75,12 +91,12 @@ dependencies:
|
|
75
91
|
- 5
|
76
92
|
- 1
|
77
93
|
version: 1.5.1
|
78
|
-
requirement: *id004
|
94
|
+
requirement: *id005
|
79
95
|
prerelease: false
|
80
96
|
name: jeweler
|
81
97
|
- !ruby/object:Gem::Dependency
|
82
98
|
type: :development
|
83
|
-
version_requirements: &id005 !ruby/object:Gem::Requirement
|
99
|
+
version_requirements: &id006 !ruby/object:Gem::Requirement
|
84
100
|
none: false
|
85
101
|
requirements:
|
86
102
|
- - ">="
|
@@ -89,9 +105,23 @@ dependencies:
|
|
89
105
|
segments:
|
90
106
|
- 0
|
91
107
|
version: "0"
|
92
|
-
requirement: *id005
|
108
|
+
requirement: *id006
|
93
109
|
prerelease: false
|
94
110
|
name: rcov
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
type: :development
|
113
|
+
version_requirements: &id007 !ruby/object:Gem::Requirement
|
114
|
+
none: false
|
115
|
+
requirements:
|
116
|
+
- - ">="
|
117
|
+
- !ruby/object:Gem::Version
|
118
|
+
hash: 3
|
119
|
+
segments:
|
120
|
+
- 0
|
121
|
+
version: "0"
|
122
|
+
requirement: *id007
|
123
|
+
prerelease: false
|
124
|
+
name: racc
|
95
125
|
description: |
|
96
126
|
This library helps write robots.txt compliant web robots in Ruby.
|
97
127
|
|
@@ -112,6 +142,7 @@ files:
|
|
112
142
|
- Rakefile
|
113
143
|
- VERSION
|
114
144
|
- lib/webrobots.rb
|
145
|
+
- lib/webrobots/nokogiri.rb
|
115
146
|
- lib/webrobots/robotstxt.rb
|
116
147
|
- lib/webrobots/robotstxt.ry
|
117
148
|
- test/helper.rb
|