robotstxt 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.rdoc +25 -0
- data/Manifest +8 -0
- data/Rakefile +57 -0
- data/lib/robotstxt.rb +9 -9
- data/lib/robotstxt/parser.rb +24 -31
- data/robotstxt.gemspec +38 -0
- metadata +45 -12
data/LICENSE.rdoc
ADDED
@@ -0,0 +1,25 @@
+= License
+
+(The MIT License)
+
+Copyright (c) 2009 Simone Rinzivillo <srinzivillo@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
data/Manifest
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,57 @@
+$:.unshift(File.dirname(__FILE__) + "/lib")
+
+require 'rubygems'
+require 'rake'
+require 'echoe'
+require 'robotstxt'
+
+
+# Common package properties
+PKG_NAME = 'robotstxt'
+PKG_VERSION = Robotstxt::VERSION
+RUBYFORGE_PROJECT = 'robotstxt'
+
+if ENV['SNAPSHOT'].to_i == 1
+  PKG_VERSION << "." << Time.now.utc.strftime("%Y%m%d%H%M%S")
+end
+
+
+Echoe.new(PKG_NAME, PKG_VERSION) do |p|
+  p.author = "Simone Rinzivillo"
+  p.email = "srinzivillo@gmail.com"
+  p.summary = "Robotstxt is an Ruby robots.txt file parser"
+  p.url = "http://www.simonerinzivillo.it"
+  p.project = RUBYFORGE_PROJECT
+  p.description = <<-EOD
+    Robotstxt Parser allows you to the check the accessibility of URLs and get other data. \
+    Full support for the robots.txt RFC, wildcards and Sitemap: rules.
+  EOD
+
+  p.need_zip = true
+
+  p.development_dependencies += ["rake ~>0.8",
+                                 "echoe ~>3.1"]
+
+  p.rcov_options = ["-Itest -x mocha,rcov,Rakefile"]
+end
+
+
+desc "Open an irb session preloaded with this library"
+task :console do
+  sh "irb -rubygems -I lib -r robotstxt.rb"
+end
+
+begin
+  require 'code_statistics'
+  desc "Show library's code statistics"
+  task :stats do
+    CodeStatistics.new(["Robotstxt", "lib"],
+                       ["Tests", "test"]).to_s
+  end
+rescue LoadError
+  puts "CodeStatistics (Rails) is not available"
+end
+
+Dir["tasks/**/*.rake"].each do |file|
+  load(file)
+end
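The SNAPSHOT branch in this Rakefile mutates PKG_VERSION in place before Echoe packages the gem. A minimal sketch of the effect (the timestamp shown is illustrative):

    # Packaging with SNAPSHOT=1 appends a UTC timestamp to the version,
    # producing a unique pre-release build number.
    version = '0.5.2'
    version << "." << Time.now.utc.strftime("%Y%m%d%H%M%S") if ENV['SNAPSHOT'].to_i == 1
    puts version   # e.g. "0.5.2.20091219120000" when SNAPSHOT=1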
data/lib/robotstxt.rb
CHANGED
@@ -24,31 +24,31 @@ module Robotstxt
   NAME = 'Robotstxt'
   GEM = 'robotstxt'
   AUTHORS = ['Simone Rinzivillo <srinzivillo@gmail.com>']
-  VERSION = '0.5.1'
+  VERSION = '0.5.2'
 
 
   # Check if the <tt>URL</tt> is allowed to be crawled from the current <tt>Robot_id</tt>.
   # Robots:Allowed? returns <tt>true</tt> if the robots.txt file does not block the access to the URL.
   #
-  #
+  #   Robotstxt.allowed?('http://www.simonerinzivillo.it/', 'rubytest')
   #
   def self.allowed?(url, robot_id)
 
-
-
-
+    u = URI.parse(url)
+    r = Robotstxt::Parser.new(robot_id)
+    r.allowed?(url) if r.get(u.scheme + '://' + u.host)
 
   end
 
   # Analyze the robots.txt file to return an <tt>Array</tt> containing the list of XML Sitemaps URLs.
   #
-  #
+  #   Robotstxt.sitemaps('http://www.simonerinzivillo.it/', 'rubytest')
   #
   def self.sitemaps(url, robot_id)
 
-
-
-
+    u = URI.parse(url)
+    r = Robotstxt::Parser.new(robot_id)
+    r.sitemaps if r.get(u.scheme + '://' + u.host)
 
   end
 
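The restored bodies make the two module-level helpers one-shot wrappers around Robotstxt::Parser: each parses the URL, fetches robots.txt for its host, and queries the result. A usage sketch based on the rdoc examples above (the host and robot id are illustrative):

    require 'robotstxt'

    # true if robots.txt does not block 'rubytest' from the URL
    Robotstxt.allowed?('http://www.simonerinzivillo.it/', 'rubytest')

    # Array of Sitemap: entries from the site's robots.txt
    Robotstxt.sitemaps('http://www.simonerinzivillo.it/', 'rubytest')

Because both bodies end in `... if r.get(...)`, each helper returns nil rather than raising when the robots.txt fetch fails.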
data/lib/robotstxt/parser.rb
CHANGED
@@ -27,6 +27,7 @@ module Robotstxt
   # <tt>client = Robotstxt::Robotstxtistance.new('my_robot_id')</tt>
   #
   def initialize(robot_id = nil)
+
     @robot_id = '*'
     @rules = []
     @sitemaps = []
@@ -37,14 +38,14 @@ module Robotstxt
 
   # Requires and parses the Robots.txt file for the <tt>hostname</tt>.
   #
-  #
-  #
-  # <tt>client.get('http:// www.simonerinzivillo.it')</tt>
+  #   client = Robotstxt::Robotstxtistance.new('my_robot_id')
+  #   client.get('http://www.simonerinzivillo.it')
   #
   #
   # This method returns <tt>true</tt> if the parsing is gone.
   #
   def get(hostname)
+
     @ehttp = true
     url = URI.parse(hostname)
 
@@ -59,12 +60,12 @@ module Robotstxt
 
       case response
       when Net::HTTPSuccess then
-
-
-
+        @found = true
+        @body = response.body
+        parse()
 
       else
-
+        @found = false
       end
 
       return @found
@@ -73,22 +74,20 @@ module Robotstxt
       if @ehttp
        @ettp = false
        retry
-
+      else
        return nil
      end
    end
+
  end
 
 
   # Check if the <tt>URL</tt> is allowed to be crawled from the current Robot_id.
   #
-  #
-  #
-  #
-  #
-  # <tt> client.allowed?('http:// www.simonerinzivillo.it/no-dir/')</tt>
-  #
-  # <tt>end</tt>
+  #   client = Robotstxt::Robotstxtistance.new('my_robot_id')
+  #   if client.get('http://www.simonerinzivillo.it')
+  #     client.allowed?('http://www.simonerinzivillo.it/no-dir/')
+  #   end
   #
   # This method returns <tt>true</tt> if the robots.txt file does not block the access to the URL.
   #
@@ -101,9 +100,9 @@ module Robotstxt
     @rules.each {|ua|
 
       if @robot_id == ua[0] || ua[0] == '*'
-
-        ua[1].each {|d|
 
+        ua[1].each {|d|
+
           is_allow = false if url_path.match('^' + d ) || d == '/'
 
         }
@@ -111,23 +110,17 @@ module Robotstxt
       end
 
     }
-
+    is_allow
   end
 
   # Analyze the robots.txt file to return an <tt>Array</tt> containing the list of XML Sitemaps URLs.
   #
-  #
-  #
-  #
-  #
-  #
-  #
-  # <tt> puts url</tt>
-  #
-  #
-  # <tt> }</tt>
-  #
-  # <tt>end</tt>
+  #   client = Robotstxt::Robotstxtistance.new('my_robot_id')
+  #   if client.get('http://www.simonerinzivillo.it')
+  #     client.sitemaps.each{ |url|
+  #       puts url
+  #     }
+  #   end
   #
   def sitemaps
     @sitemaps
@@ -146,7 +139,7 @@ module Robotstxt
     @body = @body.downcase
 
     @body.each_line {|r|
-
+
       case r
       when /^#.+$/
 
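Taken together with the repaired rdoc, the lower-level flow is: build a parser with a robot id, call get to fetch and parse a host's robots.txt, then query it. A sketch (host and robot id illustrative; note the rdoc calls the class Robotstxtistance, but lib/robotstxt.rb instantiates Robotstxt::Parser):

    require 'robotstxt'

    client = Robotstxt::Parser.new('my_robot_id')
    if client.get('http://www.simonerinzivillo.it')
      client.allowed?('http://www.simonerinzivillo.it/no-dir/')  # => true or false
      client.sitemaps.each { |url| puts url }
    end

One caveat visible in the rescue branch above: `@ettp = false` looks like a typo for `@ehttp = false`, so as released the single-retry guard may not terminate the retry loop on repeated network errors.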
data/robotstxt.gemspec
ADDED
@@ -0,0 +1,38 @@
+# -*- encoding: utf-8 -*-
+
+Gem::Specification.new do |s|
+  s.name = %q{robotstxt}
+  s.version = "0.5.2"
+
+  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
+  s.authors = ["Simone Rinzivillo"]
+  s.date = %q{2009-12-19}
+  s.description = %q{ Robotstxt Parser allows you to the check the accessibility of URLs and get other data. Full support for the robots.txt RFC, wildcards and Sitemap: rules.
+}
+  s.email = %q{srinzivillo@gmail.com}
+  s.extra_rdoc_files = ["LICENSE.rdoc", "README.rdoc", "lib/robotstxt.rb", "lib/robotstxt/parser.rb"]
+  s.files = ["LICENSE.rdoc", "Manifest", "README.rdoc", "Rakefile", "lib/robotstxt.rb", "lib/robotstxt/parser.rb", "test/parser_test.rb", "test/robotstxt_test.rb", "robotstxt.gemspec"]
+  s.homepage = %q{http://www.simonerinzivillo.it}
+  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Robotstxt", "--main", "README.rdoc"]
+  s.require_paths = ["lib"]
+  s.rubyforge_project = %q{robotstxt}
+  s.rubygems_version = %q{1.3.5}
+  s.summary = %q{Robotstxt is an Ruby robots.txt file parser}
+  s.test_files = ["test/parser_test.rb", "test/robotstxt_test.rb"]
+
+  if s.respond_to? :specification_version then
+    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+    s.specification_version = 3
+
+    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+      s.add_development_dependency(%q<rake>, ["~> 0.8"])
+      s.add_development_dependency(%q<echoe>, ["~> 3.1"])
+    else
+      s.add_dependency(%q<rake>, ["~> 0.8"])
+      s.add_dependency(%q<echoe>, ["~> 3.1"])
+    end
+  else
+    s.add_dependency(%q<rake>, ["~> 0.8"])
+    s.add_dependency(%q<echoe>, ["~> 3.1"])
+  end
+end
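A sketch of how this gemspec resolves when loaded (assuming a RubyGems installation; the conditional registers rake and echoe as development dependencies on RubyGems >= 1.2, and as runtime dependencies otherwise):

    require 'rubygems'

    spec = Gem::Specification.load('robotstxt.gemspec')
    puts spec.full_name                 # "robotstxt-0.5.2"
    spec.dependencies.each do |dep|
      puts "#{dep.name} (#{dep.type})"  # "rake (development)", "echoe (development)"
    end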
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: robotstxt
 version: !ruby/object:Gem::Version
-  version: 0.5.1
+  version: 0.5.2
 platform: ruby
 authors:
 - Simone Rinzivillo
@@ -9,46 +9,79 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2009-12-
+date: 2009-12-19 00:00:00 +01:00
 default_executable:
-dependencies:
-
-
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rake
+  type: :development
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: "0.8"
+    version:
+- !ruby/object:Gem::Dependency
+  name: echoe
+  type: :development
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: "3.1"
+    version:
+description: " Robotstxt Parser allows you to the check the accessibility of URLs and get other data. Full support for the robots.txt RFC, wildcards and Sitemap: rules.\n"
 email: srinzivillo@gmail.com
 executables: []
 
 extensions: []
 
 extra_rdoc_files:
+- LICENSE.rdoc
 - README.rdoc
-files:
-- lib/robotstxt/parser.rb
 - lib/robotstxt.rb
+- lib/robotstxt/parser.rb
+files:
+- LICENSE.rdoc
+- Manifest
 - README.rdoc
+- Rakefile
+- lib/robotstxt.rb
+- lib/robotstxt/parser.rb
+- test/parser_test.rb
+- test/robotstxt_test.rb
+- robotstxt.gemspec
 has_rdoc: true
 homepage: http://www.simonerinzivillo.it
 licenses: []
 
 post_install_message:
-rdoc_options:
-
+rdoc_options:
+- --line-numbers
+- --inline-source
+- --title
+- Robotstxt
+- --main
+- README.rdoc
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version:
+      version: "0"
   version:
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: "
+      version: "1.2"
   version:
 requirements: []
 
-rubyforge_project:
+rubyforge_project: robotstxt
 rubygems_version: 1.3.5
 signing_key:
 specification_version: 3
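The `~>` operators recorded for rake and echoe are RubyGems pessimistic version constraints: `~> 0.8` permits any version >= 0.8 and < 1.0. A quick check with the standard Gem::Requirement API:

    require 'rubygems'

    req = Gem::Requirement.new('~> 0.8')
    req.satisfied_by?(Gem::Version.new('0.8.7'))  # => true
    req.satisfied_by?(Gem::Version.new('0.9.9'))  # => true
    req.satisfied_by?(Gem::Version.new('1.0.0'))  # => false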
|