robotstxt 0.5.1 → 0.5.2
- data/LICENSE.rdoc +25 -0
- data/Manifest +8 -0
- data/Rakefile +57 -0
- data/lib/robotstxt.rb +9 -9
- data/lib/robotstxt/parser.rb +24 -31
- data/robotstxt.gemspec +38 -0
- metadata +45 -12
data/LICENSE.rdoc
ADDED
@@ -0,0 +1,25 @@
+= License
+
+(The MIT License)
+
+Copyright (c) 2009 Simone Rinzivillo <srinzivillo@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
data/Manifest
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,57 @@
+$:.unshift(File.dirname(__FILE__) + "/lib")
+
+require 'rubygems'
+require 'rake'
+require 'echoe'
+require 'robotstxt'
+
+
+# Common package properties
+PKG_NAME = 'robotstxt'
+PKG_VERSION = Robotstxt::VERSION
+RUBYFORGE_PROJECT = 'robotstxt'
+
+if ENV['SNAPSHOT'].to_i == 1
+  PKG_VERSION << "." << Time.now.utc.strftime("%Y%m%d%H%M%S")
+end
+
+
+Echoe.new(PKG_NAME, PKG_VERSION) do |p|
+  p.author = "Simone Rinzivillo"
+  p.email = "srinzivillo@gmail.com"
+  p.summary = "Robotstxt is a Ruby robots.txt file parser"
+  p.url = "http://www.simonerinzivillo.it"
+  p.project = RUBYFORGE_PROJECT
+  p.description = <<-EOD
+    Robotstxt Parser allows you to check the accessibility of URLs and get other data. \
+    Full support for the robots.txt RFC, wildcards and Sitemap: rules.
+  EOD
+
+  p.need_zip = true
+
+  p.development_dependencies += ["rake ~>0.8",
+                                 "echoe ~>3.1"]
+
+  p.rcov_options = ["-Itest -x mocha,rcov,Rakefile"]
+end
+
+
+desc "Open an irb session preloaded with this library"
+task :console do
+  sh "irb -rubygems -I lib -r robotstxt.rb"
+end
+
+begin
+  require 'code_statistics'
+  desc "Show library's code statistics"
+  task :stats do
+    CodeStatistics.new(["Robotstxt", "lib"],
+                       ["Tests", "test"]).to_s
+  end
+rescue LoadError
+  puts "CodeStatistics (Rails) is not available"
+end
+
+Dir["tasks/**/*.rake"].each do |file|
+  load(file)
+end
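A note on the SNAPSHOT switch above: when the environment variable is set, the package version gains a UTC timestamp suffix before the gem is built. A minimal Ruby sketch of the effect, with a hypothetical build time:

  # Hypothetical illustration of the SNAPSHOT versioning in the Rakefile above.
  version = "0.5.2".dup
  version << "." << Time.utc(2009, 12, 19, 15, 30, 0).strftime("%Y%m%d%H%M%S")
  puts version  # => "0.5.2.20091219153000"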
data/lib/robotstxt.rb
CHANGED
@@ -24,31 +24,31 @@ module Robotstxt
   NAME = 'Robotstxt'
   GEM = 'robotstxt'
   AUTHORS = ['Simone Rinzivillo <srinzivillo@gmail.com>']
-  VERSION = '0.5.1'
+  VERSION = '0.5.2'


   # Check if the <tt>URL</tt> is allowed to be crawled from the current <tt>Robot_id</tt>.
   # Robotstxt.allowed? returns <tt>true</tt> if the robots.txt file does not block the access to the URL.
   #
-  #
+  # Robotstxt.allowed?('http://www.simonerinzivillo.it/', 'rubytest')
   #
   def self.allowed?(url, robot_id)

-
-
-
+    u = URI.parse(url)
+    r = Robotstxt::Parser.new(robot_id)
+    r.allowed?(url) if r.get(u.scheme + '://' + u.host)

   end

   # Analyze the robots.txt file to return an <tt>Array</tt> containing the list of XML Sitemaps URLs.
   #
-  #
+  # Robotstxt.sitemaps('http://www.simonerinzivillo.it/', 'rubytest')
   #
   def self.sitemaps(url, robot_id)

-
-
-
+    u = URI.parse(url)
+    r = Robotstxt::Parser.new(robot_id)
+    r.sitemaps if r.get(u.scheme + '://' + u.host)

   end
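The rewritten class methods are thin wrappers: parse the URL, build a Robotstxt::Parser for the given robot id, fetch robots.txt for the URL's scheme and host, and delegate. A minimal usage sketch based on the doc-comment examples above (the host and the 'rubytest' robot id come from those comments); note that, as written, both methods return nil when robots.txt cannot be fetched:

  require 'robotstxt'

  # true if robots.txt does not block 'rubytest' from the root URL,
  # false if it does, nil if robots.txt could not be fetched.
  puts Robotstxt.allowed?('http://www.simonerinzivillo.it/', 'rubytest')

  # Array of Sitemap: URLs declared in robots.txt (nil if the fetch failed).
  sitemaps = Robotstxt.sitemaps('http://www.simonerinzivillo.it/', 'rubytest')
  (sitemaps || []).each { |url| puts url }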
data/lib/robotstxt/parser.rb
CHANGED
@@ -27,6 +27,7 @@ module Robotstxt
   # <tt>client = Robotstxt::Robotstxtistance.new('my_robot_id')</tt>
   #
   def initialize(robot_id = nil)
+
     @robot_id = '*'
     @rules = []
     @sitemaps = []
@@ -37,14 +38,14 @@ module Robotstxt

   # Requires and parses the Robots.txt file for the <tt>hostname</tt>.
   #
-  #
-  #
-  # <tt>client.get('http:// www.simonerinzivillo.it')</tt>
+  # client = Robotstxt::Robotstxtistance.new('my_robot_id')
+  # client.get('http://www.simonerinzivillo.it')
   #
   #
   # This method returns <tt>true</tt> if the parsing succeeded.
   #
   def get(hostname)
+
     @ehttp = true
     url = URI.parse(hostname)

@@ -59,12 +60,12 @@ module Robotstxt

     case response
     when Net::HTTPSuccess then
-
-
-
+      @found = true
+      @body = response.body
+      parse()

     else
-
+      @found = false
     end

     return @found
@@ -73,22 +74,20 @@ module Robotstxt
       if @ehttp
        @ettp = false
        retry
-
+      else
        return nil
      end
    end
+
  end


  # Check if the <tt>URL</tt> is allowed to be crawled from the current Robot_id.
  #
-  #
-  #
-  #
-  #
-  # <tt> client.allowed?('http:// www.simonerinzivillo.it/no-dir/')</tt>
-  #
-  # <tt>end</tt>
+  # client = Robotstxt::Robotstxtistance.new('my_robot_id')
+  # if client.get('http://www.simonerinzivillo.it')
+  #   client.allowed?('http://www.simonerinzivillo.it/no-dir/')
+  # end
  #
  # This method returns <tt>true</tt> if the robots.txt file does not block the access to the URL.
  #
@@ -101,9 +100,9 @@ module Robotstxt
    @rules.each {|ua|

      if @robot_id == ua[0] || ua[0] == '*'
-
-        ua[1].each {|d|

+        ua[1].each {|d|
+
        is_allow = false if url_path.match('^' + d ) || d == '/'

      }
@@ -111,23 +110,17 @@ module Robotstxt
      end

    }
-
+    is_allow
  end

  # Analyze the robots.txt file to return an <tt>Array</tt> containing the list of XML Sitemaps URLs.
  #
-  #
-  #
-  #
-  #
-  #
-  #
-  # <tt> puts url</tt>
-  #
-  #
-  # <tt> }</tt>
-  #
-  # <tt>end</tt>
+  # client = Robotstxt::Robotstxtistance.new('my_robot_id')
+  # if client.get('http://www.simonerinzivillo.it')
+  #   client.sitemaps.each { |url|
+  #     puts url
+  #   }
+  # end
  #
  def sitemaps
    @sitemaps
@@ -146,7 +139,7 @@ module Robotstxt
    @body = @body.downcase

    @body.each_line {|r|
-
+
      case r
      when /^#.+$/
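For repeated checks against a single host, the restored doc comments show the intended pattern: create one client, call get once, then query it. A sketch assembled from those comments (using the concrete Robotstxt::Parser class rather than the Robotstxt::Robotstxtistance name the comments mention):

  require 'robotstxt'

  client = Robotstxt::Parser.new('my_robot_id')

  # get() returns true only when robots.txt was found and parsed,
  # so guard the allowed?/sitemaps calls on it.
  if client.get('http://www.simonerinzivillo.it')
    puts client.allowed?('http://www.simonerinzivillo.it/no-dir/')
    client.sitemaps.each { |url| puts url }
  end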
data/robotstxt.gemspec
ADDED
@@ -0,0 +1,38 @@
+# -*- encoding: utf-8 -*-
+
+Gem::Specification.new do |s|
+  s.name = %q{robotstxt}
+  s.version = "0.5.2"
+
+  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
+  s.authors = ["Simone Rinzivillo"]
+  s.date = %q{2009-12-19}
+  s.description = %q{ Robotstxt Parser allows you to check the accessibility of URLs and get other data. Full support for the robots.txt RFC, wildcards and Sitemap: rules.
+}
+  s.email = %q{srinzivillo@gmail.com}
+  s.extra_rdoc_files = ["LICENSE.rdoc", "README.rdoc", "lib/robotstxt.rb", "lib/robotstxt/parser.rb"]
+  s.files = ["LICENSE.rdoc", "Manifest", "README.rdoc", "Rakefile", "lib/robotstxt.rb", "lib/robotstxt/parser.rb", "test/parser_test.rb", "test/robotstxt_test.rb", "robotstxt.gemspec"]
+  s.homepage = %q{http://www.simonerinzivillo.it}
+  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Robotstxt", "--main", "README.rdoc"]
+  s.require_paths = ["lib"]
+  s.rubyforge_project = %q{robotstxt}
+  s.rubygems_version = %q{1.3.5}
+  s.summary = %q{Robotstxt is a Ruby robots.txt file parser}
+  s.test_files = ["test/parser_test.rb", "test/robotstxt_test.rb"]
+
+  if s.respond_to? :specification_version then
+    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+    s.specification_version = 3
+
+    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+      s.add_development_dependency(%q<rake>, ["~> 0.8"])
+      s.add_development_dependency(%q<echoe>, ["~> 3.1"])
+    else
+      s.add_dependency(%q<rake>, ["~> 0.8"])
+      s.add_dependency(%q<echoe>, ["~> 3.1"])
+    end
+  else
+    s.add_dependency(%q<rake>, ["~> 0.8"])
+    s.add_dependency(%q<echoe>, ["~> 3.1"])
+  end
+end
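Because the new gemspec is plain Ruby, it can be sanity-checked outside of Rake/echoe with the standard RubyGems API. A sketch, assuming it runs from an unpacked copy of the gem:

  require 'rubygems'

  spec = Gem::Specification.load('robotstxt.gemspec')
  puts spec.full_name   # => "robotstxt-0.5.2"
  puts spec.summary
  puts spec.files.size  # => 9, the files listed above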
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: robotstxt
 version: !ruby/object:Gem::Version
-  version: 0.5.1
+  version: 0.5.2
 platform: ruby
 authors:
 - Simone Rinzivillo
@@ -9,46 +9,79 @@ autorequire:
 bindir: bin
 cert_chain: []

-date: 2009-12-
+date: 2009-12-19 00:00:00 +01:00
 default_executable:
-dependencies:
-
-
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rake
+  type: :development
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: "0.8"
+    version:
+- !ruby/object:Gem::Dependency
+  name: echoe
+  type: :development
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: "3.1"
+    version:
+description: " Robotstxt Parser allows you to check the accessibility of URLs and get other data. Full support for the robots.txt RFC, wildcards and Sitemap: rules.\n"
 email: srinzivillo@gmail.com
 executables: []

 extensions: []

 extra_rdoc_files:
+- LICENSE.rdoc
 - README.rdoc
-files:
-- lib/robotstxt/parser.rb
 - lib/robotstxt.rb
+- lib/robotstxt/parser.rb
+files:
+- LICENSE.rdoc
+- Manifest
 - README.rdoc
+- Rakefile
+- lib/robotstxt.rb
+- lib/robotstxt/parser.rb
+- test/parser_test.rb
+- test/robotstxt_test.rb
+- robotstxt.gemspec
 has_rdoc: true
 homepage: http://www.simonerinzivillo.it
 licenses: []

 post_install_message:
-rdoc_options:
-
+rdoc_options:
+- --line-numbers
+- --inline-source
+- --title
+- Robotstxt
+- --main
+- README.rdoc
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version:
+      version: "0"
   version:
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: "0"
+      version: "1.2"
   version:
 requirements: []

-rubyforge_project:
+rubyforge_project: robotstxt
 rubygems_version: 1.3.5
 signing_key:
 specification_version: 3
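The regenerated metadata records both development dependencies with pessimistic (~>) constraints. A small sketch of what those requirements accept, using the standard Gem::Requirement API:

  require 'rubygems'

  req = Gem::Requirement.new('~> 0.8')
  puts req.satisfied_by?(Gem::Version.new('0.8.7'))  # => true  (>= 0.8 and < 1.0)
  puts req.satisfied_by?(Gem::Version.new('1.0.0'))  # => false (1.0 and above excluded)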