robotstxt 0.5.1 → 0.5.2

This diff shows the contents of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
LICENSE.rdoc ADDED
@@ -0,0 +1,25 @@
+ = License
+
+ (The MIT License)
+
+ Copyright (c) 2009 Simone Rinzivillo <srinzivillo@gmail.com>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
Manifest ADDED
@@ -0,0 +1,8 @@
+ LICENSE.rdoc
+ Manifest
+ README.rdoc
+ Rakefile
+ lib/robotstxt.rb
+ lib/robotstxt/parser.rb
+ test/parser_test.rb
+ test/robotstxt_test.rb
Rakefile ADDED
@@ -0,0 +1,57 @@
+ $:.unshift(File.dirname(__FILE__) + "/lib")
+
+ require 'rubygems'
+ require 'rake'
+ require 'echoe'
+ require 'robotstxt'
+
+
+ # Common package properties
+ PKG_NAME = 'robotstxt'
+ PKG_VERSION = Robotstxt::VERSION
+ RUBYFORGE_PROJECT = 'robotstxt'
+
+ if ENV['SNAPSHOT'].to_i == 1
+ PKG_VERSION << "." << Time.now.utc.strftime("%Y%m%d%H%M%S")
+ end
+
+
+ Echoe.new(PKG_NAME, PKG_VERSION) do |p|
+ p.author = "Simone Rinzivillo"
+ p.email = "srinzivillo@gmail.com"
+ p.summary = "Robotstxt is an Ruby robots.txt file parser"
+ p.url = "http://www.simonerinzivillo.it"
+ p.project = RUBYFORGE_PROJECT
+ p.description = <<-EOD
+ Robotstxt Parser allows you to the check the accessibility of URLs and get other data. \
+ Full support for the robots.txt RFC, wildcards and Sitemap: rules.
+ EOD
+
+ p.need_zip = true
+
+ p.development_dependencies += ["rake ~>0.8",
+ "echoe ~>3.1"]
+
+ p.rcov_options = ["-Itest -x mocha,rcov,Rakefile"]
+ end
+
+
+ desc "Open an irb session preloaded with this library"
+ task :console do
+ sh "irb -rubygems -I lib -r robotstxt.rb"
+ end
+
+ begin
+ require 'code_statistics'
+ desc "Show library's code statistics"
+ task :stats do
+ CodeStatistics.new(["Robotstxt", "lib"],
+ ["Tests", "test"]).to_s
+ end
+ rescue LoadError
+ puts "CodeStatistics (Rails) is not available"
+ end
+
+ Dir["tasks/**/*.rake"].each do |file|
+ load(file)
+ end
lib/robotstxt.rb CHANGED
@@ -24,31 +24,31 @@ module Robotstxt
  NAME = 'Robotstxt'
  GEM = 'robotstxt'
  AUTHORS = ['Simone Rinzivillo <srinzivillo@gmail.com>']
- VERSION = '0.5.1'
+ VERSION = '0.5.2'


  # Check if the <tt>URL</tt> is allowed to be crawled from the current <tt>Robot_id</tt>.
  # Robots:Allowed? returns <tt>true</tt> if the robots.txt file does not block the access to the URL.
  #
- # <tt>Robotstxt.allowed?('http:// www.simonerinzivillo.it/', 'rubytest')</tt>
+ # Robotstxt.allowed?('http://www.simonerinzivillo.it/', 'rubytest')
  #
  def self.allowed?(url, robot_id)

- u = URI.parse(url)
- r = Robotstxt::Parser.new(robot_id)
- return r.allowed?(url) if r.get(u.scheme + '://' + u.host)
+ u = URI.parse(url)
+ r = Robotstxt::Parser.new(robot_id)
+ r.allowed?(url) if r.get(u.scheme + '://' + u.host)

  end

  # Analyze the robots.txt file to return an <tt>Array</tt> containing the list of XML Sitemaps URLs.
  #
- # <tt>Robotstxt.sitemaps('http:// www.simonerinzivillo.it/', 'rubytest')</tt>
+ # Robotstxt.sitemaps('http://www.simonerinzivillo.it/', 'rubytest')
  #
  def self.sitemaps(url, robot_id)

- u = URI.parse(url)
- r = Robotstxt::Parser.new(robot_id)
- return r.sitemaps if r.get(u.scheme + '://' + u.host)
+ u = URI.parse(url)
+ r = Robotstxt::Parser.new(robot_id)
+ r.sitemaps if r.get(u.scheme + '://' + u.host)

  end

lib/robotstxt/parser.rb CHANGED
@@ -27,6 +27,7 @@ module Robotstxt
  # <tt>client = Robotstxt::Robotstxtistance.new('my_robot_id')</tt>
  #
  def initialize(robot_id = nil)
+
  @robot_id = '*'
  @rules = []
  @sitemaps = []
@@ -37,14 +38,14 @@ module Robotstxt

  # Requires and parses the Robots.txt file for the <tt>hostname</tt>.
  #
- # <tt>client = Robotstxt::Robotstxtistance.new('my_robot_id')</tt>
- #
- # <tt>client.get('http:// www.simonerinzivillo.it')</tt>
+ # client = Robotstxt::Robotstxtistance.new('my_robot_id')
+ # client.get('http://www.simonerinzivillo.it')
  #
  #
  # This method returns <tt>true</tt> if the parsing is gone.
  #
  def get(hostname)
+
  @ehttp = true
  url = URI.parse(hostname)

@@ -59,12 +60,12 @@ module Robotstxt

  case response
  when Net::HTTPSuccess then
- @found = true
- @body = response.body
- parse()
+ @found = true
+ @body = response.body
+ parse()

  else
- @found = false
+ @found = false
  end

  return @found
@@ -73,22 +74,20 @@ module Robotstxt
  if @ehttp
  @ettp = false
  retry
- else
+ else
  return nil
  end
  end
+
  end


  # Check if the <tt>URL</tt> is allowed to be crawled from the current Robot_id.
  #
- # <tt>client = Robotstxt::Robotstxtistance.new('my_robot_id')</tt>
- #
- # <tt>if client.get('http:// www.simonerinzivillo.it')</tt>
- #
- # <tt> client.allowed?('http:// www.simonerinzivillo.it/no-dir/')</tt>
- #
- # <tt>end</tt>
+ # client = Robotstxt::Robotstxtistance.new('my_robot_id')
+ # if client.get('http://www.simonerinzivillo.it')
+ # client.allowed?('http://www.simonerinzivillo.it/no-dir/')
+ # end
  #
  # This method returns <tt>true</tt> if the robots.txt file does not block the access to the URL.
  #
@@ -101,9 +100,9 @@ module Robotstxt
  @rules.each {|ua|

  if @robot_id == ua[0] || ua[0] == '*'
-
- ua[1].each {|d|

+ ua[1].each {|d|
+
  is_allow = false if url_path.match('^' + d ) || d == '/'

  }
@@ -111,23 +110,17 @@ module Robotstxt
  end

  }
- return is_allow
+ is_allow
  end

  # Analyze the robots.txt file to return an <tt>Array</tt> containing the list of XML Sitemaps URLs.
  #
- # <tt>client = Robotstxt::Robotstxtistance.new('my_robot_id')</tt>
- #
- # <tt>if client.get('http:// www.simonerinzivillo.it')</tt>
- #
- # <tt> client.sitemaps.each{ |url|</tt>
- #
- # <tt> puts url</tt>
- #
- #
- # <tt> }</tt>
- #
- # <tt>end</tt>
+ # client = Robotstxt::Robotstxtistance.new('my_robot_id')
+ # if client.get('http://www.simonerinzivillo.it')
+ # client.sitemaps.each{ |url|
+ # puts url
+ # }
+ # end
  #
  def sitemaps
  @sitemaps
@@ -146,7 +139,7 @@ module Robotstxt
  @body = @body.downcase

  @body.each_line {|r|
-
+
  case r
  when /^#.+$/

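The parser.rb hunks above are mostly indentation and rdoc cleanups around the instance-level API (the class the module code instantiates as Robotstxt::Parser; the rdoc comments refer to it as Robotstxt::Robotstxtistance). A minimal sketch of that flow, mirroring the reworked comments; the host and path are the placeholders used there:

    require 'rubygems'
    require 'robotstxt'

    client = Robotstxt::Parser.new('my_robot_id')

    # get() downloads and parses robots.txt for the host and returns true only
    # when that succeeds, so the calls below are guarded by it.
    if client.get('http://www.simonerinzivillo.it')
      puts client.allowed?('http://www.simonerinzivillo.it/no-dir/')
      client.sitemaps.each { |url| puts url }
    end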
robotstxt.gemspec ADDED
@@ -0,0 +1,38 @@
+ # -*- encoding: utf-8 -*-
+
+ Gem::Specification.new do |s|
+ s.name = %q{robotstxt}
+ s.version = "0.5.2"
+
+ s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
+ s.authors = ["Simone Rinzivillo"]
+ s.date = %q{2009-12-19}
+ s.description = %q{ Robotstxt Parser allows you to the check the accessibility of URLs and get other data. Full support for the robots.txt RFC, wildcards and Sitemap: rules.
+ }
+ s.email = %q{srinzivillo@gmail.com}
+ s.extra_rdoc_files = ["LICENSE.rdoc", "README.rdoc", "lib/robotstxt.rb", "lib/robotstxt/parser.rb"]
+ s.files = ["LICENSE.rdoc", "Manifest", "README.rdoc", "Rakefile", "lib/robotstxt.rb", "lib/robotstxt/parser.rb", "test/parser_test.rb", "test/robotstxt_test.rb", "robotstxt.gemspec"]
+ s.homepage = %q{http://www.simonerinzivillo.it}
+ s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Robotstxt", "--main", "README.rdoc"]
+ s.require_paths = ["lib"]
+ s.rubyforge_project = %q{robotstxt}
+ s.rubygems_version = %q{1.3.5}
+ s.summary = %q{Robotstxt is an Ruby robots.txt file parser}
+ s.test_files = ["test/parser_test.rb", "test/robotstxt_test.rb"]
+
+ if s.respond_to? :specification_version then
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+ s.specification_version = 3
+
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+ s.add_development_dependency(%q<rake>, ["~> 0.8"])
+ s.add_development_dependency(%q<echoe>, ["~> 3.1"])
+ else
+ s.add_dependency(%q<rake>, ["~> 0.8"])
+ s.add_dependency(%q<echoe>, ["~> 3.1"])
+ end
+ else
+ s.add_dependency(%q<rake>, ["~> 0.8"])
+ s.add_dependency(%q<echoe>, ["~> 3.1"])
+ end
+ end
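The gemspec above is what RubyGems evaluates when packaging the gem, and the metadata file in the next hunk is its serialized form. As a hedged illustration (not part of the package), it can be loaded with the standard RubyGems API to inspect the same fields:

    require 'rubygems'

    # Evaluates robotstxt.gemspec and exposes the attributes listed above.
    spec = Gem::Specification.load('robotstxt.gemspec')

    puts spec.name      # "robotstxt"
    puts spec.version   # "0.5.2"
    puts spec.summary
    spec.dependencies.each { |dep| puts dep }  # rake (~> 0.8) and echoe (~> 3.1), both development-only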
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: robotstxt
  version: !ruby/object:Gem::Version
- version: 0.5.1
+ version: 0.5.2
  platform: ruby
  authors:
  - Simone Rinzivillo
@@ -9,46 +9,79 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2009-12-06 00:00:00 +01:00
+ date: 2009-12-19 00:00:00 +01:00
  default_executable:
- dependencies: []
-
- description:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+ name: rake
+ type: :development
+ version_requirement:
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ~>
+ - !ruby/object:Gem::Version
+ version: "0.8"
+ version:
+ - !ruby/object:Gem::Dependency
+ name: echoe
+ type: :development
+ version_requirement:
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ~>
+ - !ruby/object:Gem::Version
+ version: "3.1"
+ version:
+ description: " Robotstxt Parser allows you to the check the accessibility of URLs and get other data. Full support for the robots.txt RFC, wildcards and Sitemap: rules.\n"
  email: srinzivillo@gmail.com
  executables: []

  extensions: []

  extra_rdoc_files:
+ - LICENSE.rdoc
  - README.rdoc
- files:
- - lib/robotstxt/parser.rb
  - lib/robotstxt.rb
+ - lib/robotstxt/parser.rb
+ files:
+ - LICENSE.rdoc
+ - Manifest
  - README.rdoc
+ - Rakefile
+ - lib/robotstxt.rb
+ - lib/robotstxt/parser.rb
+ - test/parser_test.rb
+ - test/robotstxt_test.rb
+ - robotstxt.gemspec
  has_rdoc: true
  homepage: http://www.simonerinzivillo.it
  licenses: []

  post_install_message:
- rdoc_options: []
-
+ rdoc_options:
+ - --line-numbers
+ - --inline-source
+ - --title
+ - Robotstxt
+ - --main
+ - README.rdoc
  require_paths:
  - lib
  required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
- version: 1.8.7
+ version: "0"
  version:
  required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
- version: "0"
+ version: "1.2"
  version:
  requirements: []

- rubyforge_project:
+ rubyforge_project: robotstxt
  rubygems_version: 1.3.5
  signing_key:
  specification_version: 3