robotstxt 0.5.1 → 0.5.2

LICENSE.rdoc ADDED
@@ -0,0 +1,25 @@
+ = License
+
+ (The MIT License)
+
+ Copyright (c) 2009 Simone Rinzivillo <srinzivillo@gmail.com>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+

Manifest ADDED
@@ -0,0 +1,8 @@
+ LICENSE.rdoc
+ Manifest
+ README.rdoc
+ Rakefile
+ lib/robotstxt.rb
+ lib/robotstxt/parser.rb
+ test/parser_test.rb
+ test/robotstxt_test.rb

Rakefile ADDED
@@ -0,0 +1,57 @@
+ $:.unshift(File.dirname(__FILE__) + "/lib")
+
+ require 'rubygems'
+ require 'rake'
+ require 'echoe'
+ require 'robotstxt'
+
+
+ # Common package properties
+ PKG_NAME = 'robotstxt'
+ PKG_VERSION = Robotstxt::VERSION
+ RUBYFORGE_PROJECT = 'robotstxt'
+
+ if ENV['SNAPSHOT'].to_i == 1
+ PKG_VERSION << "." << Time.now.utc.strftime("%Y%m%d%H%M%S")
+ end
+
+
+ Echoe.new(PKG_NAME, PKG_VERSION) do |p|
+ p.author = "Simone Rinzivillo"
+ p.email = "srinzivillo@gmail.com"
+ p.summary = "Robotstxt is an Ruby robots.txt file parser"
+ p.url = "http://www.simonerinzivillo.it"
+ p.project = RUBYFORGE_PROJECT
+ p.description = <<-EOD
+ Robotstxt Parser allows you to the check the accessibility of URLs and get other data. \
+ Full support for the robots.txt RFC, wildcards and Sitemap: rules.
+ EOD
+
+ p.need_zip = true
+
+ p.development_dependencies += ["rake ~>0.8",
+ "echoe ~>3.1"]
+
+ p.rcov_options = ["-Itest -x mocha,rcov,Rakefile"]
+ end
+
+
+ desc "Open an irb session preloaded with this library"
+ task :console do
+ sh "irb -rubygems -I lib -r robotstxt.rb"
+ end
+
+ begin
+ require 'code_statistics'
+ desc "Show library's code statistics"
+ task :stats do
+ CodeStatistics.new(["Robotstxt", "lib"],
+ ["Tests", "test"]).to_s
+ end
+ rescue LoadError
+ puts "CodeStatistics (Rails) is not available"
+ end
+
+ Dir["tasks/**/*.rake"].each do |file|
+ load(file)
+ end
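
As an aside, the SNAPSHOT switch in the Rakefile above simply appends a UTC timestamp to the package version before Echoe packages it. A minimal standalone sketch of that behaviour (the version string and the timestamp in the comment are illustrative, not taken from a real build):

# Illustration of the SNAPSHOT handling shown in the Rakefile above.
pkg_version = "0.5.2".dup

if ENV['SNAPSHOT'].to_i == 1
  # e.g. "0.5.2.20091219153000" when the build is run with SNAPSHOT=1
  pkg_version << "." << Time.now.utc.strftime("%Y%m%d%H%M%S")
end

puts pkg_version
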

lib/robotstxt.rb CHANGED
@@ -24,31 +24,31 @@ module Robotstxt
  NAME = 'Robotstxt'
  GEM = 'robotstxt'
  AUTHORS = ['Simone Rinzivillo <srinzivillo@gmail.com>']
- VERSION = '0.5.1'
+ VERSION = '0.5.2'
 
 
  # Check if the <tt>URL</tt> is allowed to be crawled from the current <tt>Robot_id</tt>.
  # Robots:Allowed? returns <tt>true</tt> if the robots.txt file does not block the access to the URL.
  #
- # <tt>Robotstxt.allowed?('http:// www.simonerinzivillo.it/', 'rubytest')</tt>
+ # Robotstxt.allowed?('http://www.simonerinzivillo.it/', 'rubytest')
  #
  def self.allowed?(url, robot_id)
 
- u = URI.parse(url)
- r = Robotstxt::Parser.new(robot_id)
- return r.allowed?(url) if r.get(u.scheme + '://' + u.host)
+ u = URI.parse(url)
+ r = Robotstxt::Parser.new(robot_id)
+ r.allowed?(url) if r.get(u.scheme + '://' + u.host)
 
  end
 
  # Analyze the robots.txt file to return an <tt>Array</tt> containing the list of XML Sitemaps URLs.
  #
- # <tt>Robotstxt.sitemaps('http:// www.simonerinzivillo.it/', 'rubytest')</tt>
+ # Robotstxt.sitemaps('http://www.simonerinzivillo.it/', 'rubytest')
  #
  def self.sitemaps(url, robot_id)
 
- u = URI.parse(url)
- r = Robotstxt::Parser.new(robot_id)
- return r.sitemaps if r.get(u.scheme + '://' + u.host)
+ u = URI.parse(url)
+ r = Robotstxt::Parser.new(robot_id)
+ r.sitemaps if r.get(u.scheme + '://' + u.host)
 
  end
 
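The two module-level helpers touched in this hunk are the gem's one-call interface. A minimal usage sketch based on the calls documented in the comments above (the URL and the 'rubytest' robot id are the placeholder values from those comments):

require 'robotstxt'

# Ask whether this robot id may fetch the given URL.
if Robotstxt.allowed?('http://www.simonerinzivillo.it/', 'rubytest')
  puts 'crawling allowed'
end

# List the Sitemap: URLs declared in the host's robots.txt.
# With the change above, sitemaps returns nil when robots.txt cannot
# be fetched, so wrap the result with Array() before iterating.
sitemaps = Robotstxt.sitemaps('http://www.simonerinzivillo.it/', 'rubytest')
Array(sitemaps).each { |url| puts url }
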

lib/robotstxt/parser.rb CHANGED
@@ -27,6 +27,7 @@ module Robotstxt
  # <tt>client = Robotstxt::Robotstxtistance.new('my_robot_id')</tt>
  #
  def initialize(robot_id = nil)
+
  @robot_id = '*'
  @rules = []
  @sitemaps = []
@@ -37,14 +38,14 @@ module Robotstxt
 
  # Requires and parses the Robots.txt file for the <tt>hostname</tt>.
  #
- # <tt>client = Robotstxt::Robotstxtistance.new('my_robot_id')</tt>
- #
- # <tt>client.get('http:// www.simonerinzivillo.it')</tt>
+ # client = Robotstxt::Robotstxtistance.new('my_robot_id')
+ # client.get('http://www.simonerinzivillo.it')
  #
  #
  # This method returns <tt>true</tt> if the parsing is gone.
  #
  def get(hostname)
+
  @ehttp = true
  url = URI.parse(hostname)
 
@@ -59,12 +60,12 @@ module Robotstxt
 
  case response
  when Net::HTTPSuccess then
- @found = true
- @body = response.body
- parse()
+ @found = true
+ @body = response.body
+ parse()
 
  else
- @found = false
+ @found = false
  end
 
  return @found
@@ -73,22 +74,20 @@ module Robotstxt
  if @ehttp
  @ettp = false
  retry
- else
+ else
  return nil
  end
  end
+
  end
 
 
  # Check if the <tt>URL</tt> is allowed to be crawled from the current Robot_id.
  #
- # <tt>client = Robotstxt::Robotstxtistance.new('my_robot_id')</tt>
- #
- # <tt>if client.get('http:// www.simonerinzivillo.it')</tt>
- #
- # <tt> client.allowed?('http:// www.simonerinzivillo.it/no-dir/')</tt>
- #
- # <tt>end</tt>
+ # client = Robotstxt::Robotstxtistance.new('my_robot_id')
+ # if client.get('http://www.simonerinzivillo.it')
+ # client.allowed?('http://www.simonerinzivillo.it/no-dir/')
+ # end
  #
  # This method returns <tt>true</tt> if the robots.txt file does not block the access to the URL.
  #
@@ -101,9 +100,9 @@ module Robotstxt
  @rules.each {|ua|
 
  if @robot_id == ua[0] || ua[0] == '*'
-
- ua[1].each {|d|
 
+ ua[1].each {|d|
+
  is_allow = false if url_path.match('^' + d ) || d == '/'
 
  }
@@ -111,23 +110,17 @@ module Robotstxt
  end
 
  }
- return is_allow
+ is_allow
  end
 
  # Analyze the robots.txt file to return an <tt>Array</tt> containing the list of XML Sitemaps URLs.
  #
- # <tt>client = Robotstxt::Robotstxtistance.new('my_robot_id')</tt>
- #
- # <tt>if client.get('http:// www.simonerinzivillo.it')</tt>
- #
- # <tt> client.sitemaps.each{ |url|</tt>
- #
- # <tt> puts url</tt>
- #
- #
- # <tt> }</tt>
- #
- # <tt>end</tt>
+ # client = Robotstxt::Robotstxtistance.new('my_robot_id')
+ # if client.get('http://www.simonerinzivillo.it')
+ # client.sitemaps.each{ |url|
+ # puts url
+ # }
+ # end
  #
  def sitemaps
  @sitemaps
@@ -146,7 +139,7 @@ module Robotstxt
  @body = @body.downcase
 
  @body.each_line {|r|
-
+
  case r
  when /^#.+$/
 
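The RDoc comments in this file name the class Robotstxt::Robotstxtistance, while the module helpers in lib/robotstxt.rb instantiate Robotstxt::Parser; assuming the latter name, a short sketch of the lower-level flow those comments describe:

require 'robotstxt'

client = Robotstxt::Parser.new('my_robot_id')

# get() downloads and parses robots.txt for the host, returning true on success.
if client.get('http://www.simonerinzivillo.it')
  # allowed? checks a full URL against the parsed rules for this robot id.
  puts client.allowed?('http://www.simonerinzivillo.it/no-dir/')

  # sitemaps returns the Array of Sitemap: URLs found in robots.txt.
  client.sitemaps.each { |url| puts url }
end
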

robotstxt.gemspec ADDED
@@ -0,0 +1,38 @@
+ # -*- encoding: utf-8 -*-
+
+ Gem::Specification.new do |s|
+ s.name = %q{robotstxt}
+ s.version = "0.5.2"
+
+ s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
+ s.authors = ["Simone Rinzivillo"]
+ s.date = %q{2009-12-19}
+ s.description = %q{ Robotstxt Parser allows you to the check the accessibility of URLs and get other data. Full support for the robots.txt RFC, wildcards and Sitemap: rules.
+ }
+ s.email = %q{srinzivillo@gmail.com}
+ s.extra_rdoc_files = ["LICENSE.rdoc", "README.rdoc", "lib/robotstxt.rb", "lib/robotstxt/parser.rb"]
+ s.files = ["LICENSE.rdoc", "Manifest", "README.rdoc", "Rakefile", "lib/robotstxt.rb", "lib/robotstxt/parser.rb", "test/parser_test.rb", "test/robotstxt_test.rb", "robotstxt.gemspec"]
+ s.homepage = %q{http://www.simonerinzivillo.it}
+ s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Robotstxt", "--main", "README.rdoc"]
+ s.require_paths = ["lib"]
+ s.rubyforge_project = %q{robotstxt}
+ s.rubygems_version = %q{1.3.5}
+ s.summary = %q{Robotstxt is an Ruby robots.txt file parser}
+ s.test_files = ["test/parser_test.rb", "test/robotstxt_test.rb"]
+
+ if s.respond_to? :specification_version then
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+ s.specification_version = 3
+
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+ s.add_development_dependency(%q<rake>, ["~> 0.8"])
+ s.add_development_dependency(%q<echoe>, ["~> 3.1"])
+ else
+ s.add_dependency(%q<rake>, ["~> 0.8"])
+ s.add_dependency(%q<echoe>, ["~> 3.1"])
+ end
+ else
+ s.add_dependency(%q<rake>, ["~> 0.8"])
+ s.add_dependency(%q<echoe>, ["~> 3.1"])
+ end
+ end
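
For reference, the spec above can be loaded and inspected with RubyGems itself; a small sketch, assuming it is run from the gem's root directory:

require 'rubygems'

# Load the gemspec shown above and print a few of its fields.
spec = Gem::Specification.load('robotstxt.gemspec')
puts spec.name     # robotstxt
puts spec.version  # 0.5.2
puts spec.summary
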
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: robotstxt
  version: !ruby/object:Gem::Version
- version: 0.5.1
+ version: 0.5.2
  platform: ruby
  authors:
  - Simone Rinzivillo
@@ -9,46 +9,79 @@ autorequire:
  bindir: bin
  cert_chain: []
 
- date: 2009-12-06 00:00:00 +01:00
+ date: 2009-12-19 00:00:00 +01:00
  default_executable:
- dependencies: []
-
- description:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+ name: rake
+ type: :development
+ version_requirement:
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ~>
+ - !ruby/object:Gem::Version
+ version: "0.8"
+ version:
+ - !ruby/object:Gem::Dependency
+ name: echoe
+ type: :development
+ version_requirement:
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ~>
+ - !ruby/object:Gem::Version
+ version: "3.1"
+ version:
+ description: " Robotstxt Parser allows you to the check the accessibility of URLs and get other data. Full support for the robots.txt RFC, wildcards and Sitemap: rules.\n"
  email: srinzivillo@gmail.com
  executables: []
 
  extensions: []
 
  extra_rdoc_files:
+ - LICENSE.rdoc
  - README.rdoc
- files:
- - lib/robotstxt/parser.rb
  - lib/robotstxt.rb
+ - lib/robotstxt/parser.rb
+ files:
+ - LICENSE.rdoc
+ - Manifest
  - README.rdoc
+ - Rakefile
+ - lib/robotstxt.rb
+ - lib/robotstxt/parser.rb
+ - test/parser_test.rb
+ - test/robotstxt_test.rb
+ - robotstxt.gemspec
  has_rdoc: true
  homepage: http://www.simonerinzivillo.it
  licenses: []
 
  post_install_message:
- rdoc_options: []
-
+ rdoc_options:
+ - --line-numbers
+ - --inline-source
+ - --title
+ - Robotstxt
+ - --main
+ - README.rdoc
  require_paths:
  - lib
  required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
- version: 1.8.7
+ version: "0"
  version:
  required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
- version: "0"
+ version: "1.2"
  version:
  requirements: []
 
- rubyforge_project:
+ rubyforge_project: robotstxt
  rubygems_version: 1.3.5
  signing_key:
  specification_version: 3