robotstxt 0.5.0
- data/README.rdoc +45 -0
- data/lib/robotstxt.rb +55 -0
- data/lib/robotstxt/robotstxtistance.rb +176 -0
- data/test/robotstxt_test.rb +19 -0
- data/test/robotstxtistance_test.rb +43 -0
- metadata +58 -0
data/README.rdoc
ADDED
@@ -0,0 +1,45 @@
= Robotstxt

Robotstxt is a Ruby robots.txt file parser.

Robotstxt lets you check whether a given user-agent may crawl a URL and retrieve other data from a site's robots.txt file.

Full support for the robots.txt RFC, wildcards and Sitemap: rules.


== Features

* Check if a URL is allowed to be crawled by your robot
* Analyze the robots.txt file and return an Array of the XML Sitemap URLs it declares


== Requirements

* Ruby >= 1.8.7


== Installation

This library is intended to be installed via the
RubyGems[http://rubyforge.org/projects/rubygems/] system.

  $ gem install robotstxt

You might need administrator privileges on your system to install it.


== Author

Author:: {Simone Rinzivillo}[http://www.simonerinzivillo.it/] <srinzivillo@gmail.com>


== Resources

* {Homepage}[http://www.simonerinzivillo.it/]


== License

Copyright (c) 2009 Simone Rinzivillo. Robotstxt is released under the MIT license.

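The README does not include a usage snippet; based on the library code later in this diff, basic usage looks roughly like the sketch below (the URL and robot name are placeholders):

  require 'rubygems'
  require 'robotstxt'

  # true/false when robots.txt permits or blocks the URL for this user-agent,
  # nil when the file could not be fetched (see lib/robotstxt.rb below).
  Robotstxt.allowed?('http://www.example.com/page.html', 'my-robot')

  # Array of Sitemap URLs declared in the site's robots.txt.
  Robotstxt.sitemaps('http://www.example.com/', 'my-robot')
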
data/lib/robotstxt.rb
ADDED
@@ -0,0 +1,55 @@
#
# = Ruby Robotstxt
#
# A Ruby robots.txt parser.
#
# Category::   Net
# Package::    Robotstxt
# Author::     Simone Rinzivillo <srinzivillo@gmail.com>
# License::    MIT License
#
#--
#
#++


require 'robotstxt/robotstxtistance'
require 'uri'


module Robotstxt

  NAME    = 'Robotstxt'
  GEM     = 'robotstxt'
  AUTHORS = ['Simone Rinzivillo <srinzivillo@gmail.com>']
  VERSION = '0.5.0'

  # Checks if the <tt>url</tt> is allowed to be crawled by the given <tt>robot_id</tt>.
  # Robotstxt.allowed? returns <tt>true</tt> if the robots.txt file does not block access to the URL.
  #
  #   Robotstxt.allowed?('http://www.simonerinzivillo.it/', 'rubytest')
  #
  def self.allowed?(url, robot_id)
    u = URI.parse(url)
    r = Robotstxt::Robotstxtistance.new(robot_id)
    return r.allowed?(url) if r.get(u.scheme + '://' + u.host)
  end

  # Analyzes the robots.txt file and returns an <tt>Array</tt> containing the list of XML Sitemap URLs.
  #
  #   Robotstxt.sitemaps('http://www.simonerinzivillo.it/', 'rubytest')
  #
  def self.sitemaps(url, robot_id)
    u = URI.parse(url)
    r = Robotstxt::Robotstxtistance.new(robot_id)
    return r.sitemaps if r.get(u.scheme + '://' + u.host)
  end

end

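Both helpers return a result only when the robots.txt file could be fetched; if the request fails or the file is missing, the guarded return never fires and the method returns nil. A minimal, hedged sketch of how a caller might distinguish the three outcomes (the URL and robot name are placeholders):

  require 'rubygems'
  require 'robotstxt'

  result = Robotstxt.allowed?('http://www.example.com/page.html', 'my-robot')

  case result
  when true  then puts 'crawling allowed'
  when false then puts 'blocked by robots.txt'
  when nil   then puts 'robots.txt missing or could not be fetched'
  end
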
data/lib/robotstxt/robotstxtistance.rb
ADDED
@@ -0,0 +1,176 @@

#
# = Ruby Robotstxt
#
# A Ruby robots.txt parser.
#
# Category::   Net
# Package::    Robotstxt
# Author::     Simone Rinzivillo <srinzivillo@gmail.com>
# License::    MIT License
#
#--
#
#++

require 'rubygems'
require 'net/http'
require 'uri'


module Robotstxt
  class Robotstxtistance
    attr_accessor :robot_id
    attr_reader :found, :body, :sitemaps, :rules

    # Initializes a new Robotstxt::Robotstxtistance with the <tt>robot_id</tt> option.
    #
    #   client = Robotstxt::Robotstxtistance.new('my_robot_id')
    #
    def initialize(robot_id = nil)
      @robot_id = '*'
      @rules    = []
      @sitemaps = []
      @robot_id = robot_id.downcase unless robot_id.nil?
    end

    # Requests and parses the robots.txt file for the <tt>hostname</tt>.
    #
    #   client = Robotstxt::Robotstxtistance.new('my_robot_id')
    #   client.get('http://www.simonerinzivillo.it')
    #
    # This method returns <tt>true</tt> if the file was fetched and parsed successfully.
    #
    def get(hostname)
      @ehttp = true
      url = URI.parse(hostname)

      begin
        http = Net::HTTP.new(url.host, url.port)
        if url.scheme == 'https'
          http.verify_mode = OpenSSL::SSL::VERIFY_NONE
          http.use_ssl = true
        end

        response = http.request(Net::HTTP::Get.new('/robots.txt'))

        case response
        when Net::HTTPSuccess
          @found = true
          @body  = response.body
          parse()
        else
          @found = false
        end

        return @found

      rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET => e
        if @ehttp
          # Retry the request once, then give up.
          @ehttp = false
          retry
        else
          return nil
        end
      end
    end

    # Checks if the <tt>URL</tt> is allowed to be crawled by the current robot_id.
    #
    #   client = Robotstxt::Robotstxtistance.new('my_robot_id')
    #   if client.get('http://www.simonerinzivillo.it')
    #     client.allowed?('http://www.simonerinzivillo.it/no-dir/')
    #   end
    #
    # This method returns <tt>true</tt> if the robots.txt file does not block access to the URL.
    #
    def allowed?(var)
      is_allow = true
      url = URI.parse(var)
      querystring = url.query.nil? ? '' : '?' + url.query
      url_path = url.path + querystring

      @rules.each do |ua|
        if @robot_id == ua[0] || ua[0] == '*'
          ua[1].each do |d|
            is_allow = false if url_path.match('^' + d) || d == '/'
          end
        end
      end
      return is_allow
    end

    # Analyzes the robots.txt file and returns an <tt>Array</tt> containing the list of XML Sitemap URLs.
    #
    #   client = Robotstxt::Robotstxtistance.new('my_robot_id')
    #   if client.get('http://www.simonerinzivillo.it')
    #     client.sitemaps.each { |url| puts url }
    #   end
    #
    def sitemaps()
      return @sitemaps
    end

    # This method returns <tt>true</tt> if the robots.txt file was found and parsed.
    #
    def found?()
      return @found
    end


    private

    def parse()
      @body = @body.downcase

      @body.each_line do |r|
        case r
        when /^#.+$/
          # Skip comment lines.
        when /^\s*user-agent\s*:.+$/
          # Each rule entry is [user-agent, disallow patterns, allow patterns].
          @rules << [r.split(':')[1].strip, [], []]
        when /^\s*disallow\s*:.+$/
          r = r.split(':')[1].strip
          @rules.last[1] << r.gsub(/\*/, '.+') if r.length > 0
        when /^\s*allow\s*:.+$/
          r = r.split(':')[1].strip
          @rules.last[2] << r.gsub(/\*/, '.+') if r.length > 0
        when /^\s*sitemap\s*:.+$/
          # Re-join the scheme and the rest of the URL around the ':' lost by split.
          @sitemaps << r.split(':')[1].strip + ':' + r.split(':')[2].strip if r.length > 0
        end
      end
    end

  end
end

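To make the data structures above concrete, here is a hedged sketch (not part of the gem) that feeds a small, invented robots.txt body straight to the private parse method and inspects the resulting rules and sitemaps:

  require 'rubygems'
  require 'robotstxt'

  # Hypothetical robots.txt content, invented for illustration only.
  body = "User-agent: rubytest\nDisallow: /no-dir/\nDisallow: /*.php\nSitemap: http://www.example.com/sitemap.xml\n"

  client = Robotstxt::Robotstxtistance.new('rubytest')

  # parse() is private and normally runs after get(); we bypass the HTTP
  # request here only to show what the parser builds.
  client.instance_variable_set(:@body, body)
  client.send(:parse)

  p client.rules     # => [["rubytest", ["/no-dir/", "/.+.php"], []]]
  p client.sitemaps  # => ["http://www.example.com/sitemap.xml"]
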
data/test/robotstxt_test.rb
ADDED
@@ -0,0 +1,19 @@

$:.unshift(File.dirname(__FILE__) + '/../lib')

require 'test/unit'
require 'robotstxt'

class TestRobotstxt < Test::Unit::TestCase

  def test_allowed
    assert true  == Robotstxt.allowed?('http://www.simonerinzivillo.it/', 'rubytest')
    assert false == Robotstxt.allowed?('http://www.simonerinzivillo.it/no-dir/', 'rubytest')
  end

  def test_sitemaps
    assert Robotstxt.sitemaps('http://www.simonerinzivillo.it/', 'rubytest').length > 0
  end

end

data/test/robotstxtistance_test.rb
ADDED
@@ -0,0 +1,43 @@

$:.unshift(File.dirname(__FILE__) + '/../lib')

require 'test/unit'
require 'robotstxt'

class TestRobotstxtistance < Test::Unit::TestCase

  def setup
    @client = Robotstxt::Robotstxtistance.new('rubytest')
    @client.get('http://www.simonerinzivillo.it')
  end

  def test_initialize
    client = Robotstxt::Robotstxtistance.new('*')
    assert_instance_of Robotstxt::Robotstxtistance, client
  end

  def test_get_file_robotstxt
    assert @client.get('http://www.simonerinzivillo.it')
  end

  def test_robotstxt_isfound
    assert @client.found?
  end

  def test_url_allowed
    assert true  == @client.allowed?('http://www.simonerinzivillo.it/')
    assert false == @client.allowed?('http://www.simonerinzivillo.it/no-dir/')
    assert false == @client.allowed?('http://www.simonerinzivillo.it/foo-no-dir/')
    assert false == @client.allowed?('http://www.simonerinzivillo.it/foo-no-dir/page.html')
    assert false == @client.allowed?('http://www.simonerinzivillo.it/dir/page.php')
    assert false == @client.allowed?('http://www.simonerinzivillo.it/page.php?var=0')
    assert false == @client.allowed?('http://www.simonerinzivillo.it/dir/page.php?var=0')
    assert true  == @client.allowed?('http://www.simonerinzivillo.it/blog/')
    assert true  == @client.allowed?('http://www.simonerinzivillo.it/blog/page.php')
    assert false == @client.allowed?('http://www.simonerinzivillo.it/blog/page.php?var=0')
  end

  def test_sitemaps
    assert @client.sitemaps.length > 0
  end

end

metadata
ADDED
@@ -0,0 +1,58 @@
--- !ruby/object:Gem::Specification
name: robotstxt
version: !ruby/object:Gem::Version
  version: 0.5.0
platform: ruby
authors:
- Simone Rinzivillo
autorequire:
bindir: bin
cert_chain: []

date: 2009-12-06 00:00:00 +01:00
default_executable:
dependencies: []

description:
email: srinzivillo@gmail.com
executables: []

extensions: []

extra_rdoc_files:
- README.rdoc
files:
- lib/robotstxt/robotstxtistance.rb
- lib/robotstxt.rb
- README.rdoc
has_rdoc: true
homepage: http://www.simonerinzivillo.it
licenses: []

post_install_message:
rdoc_options: []

require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: 1.8.7
  version:
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: "0"
  version:
requirements: []

rubyforge_project:
rubygems_version: 1.3.5
signing_key:
specification_version: 3
summary: Robotstxt is a Ruby robots.txt file parser
test_files:
- test/robotstxt_test.rb
- test/robotstxtistance_test.rb