rwspider 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Changelog.rdoc +5 -0
- data/LICENSE.rdoc +25 -0
- data/Manifest +13 -0
- data/README.rdoc +55 -0
- data/Rakefile +59 -0
- data/lib/rwspider/client.rb +235 -0
- data/lib/rwspider/document.rb +183 -0
- data/lib/rwspider/queue.rb +25 -0
- data/lib/rwspider/version.rb +29 -0
- data/lib/rwspider.rb +57 -0
- data/rwspider.gemspec +44 -0
- data/test/client_test.rb +34 -0
- data/test/document_test.rb +55 -0
- data/test/rwspider_test.rb +30 -0
- metadata +120 -0
data/Changelog.rdoc
ADDED
data/LICENSE.rdoc
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
= License
|
2
|
+
|
3
|
+
(The MIT License)
|
4
|
+
|
5
|
+
Copyright (c) 2009 Simone Rinzivillo <srinzivillo@gmail.com>
|
6
|
+
|
7
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
8
|
+
a copy of this software and associated documentation files (the
|
9
|
+
"Software"), to deal in the Software without restriction, including
|
10
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
11
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
12
|
+
permit persons to whom the Software is furnished to do so, subject to
|
13
|
+
the following conditions:
|
14
|
+
|
15
|
+
The above copyright notice and this permission notice shall be
|
16
|
+
included in all copies or substantial portions of the Software.
|
17
|
+
|
18
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
19
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
20
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
21
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
22
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
23
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
24
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
25
|
+
|
data/Manifest
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
Changelog.rdoc
|
2
|
+
LICENSE.rdoc
|
3
|
+
README.rdoc
|
4
|
+
Rakefile
|
5
|
+
lib/rwspider.rb
|
6
|
+
lib/rwspider/client.rb
|
7
|
+
lib/rwspider/document.rb
|
8
|
+
lib/rwspider/queue.rb
|
9
|
+
lib/rwspider/version.rb
|
10
|
+
test/client_test.rb
|
11
|
+
test/document_test.rb
|
12
|
+
test/rwspider_test.rb
|
13
|
+
Manifest
|
data/README.rdoc
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
= RW Spider
|
2
|
+
|
3
|
+
RW Spider is a multithreading spider client written in Ruby.
|
4
|
+
The library was designed to make easy the development of programs that spider the web.
|
5
|
+
|
6
|
+
RW Spider's design comes from the direct experience of developing another PHP library that is currently used as the engine for a freeware SEO tool.
|
7
|
+
|
8
|
+
== Features
|
9
|
+
|
10
|
+
* Multithreading spider
|
11
|
+
* Customizable options for the spider job
|
12
|
+
* Robots.txt support
|
13
|
+
* Indexing of web pages and others files (images, CSS, JavaScript, PDF and more)
|
14
|
+
* Following redirects
|
15
|
+
|
16
|
+
== Requirements
|
17
|
+
|
18
|
+
* Ruby >= 1.8.7
|
19
|
+
* Hpricot >= 0.8.2
|
20
|
+
* Robotstxt >= 0.5.2
|
21
|
+
|
22
|
+
|
23
|
+
|
24
|
+
== Installation
|
25
|
+
|
26
|
+
This library is intended to be installed via the
|
27
|
+
Gemcutter[http://gemcutter.org] system.
|
28
|
+
|
29
|
+
$ gem install rwspider
|
30
|
+
|
31
|
+
You might need administrator privileges on your system to install it.
|
32
|
+
|
33
|
+
|
34
|
+
|
35
|
+
== Author
|
36
|
+
|
37
|
+
Author:: {Simone Rinzivillo}[http://www.simonerinzivillo.it/] <srinzivillo@gmail.com>
|
38
|
+
|
39
|
+
|
40
|
+
== Resources
|
41
|
+
|
42
|
+
* {Homepage}[http://www.rwspider.com/]
|
43
|
+
* {Author}[http://www.simonerinzivillo.it/]
|
44
|
+
* {GitHub}[http://github.com/rinzi/rwspider/]
|
45
|
+
|
46
|
+
|
47
|
+
== Changelog
|
48
|
+
|
49
|
+
See the CHANGELOG.rdoc file for details.
|
50
|
+
|
51
|
+
|
52
|
+
== License
|
53
|
+
|
54
|
+
Copyright (c) 2009 Simone Rinzivillo, RW Spider is released under the MIT license.
|
55
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
$:.unshift(File.dirname(__FILE__) + "/lib")
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'rake'
|
5
|
+
require 'echoe'
|
6
|
+
require 'rwspider'
|
7
|
+
|
8
|
+
|
9
|
+
# Common package properties
|
10
|
+
PKG_NAME = 'rwspider'
|
11
|
+
PKG_VERSION = Rwspider::VERSION
|
12
|
+
RUBYFORGE_PROJECT = 'rwspider'
|
13
|
+
|
14
|
+
if ENV['SNAPSHOT'].to_i == 1
|
15
|
+
PKG_VERSION << "." << Time.now.utc.strftime("%Y%m%d%H%M%S")
|
16
|
+
end
|
17
|
+
|
18
|
+
|
19
|
+
Echoe.new(PKG_NAME, PKG_VERSION) do |p|
|
20
|
+
p.author = "Simone Rinzivillo"
|
21
|
+
p.email = "srinzivillo@gmail.com"
|
22
|
+
p.summary = "RW Spider is an multithreading spider client written in Ruby"
|
23
|
+
p.url = "http://www.rwspider.com"
|
24
|
+
p.project = RUBYFORGE_PROJECT
|
25
|
+
p.description = <<-EOD
|
26
|
+
RW Spider is an multithreading spider client written in Ruby designed to make easy \
|
27
|
+
the development of programs that spider the web.
|
28
|
+
EOD
|
29
|
+
|
30
|
+
p.need_zip = true
|
31
|
+
|
32
|
+
p.development_dependencies += ["rake ~>0.8",
|
33
|
+
"hpricot ~>0.8.2",
|
34
|
+
"robotstxt ~>0.5.2",
|
35
|
+
"echoe ~>3.1"]
|
36
|
+
|
37
|
+
p.rcov_options = ["-Itest -x mocha,rcov,Rakefile"]
|
38
|
+
end
|
39
|
+
|
40
|
+
|
41
|
+
desc "Open an irb session preloaded with this library"
|
42
|
+
task :console do
|
43
|
+
sh "irb -rubygems -I lib -r rwspider.rb"
|
44
|
+
end
|
45
|
+
|
46
|
+
begin
|
47
|
+
require 'code_statistics'
|
48
|
+
desc "Show library's code statistics"
|
49
|
+
task :stats do
|
50
|
+
CodeStatistics.new(["Rwspider", "lib"],
|
51
|
+
["Tests", "test"]).to_s
|
52
|
+
end
|
53
|
+
rescue LoadError
|
54
|
+
puts "CodeStatistics (Rails) is not available"
|
55
|
+
end
|
56
|
+
|
57
|
+
Dir["tasks/**/*.rake"].each do |file|
|
58
|
+
load(file)
|
59
|
+
end
|
@@ -0,0 +1,235 @@
|
|
1
|
+
#
|
2
|
+
# = Ruby RW Spider
|
3
|
+
#
|
4
|
+
# RW Spider is an multithreading spider client written in Ruby.
|
5
|
+
#
|
6
|
+
#
|
7
|
+
# Category:: Net
|
8
|
+
# Package:: RWSpider
|
9
|
+
# Author:: Simone Rinzivillo <srinzivillo@gmail.com>
|
10
|
+
# License:: MIT License
|
11
|
+
#
|
12
|
+
#--
|
13
|
+
#
|
14
|
+
#++
|
15
|
+
|
16
|
+
|
17
|
+
require 'net/http'
|
18
|
+
require 'openssl'
|
19
|
+
require 'uri'
|
20
|
+
require 'robotstxt'
|
21
|
+
require 'rwspider/document'
|
22
|
+
require 'rwspider/queue'
|
23
|
+
require 'rwspider/version'
|
24
|
+
require 'hpricot'
|
25
|
+
|
26
|
+
|
27
|
+
module Rwspider
|
28
|
+
class Client
|
29
|
+
|
30
|
+
# Hash of options for the spider job
|
31
|
+
attr_accessor :opts
|
32
|
+
|
33
|
+
|
34
|
+
# DEFAULT_OPTIONS properties
|
35
|
+
#
|
36
|
+
# useragent: The User Agent that RW Spider must apply in HTTP requests
|
37
|
+
#
|
38
|
+
# robot_name: The Robot name that RW Spider must apply in HTTP requests
|
39
|
+
#
|
40
|
+
# scan_documents_limit: The limit of the documents that RW Spider can download,
|
41
|
+
# set as <tt>nil</tt> for start the indexing job without restriction on the number of the download
|
42
|
+
#
|
43
|
+
# scan_domain_limit: Set to restrict the indexing job to the current domain name
|
44
|
+
#
|
45
|
+
# scan_images -Set as <tt>true</tt> to enable the download of the image files
|
46
|
+
#
|
47
|
+
# scan_other_files: Set as <tt>true</tt> to enable the download of the other files as javascript and css
|
48
|
+
#
|
49
|
+
# follow_robotstxt_directive: Set as <tt>true</tt> to enable the analysis of the Robots.txt rules to check the accessibility of URLs
|
50
|
+
#
|
51
|
+
# follow_HTTP_redirection: Set as <tt>true</tt> to follow the HTTP redirections
|
52
|
+
#
|
53
|
+
# timeout: The timeout of single URL analysis
|
54
|
+
|
55
|
+
DEFAULT_OPTIONS = {
|
56
|
+
:useragent => 'RW Spider/' + Rwspider::VERSION,
|
57
|
+
:robot_name => 'rwspider',
|
58
|
+
:scan_documents_limit => 100,
|
59
|
+
:scan_domain_limit => nil,
|
60
|
+
:scan_images => false,
|
61
|
+
:scan_other_files => false,
|
62
|
+
:follow_robotstxt_directive => true,
|
63
|
+
:follow_HTTP_redirection => true,
|
64
|
+
:timeout => 5
|
65
|
+
}
|
66
|
+
|
67
|
+
# Inizialize a new Rwspider::Client instance, accept an <tt>Hash</tt> of options.
|
68
|
+
# RW Spider apply the Rwspider::Client::DEFAULT_OPTIONS indexing options if you don't customize them
|
69
|
+
#
|
70
|
+
# opts = {
|
71
|
+
# :useragent => 'My user agent',
|
72
|
+
# :robot_name => 'my_spider_name',
|
73
|
+
# :scan_documents_limit => 100,
|
74
|
+
# :scan_domain_limit => nil,
|
75
|
+
# :scan_images => true,
|
76
|
+
# :scan_other_files => false,
|
77
|
+
# :follow_robotstxt_directive => true,
|
78
|
+
# :follow_HTTP_redirection => true,
|
79
|
+
# :timeout => 5
|
80
|
+
# }
|
81
|
+
# spider = Rwspider::Client::new(opts)
|
82
|
+
#
|
83
|
+
def initialize (options = {})
|
84
|
+
|
85
|
+
load_options options
|
86
|
+
@robotstxt_cache = Hash.new()
|
87
|
+
@main_hostname = ''
|
88
|
+
@scanned_documents = 0
|
89
|
+
@queue = Rwspider::Queue.new
|
90
|
+
|
91
|
+
end
|
92
|
+
|
93
|
+
# Start the crawling from the <tt>URL</tt>.
|
94
|
+
#
|
95
|
+
# Rwspider::Client::start yield an instance of Rwspider::Document Class for each page downloaded.
|
96
|
+
# At the end of execution returns an <tt>Array</tt> of Rwspider::Document instances.
|
97
|
+
#
|
98
|
+
# Rwspider::Client::start('http://www.rwspider.com') {do |d|
|
99
|
+
# puts 'Current URL ' + d.url.normalize.to_s
|
100
|
+
# }
|
101
|
+
#
|
102
|
+
# arr = Rwspider::Client::start('http://www.rwspider.com')
|
103
|
+
# arr.each{do |d|
|
104
|
+
# puts 'Current URL ' + d.url.normalize.to_s
|
105
|
+
# }
|
106
|
+
#
|
107
|
+
def start (start_url)
|
108
|
+
|
109
|
+
@queue << Rwspider::Document.new(start_url)
|
110
|
+
|
111
|
+
@queue.each do |link|
|
112
|
+
@main_url = link.url
|
113
|
+
if @opts[:scan_documents_limit].nil? || @scanned_documents < @opts[:scan_documents_limit]
|
114
|
+
set_as_visited link
|
115
|
+
@main_hostname = link.url.host.downcase if @main_hostname.length == 0
|
116
|
+
|
117
|
+
t = Thread.new(link) { |link|
|
118
|
+
begin
|
119
|
+
|
120
|
+
Timeout::timeout(@opts[:timeout]){
|
121
|
+
beginning = Time.now
|
122
|
+
response = get_uri(link.url)
|
123
|
+
link.download_time = Time.now - beginning
|
124
|
+
link.as_downloaded = true
|
125
|
+
link.http_response = response
|
126
|
+
|
127
|
+
yield link if block_given?
|
128
|
+
|
129
|
+
case response
|
130
|
+
when Net::HTTPSuccess then
|
131
|
+
|
132
|
+
if response.content_type == 'text/html' && (@opts[:scan_domain_limit].nil? || link.url.host.downcase.match(@opts[:scan_domain_limit]) )
|
133
|
+
|
134
|
+
link.get_links
|
135
|
+
link.get_images if @opts[:scan_images]
|
136
|
+
link.get_other_files if @opts[:scan_other_files]
|
137
|
+
|
138
|
+
link.documents.each do |doc|
|
139
|
+
add_to_queue doc
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
when Net::HTTPRedirection then
|
144
|
+
add_to_queue(Document.new(link.normalize_url(Document.new(response['location']).url))) if @opts[:follow_HTTP_redirection]
|
145
|
+
|
146
|
+
end
|
147
|
+
}
|
148
|
+
rescue StandardError => e
|
149
|
+
link.as_downloaded = false
|
150
|
+
yield link if block_given?
|
151
|
+
end
|
152
|
+
}
|
153
|
+
t.join
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
157
|
+
return @queue
|
158
|
+
|
159
|
+
end
|
160
|
+
|
161
|
+
|
162
|
+
|
163
|
+
private
|
164
|
+
|
165
|
+
def add_to_queue (document)
|
166
|
+
|
167
|
+
if follow?(document)
|
168
|
+
@queue.each do |link|
|
169
|
+
if link.url.normalize == document.url.normalize
|
170
|
+
document.as_visited = true
|
171
|
+
link.inbound_links << @main_url.normalize.to_s if !link.inbound_links.include?(@main_url.normalize.to_s)
|
172
|
+
break
|
173
|
+
end
|
174
|
+
end
|
175
|
+
|
176
|
+
document.inbound_links << @main_url.normalize.to_s
|
177
|
+
@queue << document if !document.as_visited
|
178
|
+
end
|
179
|
+
end
|
180
|
+
|
181
|
+
def load_options(opts)
|
182
|
+
@opts = DEFAULT_OPTIONS.merge opts
|
183
|
+
end
|
184
|
+
|
185
|
+
def get_uri(url)
  # Fetch +url+ over HTTP(S) with a single retry on transient network
  # errors.
  #
  # Returns the Net::HTTPResponse, or nil when the request failed on
  # both attempts.
  #
  # NOTE(review): SSL certificate verification is disabled for https
  # URLs (VERIFY_NONE); acceptable for crawling public pages, but worth
  # flagging.
  @ehttp = true
  begin
    http = Net::HTTP.new(url.host, url.port)
    if url.scheme == 'https'
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      http.use_ssl = true
    end

    http.request(Net::HTTP::Get.new(url.request_uri, 'User-Agent' => @opts[:useragent]))
  rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError => e
    if @ehttp
      # Bug fix: the original assigned "@ettp = false" (typo), which left
      # @ehttp permanently true and made `retry` loop forever whenever the
      # error persisted.
      @ehttp = false
      retry
    end
  end
end
|
205
|
+
|
206
|
+
|
207
|
+
|
208
|
+
# Mark +var+ (a Rwspider::Document) as already visited and bump the
# running count of documents processed so far.
def set_as_visited(var)
  @scanned_documents += 1
  var.as_visited = true
end
|
212
|
+
|
213
|
+
|
214
|
+
|
215
|
+
# Decide whether +document+ may be crawled.
#
# When :follow_robotstxt_directive is enabled, the robots.txt of the
# document's host is fetched (and memoised in @robotstxt_cache, keyed
# by host) and consulted; otherwise every document is allowed.
def follow?(document)
  return true unless @opts[:follow_robotstxt_directive]

  host = document.url.host
  if @robotstxt_cache.include?(host)
    parser = @robotstxt_cache[host]
  else
    parser = Robotstxt::Parser.new(@opts[:robot_name])
    parser.get(document.url.scheme + '://' + host)
    @robotstxt_cache[host] = parser
  end
  parser.allowed?(document.url.normalize.to_s)
end
|
230
|
+
|
231
|
+
|
232
|
+
|
233
|
+
end
|
234
|
+
|
235
|
+
end
|
@@ -0,0 +1,183 @@
|
|
1
|
+
#
|
2
|
+
# = Ruby RW Spider
|
3
|
+
#
|
4
|
+
# RW Spider is an multithreading spider client written in Ruby.
|
5
|
+
#
|
6
|
+
#
|
7
|
+
# Category:: Net
|
8
|
+
# Package:: RWSpider
|
9
|
+
# Author:: Simone Rinzivillo <srinzivillo@gmail.com>
|
10
|
+
# License:: MIT License
|
11
|
+
#
|
12
|
+
#--
|
13
|
+
#
|
14
|
+
#++
|
15
|
+
|
16
|
+
require 'uri/generic'
|
17
|
+
require 'hpricot'
|
18
|
+
|
19
|
+
module Rwspider
|
20
|
+
class Document
|
21
|
+
include URI
|
22
|
+
|
23
|
+
# instance of <tt>URI</tt>
|
24
|
+
attr_accessor :url
|
25
|
+
|
26
|
+
# Returns <tt>true</tt> if the Rwspider::Document::url was request
|
27
|
+
attr_accessor :as_visited
|
28
|
+
|
29
|
+
# An instance of Net::HTTPResponse that contains the response returned from the web server
|
30
|
+
attr_accessor :http_response
|
31
|
+
|
32
|
+
# An <tt>Array</tt> of Rwspider::Document found in the HTML code of the current Rwspider::Document
|
33
|
+
attr_accessor :documents
|
34
|
+
|
35
|
+
# The time spent to download the Rwspider::Document
|
36
|
+
attr_accessor :download_time
|
37
|
+
|
38
|
+
# Returns <tt>true</tt> if the Rwspider::Document::url was downloaded correctly
|
39
|
+
attr_accessor :as_downloaded
|
40
|
+
|
41
|
+
# An <tt>Array</tt> of <tt>String</tt> hat contains the URLs of the documents where was found an link at the current Rwspider::Document
|
42
|
+
attr_reader :inbound_links
|
43
|
+
|
44
|
+
|
45
|
+
# Inizialize a new Rwspider::Document instance with the <tt>url</tt>
|
46
|
+
#
|
47
|
+
# doc = Rwspider::Document::new('http://www.rwspider.com')
|
48
|
+
#
|
49
|
+
def initialize (url)
|
50
|
+
parse(url)
|
51
|
+
@tag_type = Array.new
|
52
|
+
@tag_type << ['a','href']
|
53
|
+
@tag_type << ['img','src']
|
54
|
+
@tag_type << ['link','href']
|
55
|
+
@inbound_links = []
|
56
|
+
@documents = []
|
57
|
+
@as_visited = false
|
58
|
+
end
|
59
|
+
|
60
|
+
# Rwspider::Document::parse load or replace the Rwspider::Document.url with the new <tt>url</tt>
|
61
|
+
#
|
62
|
+
# doc = Rwspider::Document::new('http://www.rwspider.com')
|
63
|
+
# doc.parse('http://www.rwspider.com/sitemap.html')
|
64
|
+
#
|
65
|
+
def parse (url)
|
66
|
+
begin
|
67
|
+
@url = URI.parse(url.gsub(/\\/,'/'))
|
68
|
+
rescue Exception => e
|
69
|
+
nil
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
# Analyze the HTML code of the current Rwspider::Document to extract the links at other documents.
|
74
|
+
#
|
75
|
+
# doc = Rwspider::Document::new('http://www.rwspider.com')
|
76
|
+
# http = Net::HTTP.new(doc.url.host, doc.url.port)
|
77
|
+
# doc.http_response = http.request(Net::HTTP::Get.new(doc.url.request_uri))
|
78
|
+
# arr = doc.get_links
|
79
|
+
#
|
80
|
+
# This method returns an <tt>Array</tt> of instances of Rwspider::Document
|
81
|
+
# and append the Array at the documents attribute.
|
82
|
+
# Returns <tt>nil</tt> if the <tt>content-type</tt> returned in the <tt>http_response</tt> attribute
|
83
|
+
# was different from 'text/html'.
|
84
|
+
#
|
85
|
+
def get_links()
|
86
|
+
get_document(@tag_type[0])
|
87
|
+
end
|
88
|
+
|
89
|
+
# Analyze the HTML code of the current Rwspider::Document to extract the links at images.
|
90
|
+
#
|
91
|
+
# doc = Rwspider::Document::new('http://www.rwspider.com')
|
92
|
+
# http = Net::HTTP.new(doc.url.host, doc.url.port)
|
93
|
+
# doc.http_response = http.request(Net::HTTP::Get.new(doc.url.request_uri))
|
94
|
+
# arr = doc.get_images
|
95
|
+
#
|
96
|
+
# This method returns an <tt>Array</tt> of instances of Rwspider::Document
|
97
|
+
# and append the Array at the documents attribute.
|
98
|
+
# Returns <tt>nil</tt> if the <tt>content-type</tt> returned in the <tt>http_response</tt> attribute
|
99
|
+
# was different from 'text/html'.
|
100
|
+
#
|
101
|
+
def get_images()
|
102
|
+
get_document(@tag_type[1])
|
103
|
+
end
|
104
|
+
|
105
|
+
# Analyze the HTML code of the current Rwspider::Document to extract the links at other files
|
106
|
+
# as javascript and css.
|
107
|
+
#
|
108
|
+
# doc = Rwspider::Document::new('http://www.rwspider.com')
|
109
|
+
# http = Net::HTTP.new(doc.url.host, doc.url.port)
|
110
|
+
# doc.http_response = http.request(Net::HTTP::Get.new(doc.url.request_uri))
|
111
|
+
# arr = doc.get_other_files
|
112
|
+
#
|
113
|
+
# This method returns an <tt>Array</tt> of instances of Rwspider::Document
|
114
|
+
# and append the Array at the documents attribute.
|
115
|
+
# Returns <tt>nil</tt> if the <tt>content-type</tt> returned in the <tt>http_response</tt> attribute
|
116
|
+
# was different from 'text/html'.
|
117
|
+
#
|
118
|
+
def get_other_files()
|
119
|
+
get_document(@tag_type[2])
|
120
|
+
end
|
121
|
+
|
122
|
+
# Normalize the url if the path is relative and returns an <tt>String</tt> with the absolute version.
|
123
|
+
#
|
124
|
+
# doc = Rwspider::Document::new('http://www.rwspider.com')
|
125
|
+
# doc.normalize_url(URI.parse('/sitemap.html'))
|
126
|
+
#
|
127
|
+
def normalize_url(var)
  # Convert +var+ (a URI instance) into an absolute URL <tt>String</tt>,
  # resolving relative paths against this document's own #url.
  #
  # Returns nil for mailto:/javascript: schemes (not crawlable).
  querystring = var.query.nil? ? '' : '?' + var.query
  if var.scheme.nil? || (var.scheme.downcase != "mailto" && var.scheme != "javascript")
    if var.relative?
      path = var.path
      # Bug fix: the original tested "if url.path.nil?" and then called
      # url.path.slice inside that branch — a guaranteed NoMethodError on a
      # nil path, and the base directory was never computed when a path was
      # present. Use the directory portion of the current document's path
      # as the base, falling back to '/' when there is no usable path
      # (nil or empty, where rindex('/') would return nil).
      if !url.path.nil? && url.path.include?('/')
        main_path = url.path.slice(0..url.path.rindex('/'))
      else
        main_path = '/'
      end

      if path.match('^\/')
        # Root-relative link: host + path.
        path = url.scheme + '://' + url.host + path + querystring
      else
        # Document-relative link: host + base directory + path.
        path = url.scheme + '://' + url.host + main_path + path + querystring
      end
    else
      # Already absolute: rebuild it with the query string appended.
      path = var.scheme + '://' + var.host + var.path + querystring
    end
  end

  path
end
|
150
|
+
|
151
|
+
private
|
152
|
+
|
153
|
+
def get_document(tag)
  # Extract linked documents from the HTML body of this document.
  #
  # +tag+ is an [element_name, attribute_name] pair, e.g. ['a', 'href']
  # or ['img', 'src'].
  #
  # Returns nil unless an HTTP response is present and its content type
  # is 'text/html'; otherwise returns the Array of new
  # Rwspider::Document instances, which is also appended to #documents.
  return unless !@http_response.nil? && @http_response.content_type == 'text/html'

  sourcecode = Hpricot(@http_response.body)
  lnks = sourcecode.search("//" + tag[0])
  docs = []
  lnks.each { |link|
    # Bug fix: the attribute may be absent (e.g. an <a> tag with no
    # href); the original called .strip on nil and raised NoMethodError.
    raw = link.attributes[tag[1]]
    next if raw.nil?

    doc = Document.new(raw.strip)

    if !doc.nil? && !doc.url.nil?
      path = normalize_url(doc.url)

      if !path.nil?
        doc.parse path
        docs << doc
      end
    end
  }
  @documents = @documents + docs
  docs
end
|
178
|
+
|
179
|
+
|
180
|
+
|
181
|
+
|
182
|
+
end
|
183
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#
|
2
|
+
# = Ruby RW Spider
|
3
|
+
#
|
4
|
+
# RW Spider is an multithreading spider client written in Ruby.
|
5
|
+
#
|
6
|
+
#
|
7
|
+
# Category:: Net
|
8
|
+
# Package:: RWSpider
|
9
|
+
# Author:: Simone Rinzivillo <srinzivillo@gmail.com>
|
10
|
+
# License:: MIT License
|
11
|
+
#
|
12
|
+
#--
|
13
|
+
#
|
14
|
+
#++
|
15
|
+
|
16
|
+
module Rwspider

  # Queue is the crawl frontier used by Rwspider::Client: a plain Array
  # of Rwspider::Document instances that the client appends to and
  # iterates over while spidering.
  #
  # The original defined an empty #initialize override; it was removed
  # because it added nothing over Array's default behaviour.
  class Queue < Array
  end

end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
#
|
2
|
+
# = Ruby RW Spider
|
3
|
+
#
|
4
|
+
# RW Spider is an multithreading spider client written in Ruby.
|
5
|
+
#
|
6
|
+
#
|
7
|
+
# Category:: Net
|
8
|
+
# Package:: RWSpider
|
9
|
+
# Author:: Simone Rinzivillo <srinzivillo@gmail.com>
|
10
|
+
# License:: MIT License
|
11
|
+
#
|
12
|
+
#--
|
13
|
+
#
|
14
|
+
#++
|
15
|
+
|
16
|
+
module Rwspider

  # Version information, exposed both as individual numeric parts and
  # as the assembled dotted string (Rwspider::VERSION).
  module Version
    MAJOR = 0
    MINOR = 4
    TINY  = 2
    ALPHA = nil

    # nil components (ALPHA here) are dropped before joining, so a
    # release version reads "0.4.2" rather than "0.4.2.".
    STRING = [MAJOR, MINOR, TINY, ALPHA].compact.join('.')
  end

  # Convenience alias used throughout the library (e.g. the default
  # User-Agent string in Rwspider::Client).
  VERSION = Version::STRING

end
|
data/lib/rwspider.rb
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
#
|
2
|
+
# = Ruby RW Spider
|
3
|
+
#
|
4
|
+
# RW Spider is an multithreading spider client written in Ruby.
|
5
|
+
#
|
6
|
+
#
|
7
|
+
# Category:: Net
|
8
|
+
# Package:: RWSpider
|
9
|
+
# Author:: Simone Rinzivillo <srinzivillo@gmail.com>
|
10
|
+
# License:: MIT License
|
11
|
+
#
|
12
|
+
#--
|
13
|
+
#
|
14
|
+
#++
|
15
|
+
|
16
|
+
|
17
|
+
require 'rwspider/client'
|
18
|
+
require 'uri'
|
19
|
+
|
20
|
+
|
21
|
+
|
22
|
+
module Rwspider
|
23
|
+
|
24
|
+
NAME = 'Rwspider'
|
25
|
+
GEM = 'rwspider'
|
26
|
+
AUTHORS = ['Simone Rinzivillo <srinzivillo@gmail.com>']
|
27
|
+
|
28
|
+
# Start the crawling from the <tt>URL</tt> with the personalized <tt>options</tt>.
|
29
|
+
# RW Spider apply the Rwspider::Client::DEFAULT_OPTIONS indexing options if you don't customize them
|
30
|
+
# Rwspider::start yield an instance of Rwspider::Document Class for each page downloaded.
|
31
|
+
#
|
32
|
+
# opts = {
|
33
|
+
# :useragent => 'My user agent',
|
34
|
+
# :robot_name => 'my_spider_name',
|
35
|
+
# :scan_documents_limit => 100,
|
36
|
+
# :scan_domain_limit => nil,
|
37
|
+
# :scan_images => true,
|
38
|
+
# :scan_other_files => false,
|
39
|
+
# :follow_robotstxt_directive => true,
|
40
|
+
# :follow_HTTP_redirection => true,
|
41
|
+
# :timeout => 5
|
42
|
+
# }
|
43
|
+
# Rwspider.start('http://www.rwspider.com', opts) {do |d|
|
44
|
+
# puts 'Current URL ' + d.url.normalize.to_s
|
45
|
+
# }
|
46
|
+
#
|
47
|
+
|
48
|
+
def self.start(url, options = {})
|
49
|
+
|
50
|
+
@client = Rwspider::Client.new(options)
|
51
|
+
@client.start(url)do |doc|
|
52
|
+
yield doc if block_given?
|
53
|
+
end
|
54
|
+
|
55
|
+
end
|
56
|
+
|
57
|
+
end
|
data/rwspider.gemspec
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
Gem::Specification.new do |s|
|
4
|
+
s.name = %q{rwspider}
|
5
|
+
s.version = "0.4.2"
|
6
|
+
|
7
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
|
+
s.authors = ["Simone Rinzivillo"]
|
9
|
+
s.date = %q{2010-02-06}
|
10
|
+
s.description = %q{ RW Spider is an multithreading spider client written in Ruby designed to make easy the development of programs that spider the web.
|
11
|
+
}
|
12
|
+
s.email = %q{srinzivillo@gmail.com}
|
13
|
+
s.extra_rdoc_files = ["LICENSE.rdoc", "README.rdoc", "lib/rwspider.rb", "lib/rwspider/client.rb", "lib/rwspider/document.rb", "lib/rwspider/queue.rb", "lib/rwspider/version.rb"]
|
14
|
+
s.files = ["Changelog.rdoc", "LICENSE.rdoc", "README.rdoc", "Rakefile", "lib/rwspider.rb", "lib/rwspider/client.rb", "lib/rwspider/document.rb", "lib/rwspider/queue.rb", "lib/rwspider/version.rb", "test/client_test.rb", "test/document_test.rb", "test/rwspider_test.rb", "Manifest", "rwspider.gemspec"]
|
15
|
+
s.homepage = %q{http://www.rwspider.com}
|
16
|
+
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Rwspider", "--main", "README.rdoc"]
|
17
|
+
s.require_paths = ["lib"]
|
18
|
+
s.rubyforge_project = %q{rwspider}
|
19
|
+
s.rubygems_version = %q{1.3.5}
|
20
|
+
s.summary = %q{RW Spider is an multithreading spider client written in Ruby}
|
21
|
+
s.test_files = ["test/client_test.rb", "test/document_test.rb", "test/rwspider_test.rb"]
|
22
|
+
|
23
|
+
if s.respond_to? :specification_version then
|
24
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
25
|
+
s.specification_version = 3
|
26
|
+
|
27
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
28
|
+
s.add_development_dependency(%q<rake>, ["~> 0.8"])
|
29
|
+
s.add_development_dependency(%q<hpricot>, ["~> 0.8.2"])
|
30
|
+
s.add_development_dependency(%q<robotstxt>, ["~> 0.5.2"])
|
31
|
+
s.add_development_dependency(%q<echoe>, ["~> 3.1"])
|
32
|
+
else
|
33
|
+
s.add_dependency(%q<rake>, ["~> 0.8"])
|
34
|
+
s.add_dependency(%q<hpricot>, ["~> 0.8.2"])
|
35
|
+
s.add_dependency(%q<robotstxt>, ["~> 0.5.2"])
|
36
|
+
s.add_dependency(%q<echoe>, ["~> 3.1"])
|
37
|
+
end
|
38
|
+
else
|
39
|
+
s.add_dependency(%q<rake>, ["~> 0.8"])
|
40
|
+
s.add_dependency(%q<hpricot>, ["~> 0.8.2"])
|
41
|
+
s.add_dependency(%q<robotstxt>, ["~> 0.5.2"])
|
42
|
+
s.add_dependency(%q<echoe>, ["~> 3.1"])
|
43
|
+
end
|
44
|
+
end
|
data/test/client_test.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
$:.unshift(File.dirname(__FILE__) + '/../lib')
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'rwspider'
|
5
|
+
|
6
|
+
class TestClient < Test::Unit::TestCase
|
7
|
+
|
8
|
+
def setup
|
9
|
+
opts = {
|
10
|
+
:useragent => 'My user agent',
|
11
|
+
:robot_name => 'my_spider_name',
|
12
|
+
:scan_documents_limit => 10,
|
13
|
+
:scan_domain_limit => nil,
|
14
|
+
:scan_images => true,
|
15
|
+
:scan_other_files => false,
|
16
|
+
:follow_robotstxt_directive => true,
|
17
|
+
:follow_HTTP_redirection => true,
|
18
|
+
:timeout => 5
|
19
|
+
}
|
20
|
+
@client = Rwspider::Client.new(opts)
|
21
|
+
end
|
22
|
+
|
23
|
+
def test_initialize
|
24
|
+
client = Rwspider::Client.new
|
25
|
+
assert_instance_of Rwspider::Client, client
|
26
|
+
end
|
27
|
+
|
28
|
+
def test_start
|
29
|
+
r = @client.start('http://www.rwspider.com')
|
30
|
+
assert_equal('http://www.rwspider.com/', r[0].url.normalize.to_s)
|
31
|
+
assert_instance_of Rwspider::Queue, r
|
32
|
+
end
|
33
|
+
|
34
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
$:.unshift(File.dirname(__FILE__) + '/../lib')
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'rwspider'
|
5
|
+
|
6
|
+
class TestDocument < Test::Unit::TestCase
|
7
|
+
|
8
|
+
def setup
|
9
|
+
opts = {
|
10
|
+
:scan_documents_limit => 1,
|
11
|
+
:scan_domain_limit => 'www.rwspider.com',
|
12
|
+
:timeout => 10
|
13
|
+
}
|
14
|
+
client = Rwspider::Client.new(opts)
|
15
|
+
@doc = client.start('http://www.rwspider.com')[0]
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_initialize
|
19
|
+
d = Rwspider::Document.new('http://www.rwspider.com')
|
20
|
+
assert_instance_of Rwspider::Document, d
|
21
|
+
end
|
22
|
+
|
23
|
+
def test_normalize_relative_url
|
24
|
+
doc = Rwspider::Document::new('http://www.rwspider.com')
|
25
|
+
assert_equal('http://www.rwspider.com/sitemap.html', doc.normalize_url(URI.parse('/sitemap.html')))
|
26
|
+
end
|
27
|
+
|
28
|
+
def test_normalize_absolute_url
|
29
|
+
doc = Rwspider::Document::new('http://www.rwspider.com')
|
30
|
+
assert_equal('http://www.rwspider.com/sitemap.html', doc.normalize_url(URI.parse('http://www.rwspider.com/sitemap.html')))
|
31
|
+
end
|
32
|
+
|
33
|
+
def test_parse
|
34
|
+
doc = Rwspider::Document::new('http://www.rwspider.com')
|
35
|
+
doc.parse('http://www.rwspider.com/sitemap.html')
|
36
|
+
assert_equal('http://www.rwspider.com/sitemap.html', doc.url.normalize.to_s)
|
37
|
+
end
|
38
|
+
|
39
|
+
def test_get_links
|
40
|
+
arr = @doc.get_links
|
41
|
+
assert_instance_of Array, arr
|
42
|
+
end
|
43
|
+
|
44
|
+
def test_get_images
|
45
|
+
arr = @doc.get_images
|
46
|
+
assert_instance_of Array, arr
|
47
|
+
end
|
48
|
+
|
49
|
+
def test_get_other_files
|
50
|
+
arr = @doc.get_other_files
|
51
|
+
assert_instance_of Array, arr
|
52
|
+
end
|
53
|
+
|
54
|
+
|
55
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
$:.unshift(File.dirname(__FILE__) + '/../lib')
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'rwspider'
|
5
|
+
|
6
|
+
class TestRwspider < Test::Unit::TestCase
|
7
|
+
|
8
|
+
|
9
|
+
def test_start
|
10
|
+
opts = {
|
11
|
+
:useragent => 'My user agent',
|
12
|
+
:robot_name => 'my_spider_name',
|
13
|
+
:scan_documents_limit => 100,
|
14
|
+
:scan_domain_limit => nil,
|
15
|
+
:scan_images => true,
|
16
|
+
:scan_other_files => false,
|
17
|
+
:follow_robotstxt_directive => true,
|
18
|
+
:follow_HTTP_redirection => true,
|
19
|
+
:timeout => 5
|
20
|
+
}
|
21
|
+
r = Rwspider.start('http://www.rwspider.com', opts)
|
22
|
+
assert_equal('http://www.rwspider.com/', r[0].url.normalize.to_s)
|
23
|
+
end
|
24
|
+
|
25
|
+
def test_start_without_options
|
26
|
+
r = Rwspider.start('http://www.rwspider.com')
|
27
|
+
assert_equal('http://www.rwspider.com/', r[0].url.normalize.to_s)
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
metadata
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rwspider
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.4.2
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Simone Rinzivillo
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-02-06 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: rake
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ~>
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0.8"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: hpricot
|
27
|
+
type: :development
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.8.2
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: robotstxt
|
37
|
+
type: :development
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ~>
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 0.5.2
|
44
|
+
version:
|
45
|
+
- !ruby/object:Gem::Dependency
|
46
|
+
name: echoe
|
47
|
+
type: :development
|
48
|
+
version_requirement:
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ~>
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: "3.1"
|
54
|
+
version:
|
55
|
+
description: " RW Spider is an multithreading spider client written in Ruby designed to make easy \t\tthe development of programs that spider the web.\n"
|
56
|
+
email: srinzivillo@gmail.com
|
57
|
+
executables: []
|
58
|
+
|
59
|
+
extensions: []
|
60
|
+
|
61
|
+
extra_rdoc_files:
|
62
|
+
- LICENSE.rdoc
|
63
|
+
- README.rdoc
|
64
|
+
- lib/rwspider.rb
|
65
|
+
- lib/rwspider/client.rb
|
66
|
+
- lib/rwspider/document.rb
|
67
|
+
- lib/rwspider/queue.rb
|
68
|
+
- lib/rwspider/version.rb
|
69
|
+
files:
|
70
|
+
- Changelog.rdoc
|
71
|
+
- LICENSE.rdoc
|
72
|
+
- README.rdoc
|
73
|
+
- Rakefile
|
74
|
+
- lib/rwspider.rb
|
75
|
+
- lib/rwspider/client.rb
|
76
|
+
- lib/rwspider/document.rb
|
77
|
+
- lib/rwspider/queue.rb
|
78
|
+
- lib/rwspider/version.rb
|
79
|
+
- test/client_test.rb
|
80
|
+
- test/document_test.rb
|
81
|
+
- test/rwspider_test.rb
|
82
|
+
- Manifest
|
83
|
+
- rwspider.gemspec
|
84
|
+
has_rdoc: true
|
85
|
+
homepage: http://www.rwspider.com
|
86
|
+
licenses: []
|
87
|
+
|
88
|
+
post_install_message:
|
89
|
+
rdoc_options:
|
90
|
+
- --line-numbers
|
91
|
+
- --inline-source
|
92
|
+
- --title
|
93
|
+
- Rwspider
|
94
|
+
- --main
|
95
|
+
- README.rdoc
|
96
|
+
require_paths:
|
97
|
+
- lib
|
98
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
99
|
+
requirements:
|
100
|
+
- - ">="
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: "0"
|
103
|
+
version:
|
104
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
105
|
+
requirements:
|
106
|
+
- - ">="
|
107
|
+
- !ruby/object:Gem::Version
|
108
|
+
version: "1.2"
|
109
|
+
version:
|
110
|
+
requirements: []
|
111
|
+
|
112
|
+
rubyforge_project: rwspider
|
113
|
+
rubygems_version: 1.3.5
|
114
|
+
signing_key:
|
115
|
+
specification_version: 3
|
116
|
+
summary: RW Spider is an multithreading spider client written in Ruby
|
117
|
+
test_files:
|
118
|
+
- test/client_test.rb
|
119
|
+
- test/document_test.rb
|
120
|
+
- test/rwspider_test.rb
|