rcrawl 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (7)
  1. data/MIT-LICENSE +20 -0
  2. data/README +22 -0
  3. data/Rakefile +39 -0
  4. data/TODO +1 -0
  5. data/lib/rcrawl.rb +177 -0
  6. data/lib/robot_rules.rb +81 -0
  7. metadata +61 -0
data/MIT-LICENSE ADDED
@@ -0,0 +1,20 @@
+ Copyright (c) 2006 Shawn Hansen
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,22 @@
+ Rcrawl is a web crawler written entirely in Ruby.
+ It is currently limited to staying on the domain of the original URL it is given.
+ I decided to roll my own crawler in Ruby after finding only snippets of
+ crawler code for Ruby on various web sites and newsgroups.
+
+ The structure of the crawling process was inspired by the specs of the Mercator crawler (http://www.cindoc.csic.es/cybermetrics/pdf/68.pdf).
+
+ == Examples
+ bot = Rcrawl.new(url) # Instantiates a new Rcrawl object
+
+ bot.crawl # Crawls the website
+
+ == After the bot is done crawling
+ bot.visited_links # Returns an array of visited links
+
+ bot.dump # Returns a hash where the key is a URL and the value is
+ # the raw HTML from that URL
+
+ bot.errors # Returns a hash where the key is a URL and the value is
+ # the error message recorded for that URL
+
+ bot.external_links # Returns an array of external links
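Taken together, the calls documented in the README amount to a short end-to-end script along the following lines. This is a minimal sketch; the URL is a placeholder, and the trailing comments simply restate the accessors listed above.

  require 'rubygems'
  require 'rcrawl'

  bot = Rcrawl.new("http://www.example.com/") # placeholder URL; the crawl stays on this domain
  bot.crawl

  bot.visited_links   # => array of URLs that were fetched
  bot.external_links  # => array of off-domain URLs that were recorded but not followed
  bot.dump            # => { url => raw_html } for each crawled page
  bot.errors          # => { url => error_message } for pages that failed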
data/Rakefile ADDED
@@ -0,0 +1,39 @@
+ require 'rubygems'
+ Gem::manage_gems
+ require 'rake'
+ require 'rake/rdoctask'
+ require 'rake/gempackagetask'
+
+
+
+ desc "Generate documentation"
+ Rake::RDocTask.new(:rdoc) do |rdoc|
+   rdoc.rdoc_dir = "rdoc"
+   rdoc.title = "Crawler"
+   rdoc.options << "--line-numbers"
+   rdoc.options << "--inline-source"
+   rdoc.rdoc_files.include("README")
+   rdoc.rdoc_files.include("lib/**/*.rb")
+ end
+
+ spec = Gem::Specification.new do |s|
+   s.name = "rcrawl"
+   s.version = "0.2.5"
+   s.author = "Shawn Hansen"
+   s.email = "shawn.hansen@gmail.com"
+   s.homepage = "http://blog.denomi.net"
+   s.platform = Gem::Platform::RUBY
+   s.summary = "A web crawler written in ruby"
+   s.files = FileList["{test,lib}/**/*", "README", "MIT-LICENSE", "Rakefile", "TODO"].to_a
+   s.require_path = "lib"
+   s.autorequire = "rcrawl.rb"
+   s.has_rdoc = true
+   s.extra_rdoc_files = ["README", "MIT-LICENSE", "TODO"]
+   s.add_dependency("scrapi", ">=1.2.0")
+   s.rubyforge_project = "rcrawl"
+ end
+
+ gem = Rake::GemPackageTask.new(spec) do |pkg|
+   pkg.need_tar = true
+   pkg.need_zip = true
+ end
data/TODO ADDED
@@ -0,0 +1 @@
+ Lots! TODO will be updated soon.
data/lib/rcrawl.rb ADDED
@@ -0,0 +1,177 @@
+ #!/usr/bin/env ruby
+ # rcrawl/0.2.5
+
+ require 'rubygems'
+ require 'open-uri'
+ require 'scrapi'
+ require 'robot_rules'
+
+ # Rcrawl will retrieve an entire website, one page at a time,
+ # parsing each page with whatever processing modules you pass to it.
+ class Rcrawl
+
+   # Initializes various variables when a new Rcrawl object is instantiated
+   def initialize(site)
+     @links_to_visit = Array.new
+     @visited_links = Array.new
+     @external_links = Array.new
+     @raw_html = Hash.new
+     @rules = RobotRules.new("Rcrawl")
+     @sites = Hash.new
+     @site = URI.parse(site)
+     @links_to_visit << site
+     @errors = Hash.new
+     puts "Site is #{site}"
+   end
+
+   # Coordinates the whole crawling process
+   def crawl
+     until @links_to_visit.empty? do
+       begin
+         # Get link
+         url_server
+         next unless robot_safe? @url
+         # Parse robots.txt, then download document if robot_safe
+         fetch_http(@url)
+         # Store raw HTML in variable to read/reread as needed
+         # Then call any processing modules you need for the current document
+         ris(@document)
+       rescue
+         puts ""
+         puts "I died on #{@url}"
+         $stderr.puts $!
+         @errors[@url] = $!
+         next
+       ensure
+         # Stuff you want to make sure gets printed out
+         puts " done!"
+       end
+     end
+
+     puts "Visited #{@visited_links.size} links."
+   end
+
+   # Authoritative list of URLs to be processed by Rcrawl
+   def url_server
+     unless @links_to_visit.empty?
+       @url = @links_to_visit.pop
+     end
+   end
+
+   # Download the document
+   def fetch_http(url)
+     # Make sure robots.txt has been parsed for this site first,
+     # if not, parse robots.txt then grab document.
+     uri = URI.parse(url)
+     print "Visiting: #{url}"
+     @document = uri.read
+     @visited_links << url
+   end
+
+   # Rewind Input Stream, for storing and reading of raw HTML
+   def ris(document)
+     # Store raw HTML into local variable
+     # Based on MIME type, invoke the proper processing modules
+     if document.content_type == "text/html"
+       print "."
+       link_extractor(document) # If HTML
+       process_html(document) # If HTML
+     else
+       print "... not HTML, skipping..."
+     end
+   end
+
+   # HTML processing module for extracting links
+   def link_extractor(document)
+     print "."
+
+     # Parse all links from HTML into an array
+     # Set up the scrAPI (http://labnotes.org)
+     links = Scraper.define do
+       array :urls
+       process "a[href]", :urls => "@href"
+       result :urls
+     end
+
+     urls = links.scrape(document)
+
+     urls.each { |url|
+       uri = URI.parse(url)
+
+       # Make relative links absolute if necessary
+       if uri.relative?
+         url = @site.merge(url).to_s
+         uri = URI.parse(url)
+       end
+
+       # Check the domain; keep the link if it is on the same host, otherwise record it as external
+       if uri.host != @site.host
+         @external_links << url
+         @external_links.uniq!
+         next
+       end
+
+       # Find out if we've seen this link already
+       if (@visited_links.include? url) || (@links_to_visit.include? url)
+         next
+       end
+
+       @links_to_visit << url
+     }
+
+   end
+
+   # HTML processing module for raw HTML storage
+   def process_html(document)
+     # Add link and raw HTML to a hash as key/value
+     # for later storage in database
+     unless @raw_html.has_value?(document)
+       print "."
+       @raw_html[document.base_uri] = document
+     end
+   end
+
+   # robots.txt parsing
+   def robot_safe?(url)
+     uri = URI.parse(url)
+     location = "#{uri.host}:#{uri.port}"
+
+     return true unless %w{http https}.include?(uri.scheme)
+
+     unless @sites.include? location
+       @sites[location] = true
+
+       robot_url = "http://#{location}/robots.txt"
+       begin
+         robot_file = open(robot_url) { |page| page.read }
+       rescue
+         return true
+       end
+       @rules.parse(robot_url, robot_file)
+     end
+
+     @rules.allowed? url
+   end
+
+   # Returns array of links visited during crawl
+   def visited_links
+     return @visited_links
+   end
+
+   # Returns array of external links
+   def external_links
+     return @external_links
+   end
+
+   # Returns a hash where {key => URL, value => HTML} from all pages crawled
+   def dump
+     return @raw_html
+   end
+
+   # Returns a hash where {key => URL, value => "Error message"} from any
+   # errors encountered during the crawl
+   def errors
+     return @errors
+   end
+
+ end
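The comment in crawl about calling "any processing modules you need" suggests adding per-page processing by overriding the hooks above. A hypothetical sketch of that idea, reusing the same scrAPI extractor pattern as link_extractor; the ImageCrawler class, the img[src] scraper, and the URL are illustrative assumptions, not part of the gem:

  require 'rubygems'
  require 'rcrawl'
  require 'scrapi'

  class ImageCrawler < Rcrawl
    def initialize(site)
      super
      @image_urls = Hash.new
    end

    attr_reader :image_urls

    # ris() calls process_html for every text/html document it downloads.
    def process_html(document)
      super # keep the built-in raw-HTML store
      images = Scraper.define do
        array :srcs
        process "img[src]", :srcs => "@src"
        result :srcs
      end
      @image_urls[document.base_uri] = images.scrape(document) || []
    end
  end

  bot = ImageCrawler.new("http://www.example.com/") # placeholder URL
  bot.crawl
  bot.image_urls.each { |url, srcs| puts "#{url}: #{srcs.size} images" }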
data/lib/robot_rules.rb ADDED
@@ -0,0 +1,81 @@
+ #!/usr/bin/env ruby
+
+ # robot_rules.rb
+ #
+ # Created by James Edward Gray II on 2006-01-31.
+ # Copyright 2006 Gray Productions. All rights reserved.
+ # Included with rcrawl by permission from James Edward Gray II
+
+ require "uri"
+
+ # Based on Perl's WWW::RobotRules module, by Gisle Aas.
+ class RobotRules
+   def initialize( user_agent )
+     @user_agent = user_agent.scan(/\S+/).first.sub(%r{/.*},
+                                                    "").downcase
+     @rules = Hash.new { |rules, rule| rules[rule] = Array.new }
+   end
+
+   def parse( text_uri, robots_data )
+     uri = URI.parse(text_uri)
+     location = "#{uri.host}:#{uri.port}"
+     @rules.delete(location)
+
+     rules = robots_data.split(/[\015\012]+/).
+                         map { |rule| rule.sub(/\s*#.*$/, "") }
+     anon_rules = Array.new
+     my_rules = Array.new
+     current = anon_rules
+     rules.each do |rule|
+       case rule
+       when /^\s*User-Agent\s*:\s*(.+?)\s*$/i
+         break unless my_rules.empty?
+
+         current = if $1 == "*"
+                     anon_rules
+                   elsif $1.downcase.index(@user_agent)
+                     my_rules
+                   else
+                     nil
+                   end
+       when /^\s*Disallow\s*:\s*(.*?)\s*$/i
+         next if current.nil?
+
+         if $1.empty?
+           current << nil
+         else
+           disallow = URI.parse($1)
+
+           next unless disallow.scheme.nil? or disallow.scheme ==
+                       uri.scheme
+           next unless disallow.port.nil? or disallow.port == uri.port
+           next unless disallow.host.nil? or
+                       disallow.host.downcase == uri.host.downcase
+
+           disallow = disallow.path
+           disallow = "/" if disallow.empty?
+           disallow = "/#{disallow}" unless disallow[0] == ?/
+
+           current << disallow
+         end
+       end
+     end
+
+     @rules[location] = if my_rules.empty?
+                          anon_rules.compact
+                        else
+                          my_rules.compact
+                        end
+   end
+
+   def allowed?( text_uri )
+     uri = URI.parse(text_uri)
+     location = "#{uri.host}:#{uri.port}"
+     path = uri.path
+
+     return true unless %w{http https}.include?(uri.scheme)
+
+     not @rules[location].any? { |rule| path.index(rule) == 0 }
+   end
+ end
+
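For illustration, RobotRules can also be exercised on its own. A minimal sketch, using parse and allowed? as defined above; the robots.txt body is made up and www.example.com is a placeholder host:

  require 'robot_rules'

  rules = RobotRules.new("Rcrawl")
  robots_txt = <<-ROBOTS
  User-Agent: *
  Disallow: /private
  ROBOTS

  rules.parse("http://www.example.com/robots.txt", robots_txt)
  rules.allowed?("http://www.example.com/index.html")     # => true
  rules.allowed?("http://www.example.com/private/a.html") # => false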
metadata ADDED
@@ -0,0 +1,61 @@
+ --- !ruby/object:Gem::Specification
+ rubygems_version: 0.9.0
+ specification_version: 1
+ name: rcrawl
+ version: !ruby/object:Gem::Version
+   version: 0.2.5
+ date: 2006-09-20 00:00:00 -05:00
+ summary: A web crawler written in ruby
+ require_paths:
+ - lib
+ email: shawn.hansen@gmail.com
+ homepage: http://blog.denomi.net
+ rubyforge_project: rcrawl
+ description:
+ autorequire: rcrawl.rb
+ default_executable:
+ bindir: bin
+ has_rdoc: true
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
+   requirements:
+   - - ">"
+     - !ruby/object:Gem::Version
+       version: 0.0.0
+   version:
+ platform: ruby
+ signing_key:
+ cert_chain:
+ post_install_message:
+ authors:
+ - Shawn Hansen
+ files:
+ - lib/rcrawl.rb
+ - lib/robot_rules.rb
+ - README
+ - MIT-LICENSE
+ - Rakefile
+ - TODO
+ test_files: []
+
+ rdoc_options: []
+
+ extra_rdoc_files:
+ - README
+ - MIT-LICENSE
+ - TODO
+ executables: []
+
+ extensions: []
+
+ requirements: []
+
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: scrapi
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Version::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.2.0
+     version: