rcrawl 0.3.5 → 0.4.5
- data/README +2 -2
- data/Rakefile +1 -1
- data/lib/rcrawl/crawler.rb +153 -2
- data/lib/rcrawl/version.rb +5 -0
- data/lib/rcrawl.rb +2 -124
- metadata +4 -5
- data/lib/rcrawl/process/html.rb +0 -61
data/README
CHANGED
@@ -32,7 +32,7 @@ The structure of the crawling process was inspired by the specs of the Mercator

 # Returns a hash where the key is a url and the value is
 # the raw html from that url
-crawler.
+crawler.raw_html


 # Returns a hash where the key is a URL and the value is
@@ -48,4 +48,4 @@ Copyright © 2006 Digital Duckies, LLC, under MIT License

 Developed for http://digitalduckies.net

-News, code, and documentation at http://digitalduckies.net
+News, code, and documentation at http://blog.digitalduckies.net
data/Rakefile
CHANGED
data/lib/rcrawl/crawler.rb
CHANGED
@@ -1,4 +1,155 @@
 module Rcrawl
+
   class Crawler
-
-
+
+    attr_accessor :links_to_visit, :site
+    attr_reader :visited_links, :external_links, :raw_html, :rules, :sites,
+                :errors
+    # Initializes various variables when a new Crawler object is instantiated
+    def initialize(site)
+      puts "Rcrawl Version #{VERSION} initializing..."
+      @links_to_visit = Array.new
+      @visited_links = Array.new
+      @external_links = Array.new
+      @raw_html = Hash.new
+      @rules = RobotRules.new("rcrawl/#{VERSION}")
+      @sites = Hash.new
+      @errors = Hash.new
+      @site = URI.parse(site) || raise("You didn't give me a site to crawl")
+      @links_to_visit << site
+      puts "Ready to crawl #{site}"
+    end
+
+    # Coordinates the whole crawling process
+    def crawl
+      until @links_to_visit.empty? do
+        begin
+          # Get link
+          url_server
+          next unless robot_safe? @url
+          # Parse robots.txt, then download document if robot_safe
+          fetch_http(@url)
+          # Store raw HTML in variable to read/reread as needed
+          # Then call any processing modules you need for the current document
+          ris(@document)
+        rescue
+          puts ""
+          puts "I died on #{@url}"
+          $stderr.puts $!
+          @errors[@url] = $!
+          next
+        ensure
+          # Stuff you want to make sure gets printed out
+          puts " done!"
+        end
+      end
+
+      puts "Visited #{@visited_links.size} links."
+    end
+
+    # Authoritative list of URLs to be processed by Rcrawl
+    def url_server
+      unless @links_to_visit.empty?
+        @url = @links_to_visit.pop
+      end
+    end
+
+    # Download the document
+    def fetch_http(url)
+      # Make sure robots.txt has been parsed for this site first,
+      # if not, parse robots.txt then grab document.
+      uri = URI.parse(url)
+      print "Visiting: #{url}"
+      @document = uri.read
+      @visited_links << url
+    end
+
+    # Rewind Input Stream, for storing and reading of raw HTML
+    def ris(document)
+      print "."
+      # Store raw HTML into local variable
+      # Based on MIME type, invoke the proper processing modules
+      case document.content_type
+      when "text/html"
+        link_extractor(document)
+        process_html(document)
+      else
+        print "... not HTML, skipping..."
+      end
+    end
+
+    # HTML processing module for extracting links
+    def link_extractor(document)
+      print "."
+      # Parse all links from HTML into an array
+      # Set up the scrAPI (http://labnotes.org)
+      links = Scraper.define do
+        array :urls
+        process "a[href]", :urls => "@href"
+        result :urls
+      end
+
+      urls = links.scrape(document)
+
+      urls.each { |url|
+        uri = URI.parse(url)
+
+        # Derelativeize links if necessary
+        if uri.relative?
+          url = @site.merge(url).to_s
+          uri = URI.parse(url)
+        end
+
+        # Check domain, if in same domain, keep link, else trash it
+        if uri.host != @site.host
+          @external_links << url
+          @external_links.uniq!
+          next
+        end
+
+        # Find out if we've seen this link already
+        if (@visited_links.include? url) || (@links_to_visit.include? url)
+          next
+        end
+
+        @links_to_visit << url
+      }
+    end
+
+    # HTML processing module for raw HTML storage
+    def process_html(document)
+
+      # Add link and raw HTML to a hash as key/value
+      # for later storage in database
+      unless @raw_html.has_value?(document)
+        print "."
+        @raw_html[@document.base_uri] = document
+      end
+
+    end
+
+    # robots.txt parsing
+    def robot_safe?(url)
+      uri = URI.parse(url)
+      location = "#{uri.host}:#{uri.port}"
+
+      return true unless %w{http https}.include?(uri.scheme)
+
+      unless @sites.include? location
+        @sites[location] = true
+
+        robot_url = "http://#{location}/robots.txt"
+        begin
+          robot_file = open(robot_url) { |page| page.read }
+        rescue
+          return true
+        end
+        @rules.parse(robot_url, robot_file)
+      end
+
+      @rules.allowed? url
+    end
+
+  end # class Crawler
+
+end # module Rcrawl
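For context, the relocated class is driven roughly as follows. This is a minimal sketch based only on the API visible in the diff above (the target URL is just a placeholder):

require 'rcrawl'

# Point the crawler at a site; the URL here is an example.
crawler = Rcrawl::Crawler.new("http://digitalduckies.net")

# Crawl the whole site, one page at a time.
crawler.crawl

# The attr_readers added in 0.4.5 replace the old hand-written
# accessor methods (visited_links, external_links, dump, errors):
crawler.raw_html        # { url => raw HTML } for every page fetched
crawler.visited_links   # array of URLs visited during the crawl
crawler.external_links  # array of off-site URLs encountered
crawler.errors          # { url => error message } for failed fetches

Note that raw_html supersedes the old dump method, which is why the README example changed from crawler.dump-style access to crawler.raw_html.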
data/lib/rcrawl.rb
CHANGED
@@ -4,128 +4,6 @@ require 'rubygems'
 require 'open-uri'
 require 'scrapi'
 require 'rcrawl/robot_rules'
-require 'rcrawl/
+require 'rcrawl/crawler'
+require 'rcrawl/version'

-module Rcrawl
-
-  # Crawler will retrieve an entire website, one page at a time,
-  # parsing the page using whatever modules you pass it to.
-  class Crawler
-
-    # Initializes various variables when a new Rcrawl object is instantiated
-    def initialize(site)
-      @links_to_visit = Array.new
-      @visited_links = Array.new
-      @external_links = Array.new
-      @raw_html = Hash.new
-      @rules = RobotRules.new("Rcrawl")
-      @sites = Hash.new
-      @site = URI.parse(site)
-      @links_to_visit << site
-      @errors = Hash.new
-      puts "Site is #{site}"
-    end
-
-    # Coordinates the whole crawling process
-    def crawl
-      until @links_to_visit.empty? do
-        begin
-          # Get link
-          url_server
-          next unless robot_safe? @url
-          # Parse robots.txt, then download document if robot_safe
-          fetch_http(@url)
-          # Store raw HTML in variable to read/reread as needed
-          # Then call any processing modules you need for the current document
-          ris(@document)
-        rescue
-          puts ""
-          puts "I died on #{@url}"
-          $stderr.puts $!
-          @errors[@url] = $!
-          next
-        ensure
-          # Stuff you want to make sure gets printed out
-          puts " done!"
-        end
-      end
-
-      puts "Visited #{@visited_links.size} links."
-    end
-
-    # Authoritative list of URLs to be processed by Rcrawl
-    def url_server
-      unless @links_to_visit.empty?
-        @url = @links_to_visit.pop
-      end
-    end
-
-    # Download the document
-    def fetch_http(url)
-      # Make sure robots.txt has been parsed for this site first,
-      # if not, parse robots.txt then grab document.
-      uri = URI.parse(url)
-      print "Visiting: #{url}"
-      @document = uri.read
-      @visited_links << url
-    end
-
-    # Rewind Input Stream, for storing and reading of raw HTML
-    def ris(document)
-      print "."
-      # Store raw HTML into local variable
-      # Based on MIME type, invoke the proper processing modules
-      case document.content_type
-      when "text/html"
-        Rcrawl::Process::HTML.link_extractor(document)
-        Rcrawl::Process::HTML.process_html(document)
-      else
-        print "... not HTML, skipping..."
-      end
-    end
-
-    # robots.txt parsing
-    def robot_safe?(url)
-      uri = URI.parse(url)
-      location = "#{uri.host}:#{uri.port}"
-
-      return true unless %w{http https}.include?(uri.scheme)
-
-      unless @sites.include? location
-        @sites[location] = true
-
-        robot_url = "http://#{location}/robots.txt"
-        begin
-          robot_file = open(robot_url) { |page| page.read }
-        rescue
-          return true
-        end
-        @rules.parse(robot_url, robot_file)
-      end
-
-      @rules.allowed? url
-    end
-
-    # Returns array of links visited during crawl
-    def visited_links
-      return @visited_links
-    end
-
-    # Returns array of external links
-    def external_links
-      return @external_links
-    end
-
-    # Returns a hash where {key => URL, value => HTML} from all pages crawled
-    def dump
-      return @raw_html
-    end
-
-    # Returns a hash where {key => URL, value => "Error message"} from any
-    # errors encountered during the crawl
-    def errors
-      return @errors
-    end
-
-  end
-end
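The newly required data/lib/rcrawl/version.rb (+5 -0) is not shown in this diff. Given that crawler.rb interpolates a bare VERSION constant inside module Rcrawl and the gem metadata says 0.4.5, its contents are presumably along these lines (a guess at the file, not its actual text):

module Rcrawl

  # Hypothetical reconstruction; the real file may differ in layout.
  VERSION = "0.4.5"

end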
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
 specification_version: 1
 name: rcrawl
 version: !ruby/object:Gem::Version
-  version: 0.
-date: 2006-09-
+  version: 0.4.5
+date: 2006-09-26 00:00:00 -05:00
 summary: A web crawler written in ruby
 require_paths:
 - lib
@@ -31,10 +31,9 @@ authors:
 files:
 - lib/rcrawl.rb
 - lib/rcrawl
-- lib/rcrawl/crawler.rb
 - lib/rcrawl/robot_rules.rb
-- lib/rcrawl/
-- lib/rcrawl/
+- lib/rcrawl/crawler.rb
+- lib/rcrawl/version.rb
 - README
 - MIT-LICENSE
 - Rakefile
data/lib/rcrawl/process/html.rb
DELETED
@@ -1,61 +0,0 @@
-module Rcrawl
-
-  module Process
-
-    module HTML
-
-      # HTML processing module for extracting links
-      def HTML.link_extractor(document)
-        print "."
-        # Parse all links from HTML into an array
-        # Set up the scrAPI (http://labnotes.org)
-        links = Scraper.define do
-          array :urls
-          process "a[href]", :urls => "@href"
-          result :urls
-        end
-
-        urls = links.scrape(document)
-
-        urls.each { |url|
-          uri = URI.parse(url)
-
-          # Derelativeize links if necessary
-          if uri.relative?
-            url = @site.merge(url).to_s
-            uri = URI.parse(url)
-          end
-
-          # Check domain, if in same domain, keep link, else trash it
-          if uri.host != @site.host
-            @external_links << url
-            @external_links.uniq!
-            next
-          end
-
-          # Find out if we've seen this link already
-          if (@visited_links.include? url) || (@links_to_visit.include? url)
-            next
-          end
-
-          @links_to_visit << url
-        }
-      end
-
-      # HTML processing module for raw HTML storage
-      def HTML.process_html(document)
-
-        # Add link and raw HTML to a hash as key/value
-        # for later storage in database
-        unless @raw_html.has_value?(document)
-          print "."
-          @raw_html[document.base_uri] = document
-        end
-
-      end
-
-    end
-
-  end
-
-end