rcrawl 0.3.0 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -1
- data/lib/rcrawl/process/html.rb +54 -1
- data/lib/rcrawl.rb +2 -52
- metadata +2 -2
data/Rakefile
CHANGED
data/lib/rcrawl/process/html.rb
CHANGED
@@ -1,8 +1,61 @@
|
|
1
1
|
module Rcrawl
|
2
|
+
|
2
3
|
module Process
|
4
|
+
|
3
5
|
module HTML
|
4
|
-
|
6
|
+
|
7
|
+
# HTML processing module for extracting links
|
8
|
+
def HTML.link_extractor(document)
|
9
|
+
print "."
|
10
|
+
# Parse all links from HTML into an array
|
11
|
+
# Set up the scrAPI (http://labnotes.org)
|
12
|
+
links = Scraper.define do
|
13
|
+
array :urls
|
14
|
+
process "a[href]", :urls => "@href"
|
15
|
+
result :urls
|
16
|
+
end
|
17
|
+
|
18
|
+
urls = links.scrape(document)
|
19
|
+
|
20
|
+
urls.each { |url|
|
21
|
+
uri = URI.parse(url)
|
22
|
+
|
23
|
+
# Derelativeize links if necessary
|
24
|
+
if uri.relative?
|
25
|
+
url = @site.merge(url).to_s
|
26
|
+
uri = URI.parse(url)
|
27
|
+
end
|
28
|
+
|
29
|
+
# Check domain, if in same domain, keep link, else trash it
|
30
|
+
if uri.host != @site.host
|
31
|
+
@external_links << url
|
32
|
+
@external_links.uniq!
|
33
|
+
next
|
34
|
+
end
|
35
|
+
|
36
|
+
# Find out if we've seen this link already
|
37
|
+
if (@visited_links.include? url) || (@links_to_visit.include? url)
|
38
|
+
next
|
39
|
+
end
|
40
|
+
|
41
|
+
@links_to_visit << url
|
42
|
+
}
|
5
43
|
end
|
44
|
+
|
45
|
+
# HTML processing module for raw HTML storage
|
46
|
+
def HTML.process_html(document)
|
47
|
+
|
48
|
+
# Add link and raw HTML to a hash as key/value
|
49
|
+
# for later storage in database
|
50
|
+
unless @raw_html.has_value?(document)
|
51
|
+
print "."
|
52
|
+
@raw_html[document.base_uri] = document
|
53
|
+
end
|
54
|
+
|
55
|
+
end
|
56
|
+
|
6
57
|
end
|
58
|
+
|
7
59
|
end
|
60
|
+
|
8
61
|
end
|
data/lib/rcrawl.rb
CHANGED
@@ -77,63 +77,13 @@ module Rcrawl
|
|
77
77
|
# Based on MIME type, invoke the proper processing modules
|
78
78
|
case document.content_type
|
79
79
|
when "text/html"
|
80
|
-
link_extractor(document)
|
81
|
-
process_html(document)
|
80
|
+
Rcrawl::Process::HTML.link_extractor(document)
|
81
|
+
Rcrawl::Process::HTML.process_html(document)
|
82
82
|
else
|
83
83
|
print "... not HTML, skipping..."
|
84
84
|
end
|
85
85
|
end
|
86
86
|
|
87
|
-
# HTML processing module for extracting links
|
88
|
-
def link_extractor(document)
|
89
|
-
print "."
|
90
|
-
|
91
|
-
# Parse all links from HTML into an array
|
92
|
-
# Set up the scrAPI (http://labnotes.org)
|
93
|
-
links = Scraper.define do
|
94
|
-
array :urls
|
95
|
-
process "a[href]", :urls => "@href"
|
96
|
-
result :urls
|
97
|
-
end
|
98
|
-
|
99
|
-
urls = links.scrape(document)
|
100
|
-
|
101
|
-
urls.each { |url|
|
102
|
-
uri = URI.parse(url)
|
103
|
-
|
104
|
-
# Derelativeize links if necessary
|
105
|
-
if uri.relative?
|
106
|
-
url = @site.merge(url).to_s
|
107
|
-
uri = URI.parse(url)
|
108
|
-
end
|
109
|
-
|
110
|
-
# Check domain, if in same domain, keep link, else trash it
|
111
|
-
if uri.host != @site.host
|
112
|
-
@external_links << url
|
113
|
-
@external_links.uniq!
|
114
|
-
next
|
115
|
-
end
|
116
|
-
|
117
|
-
# Find out if we've seen this link already
|
118
|
-
if (@visited_links.include? url) || (@links_to_visit.include? url)
|
119
|
-
next
|
120
|
-
end
|
121
|
-
|
122
|
-
@links_to_visit << url
|
123
|
-
}
|
124
|
-
|
125
|
-
end
|
126
|
-
|
127
|
-
# HTML processing module for raw HTML storage
|
128
|
-
def process_html(document)
|
129
|
-
# Add link and raw HTML to a hash as key/value
|
130
|
-
# for later storage in database
|
131
|
-
unless @raw_html.has_value?(document)
|
132
|
-
print "."
|
133
|
-
@raw_html[document.base_uri] = document
|
134
|
-
end
|
135
|
-
end
|
136
|
-
|
137
87
|
# robots.txt parsing
|
138
88
|
def robot_safe?(url)
|
139
89
|
uri = URI.parse(url)
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: rcrawl
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.3.0
|
7
|
-
date: 2006-09-
|
6
|
+
version: 0.3.5
|
7
|
+
date: 2006-09-23 00:00:00 -05:00
|
8
8
|
summary: A web crawler written in ruby
|
9
9
|
require_paths:
|
10
10
|
- lib
|