pantopoda 0.0.2 → 0.0.3
- checksums.yaml +4 -4
- data/README.md +3 -1
- data/lib/pantopoda/version.rb +1 -1
- data/lib/pantopoda.rb +105 -103
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 95eb1fb6916786582ba766cc5b7afae0adab80d2
+  data.tar.gz: ad00abae14a4638d1cf2cb49aea278e176f15aaf
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d7894b766dfeb8171c91bd720610d81a8a7a6c50a874fa574bef9ad37ca7f184640502d6d0dfd87404cc00d3aadc3f67bdb5fcd2d8e94313973135fb24c739af
+  data.tar.gz: 3ae3b9be93335e67c387dab9287f5376284eac5d30633f57b041f205e5067917654f451b648a6eaf9c379c9e661ba7efdc2aae1accc43049be77c1ff775ae721
data/README.md
CHANGED
@@ -1,6 +1,8 @@
 # Pantopoda
 
-
+Pantopoda is a Ruby spidering library born out of sheer frustration at the lack of the good, modern web-crawling tools that Python enjoys. Pantopoda uses a Bloom filter to store the set of visited URLs, so membership checks stay efficient even across hundreds of thousands of URLs, and requests are handled by Typhoeus to allow for multi-threaded crawling.
+
+Pantopoda will crawl every page it can find on a particular domain.
 
 ## Installation
 
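Read alongside the data/lib/pantopoda.rb diff further down, the new README implies a simple workflow: construct the crawler with a start URL plus filtering options, then call crawl with a block that receives each Typhoeus response. A minimal usage sketch, assuming the Pantopoda::Pantopoda#initialize and #crawl signatures visible in the 0.0.3 source below; the domain, option values, and block body are placeholders rather than anything documented by the gem:

```ruby
require 'pantopoda'

# Start URL plus an optional extension blacklist (see initialize in the diff below).
spider = Pantopoda::Pantopoda.new("http://example.com",
                                  :exclude_urls_with_extensions => ['.pdf', '.jpg'])

# :threads caps Hydra concurrency; :max_urls stops the crawl once the visited-URL
# Bloom filter reaches that size. The block is yielded every fetched response.
spider.crawl(:threads => 4, :max_urls => 10000) do |response|
  puts "#{response.code} #{response.effective_url}"
end
```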
data/lib/pantopoda/version.rb
CHANGED
data/lib/pantopoda.rb
CHANGED
@@ -5,137 +5,139 @@ require 'nokogiri'
 require 'domainatrix'
 require 'uri'

+module Pantopoda
+  class Pantopoda
+    def initialize(url, options = {})
+      @start_url = url
+      @domain = parse_domain(url)
+
+      @split_url_at_hash = options[:split_url_at_hash] ? options[:split_url_at_hash] : false
+      @exclude_urls_with_hash = options[:exclude_urls_with_hash] ? options[:exclude_urls_with_hash] : false
+      @exclude_urls_with_extensions = options[:exclude_urls_with_extensions] ? options[:exclude_urls_with_extensions] : false
+    end

+    def crawl(options = {})
+      # Defaults to 1 thread, so we won't do a crazy amount of crawling on domains
+      threads = options[:threads] ? options[:threads] : 1
+
+      # Defaults to nil, so it keeps running until it runs out of URLs
+      max_urls = options[:max_urls] ? options[:max_urls] : nil
+
+      @hydra = Typhoeus::Hydra.new(:max_concurrency => threads)
+      @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
+      @global_queue = []
+
+      @global_queue << @start_url
+
+      while(@global_queue.size > 0 && (max_urls.nil? || @global_visited.size.to_i < max_urls))
+        temp_queue = @global_queue
+        temp_queue.each do |q|
+          begin
+            ip, port, user, pass = nil
+
+            request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true) if ip == nil
+            request.on_complete do |response|
+              yield response
+              links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href')
+              links.each do |link|
+                if (internal_link?(link, response.effective_url) && !@global_visited.include?(make_absolute(link, response.effective_url)) && no_hash_in_url?(link) && ignore_extensions(link))
+                  sanitized_link = sanitize_link(split_url_at_hash(link))
+                  if (sanitized_link)
+                    absolute_link = make_absolute(sanitized_link, response.effective_url)
+                    if absolute_link
+                      @global_queue << absolute_link
+                    end
                   end
                 end
               end
             end
-            end

+            @hydra.queue request

+          rescue URI::InvalidURIError, NoMethodError => e
+            puts "Exception caught: #{e}" if @debug == true
+          end
+
+          @global_visited.insert(q)
+          @global_queue.delete(q)
         end

-        @global_queue.delete(q)
+        @hydra.run
       end
-      @hydra.run
     end
-    end

+    def parse_domain(url)
+      puts "Parsing URL: #{url}"

+      begin
+        parsed_domain = Domainatrix.parse(url)
+        if (parsed_domain.subdomain != "")
+          parsed_domain.subdomain + '.' + parsed_domain.domain + '.' + parsed_domain.public_suffix
+        else
+          parsed_domain.domain + '.' + parsed_domain.public_suffix
+        end

+      rescue NoMethodError, Addressable::URI::InvalidURIError => e
+        puts "URL Parsing Exception (#{url}) : #{e}"
+        return nil
+      end
     end
-    end

+    def internal_link?(url, effective_url)
+      absolute_url = make_absolute(url, effective_url)
+      parsed_url = parse_domain(absolute_url)
+      if (@domain == parsed_url)
+        return true
+      else
+        return false
+      end
     end
-    end

+    def split_url_at_hash(url)
+      return url.to_s unless @split_url_at_hash
+      return url.to_s.split('#')[0]
+    end

+    def no_hash_in_url?(url)
+      return true unless @exclude_urls_with_hash

+      if (url.to_s.scan(/#/).size > 0)
+        return false
+      else
+        return true
+      end
     end
-    end

+    def ignore_extensions(url)
+      return true if url.to_s.length == 0
+      return true unless @exclude_urls_with_extensions

+      not_found = true

+      @exclude_urls_with_extensions.each do |e|
+        if (url.to_s.length > e.size && url.to_s[-e.size .. -1].downcase == e.to_s.downcase)
+          not_found = false
+          puts "#{e} Found At URL: #{url}"
+        end
       end
-      end

+      return not_found
+    end

+    def sanitize_link(url)
+      begin
+        return url.gsub(/\s+/, "%20")
+      rescue
+        return false
+      end
     end
-    end

+    def make_absolute(href, root)
+      begin
+        URI.parse(root).merge(URI.parse(split_url_at_hash(href.to_s.gsub(/\s+/, "%20")))).to_s
+      rescue URI::InvalidURIError, URI::InvalidComponentError => e
+        return false
+      end
+    end
   end
 end
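The crawl method above combines two libraries: Typhoeus' Hydra, which runs queued HTTP requests concurrently, and bloomfilter-rb's BloomFilter::Native, which keeps the visited-URL set in constant memory. A standalone sketch of that pattern, assuming the current Typhoeus option names (:followlocation, and :timeout in seconds) rather than the older :follow_location and millisecond timeout used in the gem's code; the URLs and filter sizes are illustrative:

```ruby
require 'typhoeus'
require 'bloomfilter-rb'

# Up to four requests in flight at once, mirroring crawl's :max_concurrency setting.
hydra = Typhoeus::Hydra.new(:max_concurrency => 4)

# Same Bloom filter parameters as @global_visited in the diff above:
# fixed memory, no false negatives, a small chance of false positives.
visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1,
                                  :bucket => 8, :raise => false)

urls = ['http://example.com/', 'http://example.com/about']

urls.each do |url|
  next if visited.include?(url)   # skip anything we have already queued
  request = Typhoeus::Request.new(url, :followlocation => true, :timeout => 10)
  request.on_complete do |response|
    puts "#{response.code} #{response.effective_url}"
  end
  hydra.queue(request)
  visited.insert(url)
end

hydra.run   # blocks until every queued request has completed
```

The trade-off is deliberate: a Bloom filter can occasionally report a URL as visited when it is not, which for a crawler only means skipping the odd page, in exchange for a visited set whose memory use does not grow with the size of the crawl.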