RubyGems - rubyretriever - Versions diffs - 1.0.0 → 1.0.1 - Mend

rubyretriever 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml +4 -4
data/lib/retriever/fetch.rb +6 -6
data/lib/retriever/fetchfiles.rb +3 -3
data/lib/retriever/fetchseo.rb +1 -1
data/lib/retriever/fetchsitemap.rb +2 -2
data/lib/retriever/version.rb +1 -1
data/readme.md +4 -2
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 80eba5c4fdf8c33a19ca35ba37b4493cf44ab57a
-  data.tar.gz: 4dbca842c4f56060e13cfe1c0acf0256321df573
+  metadata.gz: 723b4186b7723a15b697a3e49705fa2972e9e643
+  data.tar.gz: fb0b273fd2da2281e1085aba4f4c22e14c6c89ce
 SHA512:
-  metadata.gz: 48181c41247d85b16db74eb8b7c0a74c23c9740d00d2fd79ecfdb8435efa64d81fb34ccbc81e32911d7fe0e6942c6f7c7c9f91d39feeb434ab078f659ada1341
-  data.tar.gz: 3dace96b1bd42fa2292e8a9db3506983d688e7877b12ca30e550a81be9bad6a7de9907eb2f2b75c6263146a8721d08437dfabe06a493de057dc8a764b57c3a39
+  metadata.gz: f21a631ff12dc191c2554bd92de118ebee11d577e9e76f4e4d9f971aec654bece4024c218fb3bff0a8460e7eafa08951e0f0f37bcb86df5342a92e2f465f111e
+  data.tar.gz: 831ae194294e72d1973bd8cb4593c2717b2ca5b6a321ff742afe24d3bbb143bd8585a4a566983830f6548b029703d730ab80196e1083a7fa4692732f69a89943

data/lib/retriever/fetch.rb CHANGED Viewed

@@ -10,7 +10,7 @@ module Retriever
 	class Fetch
 		attr_reader :maxPages, :t
-		def initialize(url,options)
+		def initialize(url,options) #given target URL and RR options, creates a fetch object. There is no direct output, this is a parent class that the other fetch classes build off of.
 			@connection_tally = {
 				:success => 0,
 				:error => 0,
@@ -58,7 +58,7 @@ module Retriever
 		def lg(msg)
 			puts "### #{msg}" if @v
 		end
-		def dump
+		def dump #prints current data collection to STDOUT, meant for CLI use.
 			puts "###############################"
 			if @v
 				puts "Connection Tally:"
@@ -85,7 +85,7 @@ module Retriever
 			puts "###############################"
 			puts
 		end
-		def write
+		def write #writes current data collection out to CSV in current directory
 			if @output
 				i = 0
 				CSV.open("#{@output}.csv", "w") do |csv|
@@ -104,7 +104,7 @@ module Retriever
 				puts
 			end
 		end
-		def async_crawl_and_collect()
+		def async_crawl_and_collect() #iterates over the existing @linkStack, running asyncGetWave on it until we reach the @maxPages value.
 			while (@already_crawled.size < @maxPages)
 				if @linkStack.empty?
 					if @prgrss
@@ -116,11 +116,11 @@ module Retriever
 				end
 				new_links_arr = self.asyncGetWave()
 				next if (new_links_arr.nil? || new_links_arr.empty?)
-				new_link_arr = new_links_arr-@linkStack#set operations to see are these in our previous visited pages arr?
+				new_link_arr = new_links_arr-@linkStack #set operations to see are these in our previous visited pages arr?
 				@linkStack.concat(new_links_arr).uniq!
 				@data.concat(new_links_arr) if @s
 			end
-			@progressbar.finish if @prgrss
+			@progressbar.finish if @prgrss #if we are done, let's make sure progress bar says we are done
 		end
 		def good_response?(resp, url) #returns true if resp is ok to continue process, false if we need to 'next' it
 			return false if !resp

data/lib/retriever/fetchfiles.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module Retriever
 	class FetchFiles < Fetch
-		def initialize(url,options)
+		def initialize(url,options)  #receives target url and RR options, returns an array of all unique files (based on given filetype) found on the site
 			super
 			@data = []
 			page_one = Retriever::Page.new(@t.source,@t)
@@ -21,7 +21,7 @@ module Retriever
 			@data.sort_by! {|x| x.length}
 			@data.uniq!
 		end
-		def download_file(path)
+		def download_file(path) #given valid url, downloads file to current directory in /rr-downloads/
 			arr = path.split('/')
 			shortname = arr.pop
 			puts "Initiating Download to: #{'/rr-downloads/' + shortname}"
@@ -32,7 +32,7 @@ module Retriever
 			end
 			puts "	SUCCESS: Download Complete"
 		end
-		def autodownload()
+		def autodownload() #when autodownload option is true, this will automatically go through the fetched file URL collection and download each one.
 			lenny = @data.count
 			puts "###################"
 			puts "### Initiating Autodownload..."

data/lib/retriever/fetchseo.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module Retriever
 	class FetchSEO < Fetch
-		def initialize(url,options)
+		def initialize(url,options) #receives target url and RR options, returns an array of onpage SEO related fields on all unique pages found on the site
 			super
 			@data = []
 			page_one = Retriever::Page.new(@t.source,@t)

data/lib/retriever/fetchsitemap.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module Retriever
 	class FetchSitemap < Fetch
-		def initialize(url,options)
+		def initialize(url,options) #receives target URL and RR options, returns an array of all unique pages found on the site
 			super
 			@data = [@t.target]
 			page_one = Retriever::Page.new(@t.source,@t)
@@ -18,7 +18,7 @@ module Retriever
 			@data.sort_by!	 {|x| x.length} if @data.size>1
 			@data.uniq!
 		end
-		def gen_xml
+		def gen_xml #produces valid XML sitemap based on page collection fetched. Writes to current directory.
 			f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
 			f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
 				@data.each do |url|

data/lib/retriever/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Retriever
-  VERSION = '1.0.0'
+  VERSION = '1.0.1'
 end

data/readme.md CHANGED Viewed

@@ -1,4 +1,4 @@
-[RubyRetriever] (http://www.softwarebyjoe.com/rubyretriever/)
+[RubyRetriever] (http://softwarebyjoe.com/rubyretriever/)
 ==============
 [![Gem Version](https://badge.fury.io/rb/rubyretriever.svg)](http://badge.fury.io/rb/rubyretriever)  [![Build Status](https://travis-ci.org/joenorton/rubyretriever.svg?branch=master)](https://travis-ci.org/joenorton/rubyretriever)
@@ -10,6 +10,8 @@ RubyRetriever uses aynchronous HTTP requests, thanks to eventmachine and Synchro
 RubyRetriever does NOT respect robots.txt, and RubyRetriever currently - by default - launches up to 10 parallel GET requests at once. This is a feature, do not abuse it. Use at own risk.
+v1.0 Update 6/07/2014 - Includes major code changes, a lot of bug fixes. Much better in dealing with redirects, and issues with the host changing, etc. Also, added the SEO mode -- which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility -- and thus, this was update 1.0!
 getting started
 -----------
@@ -58,7 +60,7 @@ Usage: rr [MODE FLAG] [OPTIONS] Target_URL
 Where MODE FLAG is required, and is either:
 	-s, --sitemap FORMAT  (only accepts CSV or XML atm)
-	-f, --files FILETYPE
+	-f, --files FILETYPE
 	-e, --seo
 and OPTIONS is the applicable:

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.0.1
 platform: ruby
 authors:
 - Joe Norton
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-06-07 00:00:00.000000000 Z
+date: 2014-06-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: em-synchrony