RubyGems - rubyretriever - Versions diffs - 1.0.0 → 1.0.1 - Mend

rubyretriever 1.0.0 → 1.0.1

Files changed (8) hide show

checksums.yaml +4 -4
data/lib/retriever/fetch.rb +6 -6
data/lib/retriever/fetchfiles.rb +3 -3
data/lib/retriever/fetchseo.rb +1 -1
data/lib/retriever/fetchsitemap.rb +2 -2
data/lib/retriever/version.rb +1 -1
data/readme.md +4 -2
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 80eba5c4fdf8c33a19ca35ba37b4493cf44ab57a
-  data.tar.gz: 4dbca842c4f56060e13cfe1c0acf0256321df573
+  metadata.gz: 723b4186b7723a15b697a3e49705fa2972e9e643
+  data.tar.gz: fb0b273fd2da2281e1085aba4f4c22e14c6c89ce
 SHA512:
-  metadata.gz: 48181c41247d85b16db74eb8b7c0a74c23c9740d00d2fd79ecfdb8435efa64d81fb34ccbc81e32911d7fe0e6942c6f7c7c9f91d39feeb434ab078f659ada1341
-  data.tar.gz: 3dace96b1bd42fa2292e8a9db3506983d688e7877b12ca30e550a81be9bad6a7de9907eb2f2b75c6263146a8721d08437dfabe06a493de057dc8a764b57c3a39
+  metadata.gz: f21a631ff12dc191c2554bd92de118ebee11d577e9e76f4e4d9f971aec654bece4024c218fb3bff0a8460e7eafa08951e0f0f37bcb86df5342a92e2f465f111e
+  data.tar.gz: 831ae194294e72d1973bd8cb4593c2717b2ca5b6a321ff742afe24d3bbb143bd8585a4a566983830f6548b029703d730ab80196e1083a7fa4692732f69a89943

data/lib/retriever/fetch.rb CHANGED Viewed

@@ -10,7 +10,7 @@ module Retriever
 	class Fetch
 		attr_reader :maxPages, :t
-		def initialize(url,options)
+		def initialize(url,options) #given target URL and RR options, creates a fetch object. There is no direct output, this is a parent class that the other fetch classes build off of.
 			@connection_tally = {
 				:success => 0,
 				:error => 0,
@@ -58,7 +58,7 @@ module Retriever
 		def lg(msg)
 			puts "### #{msg}" if @v
 		end
-		def dump
+		def dump #prints current data collection to STDOUT, meant for CLI use.
 			puts "###############################"
 			if @v
 				puts "Connection Tally:"
@@ -85,7 +85,7 @@ module Retriever
 			puts "###############################"
 			puts
 		end
-		def write
+		def write #writes current data collection out to CSV in current directory
 			if @output
 				i = 0
 				CSV.open("#{@output}.csv", "w") do |csv|
@@ -104,7 +104,7 @@ module Retriever
 				puts
 			end
 		end
-		def async_crawl_and_collect()
+		def async_crawl_and_collect() #iterates over the existing @linkStack, running asyncGetWave on it until we reach the @maxPages value.
 			while (@already_crawled.size < @maxPages)
 				if @linkStack.empty?
 					if @prgrss
@@ -116,11 +116,11 @@ module Retriever
 				end
 				new_links_arr = self.asyncGetWave()
 				next if (new_links_arr.nil? || new_links_arr.empty?)
-				new_link_arr = new_links_arr-@linkStack#set operations to see are these in our previous visited pages arr?
+				new_link_arr = new_links_arr-@linkStack #set operations to see are these in our previous visited pages arr?
 				@linkStack.concat(new_links_arr).uniq!
 				@data.concat(new_links_arr) if @s
 			end
-			@progressbar.finish if @prgrss
+			@progressbar.finish if @prgrss #if we are done, let's make sure progress bar says we are done
 		end
 		def good_response?(resp, url) #returns true if resp is ok to continue process, false if we need to 'next' it
 			return false if !resp

data/lib/retriever/fetchfiles.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module Retriever
 	class FetchFiles < Fetch
-		def initialize(url,options)
+		def initialize(url,options)  #receives target url and RR options, returns an array of all unique files (based on given filetype) found on the site
 			super
 			@data = []
 			page_one = Retriever::Page.new(@t.source,@t)
@@ -21,7 +21,7 @@ module Retriever
 			@data.sort_by! {|x| x.length}
 			@data.uniq!
 		end
-		def download_file(path)
+		def download_file(path) #given valid url, downloads file to current directory in /rr-downloads/
 			arr = path.split('/')
 			shortname = arr.pop
 			puts "Initiating Download to: #{'/rr-downloads/' + shortname}"
@@ -32,7 +32,7 @@ module Retriever
 			end
 			puts "	SUCCESS: Download Complete"
 		end
-		def autodownload()
+		def autodownload() #when autodownload option is true, this will automatically go through the fetched file URL collection and download each one.
 			lenny = @data.count
 			puts "###################"
 			puts "### Initiating Autodownload..."

data/lib/retriever/fetchseo.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module Retriever
 	class FetchSEO < Fetch
-		def initialize(url,options)
+		def initialize(url,options) #receives target url and RR options, returns an array of onpage SEO related fields on all unique pages found on the site
 			super
 			@data = []
 			page_one = Retriever::Page.new(@t.source,@t)

data/lib/retriever/fetchsitemap.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module Retriever
 	class FetchSitemap < Fetch
-		def initialize(url,options)
+		def initialize(url,options) #receives target URL and RR options, returns an array of all unique pages found on the site
 			super
 			@data = [@t.target]
 			page_one = Retriever::Page.new(@t.source,@t)
@@ -18,7 +18,7 @@ module Retriever
 			@data.sort_by!	 {|x| x.length} if @data.size>1
 			@data.uniq!
 		end
-		def gen_xml
+		def gen_xml #produces valid XML sitemap based on page collection fetched. Writes to current directory.
 			f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
 			f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
 				@data.each do |url|

data/lib/retriever/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Retriever
-  VERSION = '1.0.0'
+  VERSION = '1.0.1'
 end

data/readme.md CHANGED Viewed

@@ -1,4 +1,4 @@
-[RubyRetriever] (http://www.softwarebyjoe.com/rubyretriever/)
+[RubyRetriever] (http://softwarebyjoe.com/rubyretriever/)
 ==============
 [![Gem Version](https://badge.fury.io/rb/rubyretriever.svg)](http://badge.fury.io/rb/rubyretriever)  [![Build Status](https://travis-ci.org/joenorton/rubyretriever.svg?branch=master)](https://travis-ci.org/joenorton/rubyretriever)
@@ -10,6 +10,8 @@ RubyRetriever uses asynchronous HTTP requests, thanks to eventmachine and Synchro
 RubyRetriever does NOT respect robots.txt, and RubyRetriever currently - by default - launches up to 10 parallel GET requests at once. This is a feature, do not abuse it. Use at own risk.
+v1.0 Update 6/07/2014 - Includes major code changes and a lot of bug fixes. Much better at dealing with redirects, and issues with the host changing, etc. Also, added the SEO mode -- which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility -- and thus, this was update 1.0!
 getting started
 -----------
@@ -58,7 +60,7 @@ Usage: rr [MODE FLAG] [OPTIONS] Target_URL
 Where MODE FLAG is required, and is either:
 	-s, --sitemap FORMAT  (only accepts CSV or XML atm)
-	-f, --files FILETYPE
+	-f, --files FILETYPE
 	-e, --seo
 and OPTIONS is the applicable:

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.0.1
 platform: ruby
 authors:
 - Joe Norton
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-06-07 00:00:00.000000000 Z
+date: 2014-06-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: em-synchrony