rubyretriever 0.1.4 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/rr +12 -6
- data/lib/retriever/cli.rb +27 -0
- data/lib/retriever/fetch.rb +84 -45
- data/lib/retriever/fetchfiles.rb +11 -17
- data/lib/retriever/fetchseo.rb +23 -0
- data/lib/retriever/fetchsitemap.rb +10 -14
- data/lib/retriever/openuri-redirect-patch.rb +6 -0
- data/lib/retriever/page.rb +68 -0
- data/lib/retriever/target.rb +20 -9
- data/lib/retriever/version.rb +1 -1
- data/lib/retriever.rb +4 -0
- data/readme.md +18 -6
- data/spec/link_spec.rb +2 -2
- data/spec/page_spec.rb +94 -0
- data/spec/retriever_spec.rb +1 -48
- data/spec/target_spec.rb +7 -2
- metadata +8 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 80eba5c4fdf8c33a19ca35ba37b4493cf44ab57a
+  data.tar.gz: 4dbca842c4f56060e13cfe1c0acf0256321df573
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 48181c41247d85b16db74eb8b7c0a74c23c9740d00d2fd79ecfdb8435efa64d81fb34ccbc81e32911d7fe0e6942c6f7c7c9f91d39feeb434ab078f659ada1341
+  data.tar.gz: 3dace96b1bd42fa2292e8a9db3506983d688e7877b12ca30e550a81be9bad6a7de9907eb2f2b75c6263146a8721d08437dfabe06a493de057dc8a764b57c3a39
data/bin/rr
CHANGED
@@ -1,18 +1,24 @@
 #! /usr/bin/env ruby
+
 require 'retriever'
 require 'optparse'
+
 options = {}
 optparse = OptionParser.new do|opts|
   # Set a banner, displayed at the top
   # of the help screen.
   opts.banner = "Usage: rr [MODE FLAG] [options] Target_URL"
   options[:sitemap] = false
-  opts.on( '-s', '--sitemap FORMAT', 'MODE FLAG: Sitemap mode
-    options[:sitemap] = output_type
+  opts.on( '-s', '--sitemap [FORMAT]', 'MODE FLAG: Sitemap mode' ) do |output_type|
+    options[:sitemap] = output_type||''
   end
   options[:fileharvest] = false
-  opts.on( '-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode
+  opts.on( '-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode' ) do |file_ext|
     options[:fileharvest] = file_ext
+  end
+  options[:seo] = false
+  opts.on( '-e', '--seo', 'MODE FLAG: SEO mode' ) do
+    options[:seo] = true
   end
   options[:filename] = nil
   opts.on( '-o', '--out FILENAME', 'Dump output to selected filename' ) do|filename|
@@ -56,14 +62,14 @@ ARGV.each do|q|
   puts "### Outputting in format: #{options[:sitemap]}" if options[:sitemap]
   puts "### Performing File Harvest" if options[:fileharvest]
   puts "### Searching for file extension: #{options[:fileharvest]} pages" if (options[:fileharvest])
+  puts "### Performing SEO Scrape" if options[:seo]
   puts "### Writting output to filename: #{options[:filename]}" if options[:filename]
   puts "### Being verbose"
-  puts "### Stopping after #{options[:maxpages]} pages"
+  puts "### Stopping after #{options[:maxpages]} pages"
 end
 puts "###############################"
 puts "### [RubyRetriever] go fetch #{q}"
-  Retriever::
-  Retriever::FetchSitemap.new(q, options) if options[:sitemap]
+  Retriever::CLI.new(q, options)
 puts "### [RubyRetriever] is done."
 puts "###############################"
 puts
data/lib/retriever/cli.rb
ADDED
@@ -0,0 +1,27 @@
+module Retriever
+  class CLI
+    def initialize(url,options)
+
+      #kick off the fetch mode of choice
+      if options[:fileharvest]
+        @fetch = Retriever::FetchFiles.new(url, options)
+      elsif options[:sitemap]
+        @fetch = Retriever::FetchSitemap.new(url, options)
+      elsif options[:seo]
+        @fetch = Retriever::FetchSEO.new(url, options)
+      else
+        fail "### Error: No Mode Selected"
+      end
+
+      #all fetch modes
+      @fetch.dump
+      @fetch.write if options[:filename]
+
+      #fileharvest only
+      @fetch.autodownload if options[:autodown] && options[:fileharvest]
+
+      #sitemap only
+      @fetch.gen_xml if /XML/i =~ options[:sitemap].to_s
+    end
+  end
+end
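For orientation, here is a minimal sketch of how the new dispatcher is driven. The option keys mirror the ones parsed in bin/rr above; the URL and values are illustrative placeholders, not taken from the gem's docs.

```ruby
require 'retriever'

# Illustrative values only -- keys match those built by bin/rr above.
options = {
  :seo      => true,       # MODE FLAG, as with `rr -e`
  :maxpages => '50',       # crawl cap, as with `rr -l 50`
  :filename => 'cnet-seo'  # enables @fetch.write, as with `rr -o cnet-seo`
}

# Picks the Fetch subclass from the mode flag, dumps the results, and writes
# the CSV because :filename is set.
Retriever::CLI.new('http://www.cnet.com', options)
```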
data/lib/retriever/fetch.rb
CHANGED
@@ -9,12 +9,14 @@ require 'bloomfilter-rb'
 module Retriever
   class Fetch
     attr_reader :maxPages, :t
-    #constants
-    HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
-    NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico)/ix).freeze
 
     def initialize(url,options)
-      @
+      @connection_tally = {
+        :success => 0,
+        :error => 0,
+        :error_client => 0,
+        :error_server => 0
+      }
       #OPTIONS
       @prgrss = options[:progress] ? options[:progress] : false
       @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
@@ -23,6 +25,7 @@ module Retriever
       @fh = options[:fileharvest] ? options[:fileharvest] : false
       @file_ext = @fh.to_s
       @s = options[:sitemap] ? options[:sitemap] : false
+      @seo = options[:seo] ? true : false
       @autodown = options[:autodown] ? true : false
       #
       if @fh
@@ -30,9 +33,6 @@ module Retriever
         @file_re = Regexp.new(tempExtStr).freeze
       else
         errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
-        if !@output
-          @output = "rr-#{@t.host.split('.')[1]}"
-        end
       end
       if @prgrss
         errlog("CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p") if @v #verbose & progressbar conflict
@@ -44,8 +44,13 @@ module Retriever
         }
         @progressbar = ProgressBar.create(prgressVars)
       end
+      @t = Retriever::Target.new(url,@file_re)
       @already_crawled = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
       @already_crawled.insert(@t.target)
+      if (@fh && !@output)
+        @output = "rr-#{@t.host.split('.')[1]}"
+      end
+      fail "bad page source on target -- try HTTPS?" if !@t.source
     end
     def errlog(msg)
       raise "ERROR: #{msg}"
@@ -53,52 +58,52 @@ module Retriever
     def lg(msg)
       puts "### #{msg}" if @v
     end
-    def dump
+    def dump
       puts "###############################"
+      if @v
+        puts "Connection Tally:"
+        puts @connection_tally.to_s
+        puts "###############################"
+      end
       if @s
         puts "#{@t.target} Sitemap"
-        puts "Page Count: #{data.size}"
+        puts "Page Count: #{@data.size}"
       elsif @fh
         puts "Target URL: #{@t.target}"
        puts "Filetype: #{@file_ext}"
-        puts "File Count: #{data.size}"
+        puts "File Count: #{@data.size}"
+      elsif @seo
+        puts "#{@t.target} SEO Metrics"
+        puts "Page Count: #{@data.size}"
       else
-
+        fail "ERROR - Cannot dump - Mode Not Found"
       end
       puts "###############################"
-
+      @data.each do |line|
+        puts line
+      end
       puts "###############################"
       puts
     end
-    def write
+    def write
       if @output
+        i = 0
        CSV.open("#{@output}.csv", "w") do |csv|
-
-
-
+          if ((i == 0) && @seo)
+            csv << ['URL','Page Title','Meta Description','H1','H2']
+            i +=1
+          end
+          @data.each do |entry|
+            csv << entry
+          end
        end
        puts "###############################"
        puts "File Created: #{@output}.csv"
-        puts "Object Count: #{data.size}"
+        puts "Object Count: #{@data.size}"
        puts "###############################"
        puts
      end
    end
-    #recieves page source as string
-    #returns array of unique href links
-    def fetchLinks(doc)
-      return false if !doc
-      doc.scan(HREF_CONTENTS_RE).map do |match| #filter some malformed URLS that come in, this is meant to be a loose filter to catch all reasonable HREF attributes.
-        link = match[0]
-        Link.new(@t.host, link).path
-      end.uniq
-    end
-    def parseInternalLinks(all_links)
-      all_links.select{ |linky| (@t.host_re =~ linky) }
-    end
-    def parseInternalVisitableLinks(all_links)
-      parseInternalLinks(all_links).select{ |linky| (!(NONPAGE_EXT_RE =~linky)) }
-    end
     def async_crawl_and_collect()
       while (@already_crawled.size < @maxPages)
         if @linkStack.empty?
@@ -112,11 +117,41 @@ module Retriever
         new_links_arr = self.asyncGetWave()
         next if (new_links_arr.nil? || new_links_arr.empty?)
         new_link_arr = new_links_arr-@linkStack#set operations to see are these in our previous visited pages arr?
-        @linkStack.concat(new_links_arr)
-        @
+        @linkStack.concat(new_links_arr).uniq!
+        @data.concat(new_links_arr) if @s
       end
       @progressbar.finish if @prgrss
     end
+    def good_response?(resp, url) #returns true is resp is ok to continue process, false is we need to 'next' it
+      return false if !resp
+      if resp.response_header.redirection? #we got redirected
+        loc = resp.response_header.location
+        lg("#{url} Redirected to #{loc}")
+        if t.host_re =~ loc #if being redirected to same host, let's add to linkstack
+          @linkStack.push(loc) if !@already_crawled.include?(loc) #but only if we haven't already crawled it
+          lg("--Added to linkStack for later")
+          return false
+        end
+        lg("Redirection outside of target host. No - go. #{loc}")
+        return false
+      end
+      if (!resp.response_header.successful?) #if webpage is not text/html, let's not continue and lets also make sure we dont re-queue it
+        lg("UNSUCCESSFUL CONNECTION -- #{url}")
+        @connection_tally[:error] += 1
+        @connection_tally[:error_server] += 1 if resp.response_header.server_error?
+        @connection_tally[:error_client] += 1 if resp.response_header.client_error?
+        return false
+      end
+      if (!(resp.response_header['CONTENT_TYPE'].include?("text/html"))) #if webpage is not text/html, let's not continue and lets also make sure we dont re-queue it
+        @already_crawled.insert(url)
+        @linkStack.delete(url)
+        lg("Page Not text/html -- #{url}")
+        return false
+      end
+      @connection_tally[:success] += 1
+      return true
+    end
+
     def asyncGetWave() #send a new wave of GET requests, using current @linkStack
       new_stuff = []
       EM.synchrony do
@@ -129,20 +164,27 @@ module Retriever
           next
         end
         resp = EventMachine::HttpRequest.new(url).get
-
+        next if !good_response?(resp,url)
+        new_page = Retriever::Page.new(resp.response,@t)
+        lg("Page Fetched: #{url}")
         @already_crawled.insert(url)
         if @prgrss
           @progressbar.increment if @already_crawled.size < @maxPages
         end
-
-
-
-
+        if @seo
+          seos = [url]
+          seos.concat(new_page.parseSEO)
+          @data.push(seos)
+          lg("--page SEO scraped")
+        end
+        if new_page.links
+          lg("--#{new_page.links.size} links found")
+          internal_links_arr = new_page.parseInternalVisitable
          new_stuff.push(internal_links_arr)
          if @fh
-            filez =
-            @
-            lg("
+            filez = new_page.parseFiles
+            @data.concat(filez) if !filez.empty?
+            lg("--#{filez.size} files found")
          end
        end
      end
@@ -151,8 +193,5 @@ module Retriever
      end
      new_stuff.uniq!
    end
-    def parseFiles(all_links)
-      all_links.select{ |linky| (@file_re =~ linky)}
-    end
  end
end
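One change worth calling out: the removed helpers (fetchLinks, parseInternalLinks, parseInternalVisitableLinks, parseFiles) are not gone, they re-surface as methods on the new Retriever::Page class diffed further down. A rough before/after sketch, where html and target are placeholders for a fetched body and a Retriever::Target:

```ruby
# 0.1.4 -- link parsing lived on the Fetch instance:
#   links = fetch.fetchLinks(html)
#   fetch.parseInternalVisitableLinks(links)
#   fetch.parseFiles(links)

# 1.0.0 -- the same filtering hangs off a per-page object:
page = Retriever::Page.new(html, target)  # html, target are placeholders
page.links                    # unique hrefs scraped from the source
page.parseInternalVisitable   # same-host links, minus css/js/images/etc.
page.parseFiles               # links matching target.file_re
```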
data/lib/retriever/fetchfiles.rb
CHANGED
@@ -1,17 +1,16 @@
 module Retriever
   class FetchFiles < Fetch
-    attr_reader :fileStack
     def initialize(url,options)
       super
-      @
-
-      @linkStack =
+      @data = []
+      page_one = Retriever::Page.new(@t.source,@t)
+      @linkStack = page_one.parseInternalVisitable
       lg("URL Crawled: #{@t.target}")
-
+      lg("#{@linkStack.size-1} new links found")
 
-      tempFileCollection =
-      @
-
+      tempFileCollection = page_one.parseFiles
+      @data.concat(tempFileCollection) if tempFileCollection.size>0
+      lg("#{@data.size} new files found")
       errlog("Bad URL -- #{@t.target}") if !@linkStack
 
       @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
@@ -19,19 +18,14 @@ module Retriever
 
       self.async_crawl_and_collect()
 
-      @
-      @
-
-      self.dump(self.fileStack)
-      self.write(@output,self.fileStack) if @output
-      self.autodownload() if @autodown
+      @data.sort_by! {|x| x.length}
+      @data.uniq!
     end
     def download_file(path)
       arr = path.split('/')
       shortname = arr.pop
       puts "Initiating Download to: #{'/rr-downloads/' + shortname}"
       File.open(shortname, "wb") do |saved_file|
-        # the following "open" is provided by open-uri
         open(path) do |read_file|
           saved_file.write(read_file.read)
         end
@@ -39,7 +33,7 @@ module Retriever
     puts " SUCCESS: Download Complete"
     end
     def autodownload()
-      lenny = @
+      lenny = @data.count
       puts "###################"
       puts "### Initiating Autodownload..."
       puts "###################"
@@ -53,7 +47,7 @@ module Retriever
         Dir.chdir("rr-downloads")
       end
       file_counter = 0
-      @
+      @data.each do |entry|
        begin
          self.download_file(entry)
          file_counter+=1
data/lib/retriever/fetchseo.rb
ADDED
@@ -0,0 +1,23 @@
+module Retriever
+  class FetchSEO < Fetch
+    def initialize(url,options)
+      super
+      @data = []
+      page_one = Retriever::Page.new(@t.source,@t)
+      @linkStack = page_one.parseInternalVisitable
+      lg("URL Crawled: #{@t.target}")
+      lg("#{@linkStack.size-1} new links found")
+
+      @data.push(page_one.parseSEO)
+      lg("#{@data.size} pages scraped")
+      errlog("Bad URL -- #{@t.target}") if !@linkStack
+
+      @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
+      @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
+
+      self.async_crawl_and_collect()
+
+      @data.sort_by! {|x| x[0].length}
+    end
+  end
+end
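A hedged usage sketch for the new SEO fetcher; the option keys come from Fetch#initialize above, and the URL and page limit are placeholders:

```ruby
require 'retriever'

# Crawls up to :maxpages pages and collects one
# [url, page title, meta description, h1, h2] row per page.
fetch = Retriever::FetchSEO.new('http://www.cnet.com', :seo => true, :maxpages => '10')
fetch.dump  # prints "<target> SEO Metrics" followed by the collected rows
```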
data/lib/retriever/fetchsitemap.rb
CHANGED
@@ -1,38 +1,34 @@
 module Retriever
   class FetchSitemap < Fetch
-    attr_reader :sitemap
     def initialize(url,options)
       super
-      @
-
+      @data = [@t.target]
+      page_one = Retriever::Page.new(@t.source,@t)
+      @linkStack = page_one.parseInternalVisitable
       lg("URL Crawled: #{@t.target}")
-
+      lg("#{@linkStack.size-1} new links found")
       errlog("Bad URL -- #{@t.target}") if !@linkStack
 
       @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
       @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
-      @
+      @data.concat(@linkStack)
 
       self.async_crawl_and_collect()
 
-      @
-      @
-
-      self.dump(self.sitemap)
-      self.write(self.sitemap) if /CSV/i =~ @s
-      self.gen_xml(self.sitemap) if /XML/i =~ @s
+      @data.sort_by! {|x| x.length} if @data.size>1
+      @data.uniq!
     end
-    def gen_xml
+    def gen_xml
       f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
       f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
-      data.each do |url|
+      @data.each do |url|
        f << "<url><loc>#{url}</loc></url>"
      end
      f << "</urlset>"
      f.close
      puts "###############################"
      puts "File Created: sitemap-#{@t.host.split('.')[1]}.xml"
-      puts "Object Count: #{@
+      puts "Object Count: #{@data.size}"
      puts "###############################"
      puts
    end
data/lib/retriever/page.rb
ADDED
@@ -0,0 +1,68 @@
+module Retriever
+
+  class Page
+
+    HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
+    NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico|xml)/ix).freeze
+    HTTP_RE = Regexp.new(/^http/i).freeze
+    DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+
+    TITLE_RE = Regexp.new(/<title>(.*)<\/title>/i).freeze
+    DESC_RE = Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\'][^>]*content=[\"]([^\"]*)[\"][^>]*>/i).freeze
+    H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
+    H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze
+
+    attr_reader :links, :source, :t
+
+    def initialize(source,t)
+      @t = t
+      @source = source.encode('UTF-8', :invalid => :replace, :undef => :replace)
+      @links = nil
+    end
+
+    #recieves page source as string
+    #returns array of unique href links
+    def links
+      return @links if @links
+      return false if !@source
+      @links = @source.scan(HREF_CONTENTS_RE).map do |match| #filter some malformed URLS that come in, this is meant to be a loose filter to catch all reasonable HREF attributes.
+        link = match[0]
+        Link.new(@t.host, link).path
+      end.uniq
+    end
+
+    def parseInternal
+      links.select{ |linky| (@t.host_re =~ linky) }
+    end
+
+    def parseInternalVisitable
+      parseInternal.select{ |linky| (!(NONPAGE_EXT_RE =~linky)) }
+    end
+
+    def parseFiles
+      links.select{ |linky| (@t.file_re =~ linky)}
+    end
+
+    def title
+      TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : ""
+    end
+
+    def desc
+      DESC_RE =~ @source ? @source.match(DESC_RE)[1] : ""
+    end
+
+    def h1
+      H1_RE =~ @source ? @source.match(H1_RE)[1] : ""
+    end
+
+    def h2
+      H2_RE =~ @source ? @source.match(H2_RE)[1] : ""
+    end
+
+    def parseSEO
+      return [title,desc,h1,h2]
+    end
+
+  end
+
+end
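Page is the main new abstraction in 1.0.0, so a small sketch of it in isolation may help; the HTML string and target URL below are made up for illustration:

```ruby
require 'retriever'

# Target is only used here for its host / host_re / file_re; nothing is fetched.
t    = Retriever::Target.new('http://www.example.com/', /\.pdf\z/)
html = "<title>Hello</title> <h1>Intro</h1> <a href='/about.html'>about</a> "
page = Retriever::Page.new(html, t)

page.title     #=> "Hello"
page.h1        #=> "Intro"
page.parseSEO  #=> ["Hello", "", "Intro", ""]
page.links     # unique hrefs from the source, resolved via Retriever::Link
```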
data/lib/retriever/target.rb
CHANGED
@@ -1,17 +1,22 @@
 require 'open-uri'
 
 module Retriever
+
   class Target
+
     HTTP_RE = Regexp.new(/^http/i).freeze
     DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
-
-
+
+    attr_reader :host, :target, :host_re, :source, :file_re
+
+    def initialize(url,file_re=nil)
       url = "http://#{url}" if (!(HTTP_RE =~ url))
       fail "Bad URL" if (!(/\./ =~ url))
       new_uri = URI(url)
       @target = new_uri.to_s
       @host = new_uri.host
-      @host_re = Regexp.new(@host)
+      @host_re = Regexp.new(@host.sub('www.',''))
+      @file_re ||= file_re
     end
 
     def source
@@ -19,23 +24,29 @@ module Retriever
       begin
         resp = open(@target)
       rescue StandardError => e
-        #puts e.message + " ## " + url
-        #the trap abrt is nescessary to handle the SSL error
-        #for some ungodly reason it's the only way I found to handle it
        trap("ABRT"){
          puts "#{@target} failed SSL Certification Verification"
        }
        return false
      end
-
-
+      resp_url = resp.base_uri.to_s
+      if (@target != resp_url)
+        if @host_re =~ resp_url #if redirect URL is same hose, we want to re-sync our target with the right URL
+          new_t = Retriever::Target.new(resp_url)
+          @target = new_t.target
+          @host = new_t.host
+          return new_t.source
+        end
+        fail "Domain redirecting to new host: #{resp.base_uri.to_s}" #if it's not same host, we want to fail
      end
      resp = resp.read
      if resp == ""
        fail "Domain is not working. Try the non-WWW version."
      end
-
+      fail "Domain not working. Try HTTPS???" if !resp
+      return resp.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace) #consider using scrub from ruby 2.1? this misses some things
    end
 
  end
+
 end
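A short sketch of the Target changes in isolation; the URL and file regex are illustrative:

```ruby
require 'retriever'

t = Retriever::Target.new('www.example.com/docs', /\.pdf\z/)
t.target   #=> "http://www.example.com/docs"  (protocol added when missing)
t.host     #=> "www.example.com"
t.host_re  #=> /example.com/  ('www.' stripped, so same-host redirects still match)
t.file_re  #=> /\.pdf\z/

# Target#source now re-syncs @target/@host when the server redirects within the
# same host, and fails loudly when the redirect points at a different host.
```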
data/lib/retriever/version.rb
CHANGED
data/lib/retriever.rb
CHANGED
@@ -1,8 +1,12 @@
 require 'retriever/fetch'
 require 'retriever/fetchfiles'
 require 'retriever/fetchsitemap'
+require 'retriever/fetchseo'
+require 'retriever/cli'
 require 'retriever/link'
 require 'retriever/target'
+require 'retriever/page'
+require 'retriever/openuri-redirect-patch'
 
 module Retriever
 
data/readme.md
CHANGED
@@ -17,7 +17,7 @@ Install the gem
 ```sh
 gem install rubyretriever
 ```
-
+
 **Example: Sitemap mode**
 ```sh
 rr --sitemap CSV --progress --limit 100 http://www.cnet.com
@@ -31,14 +31,25 @@ This would go to http://www.cnet.com and map it until it crawled a max of 100 pa
 
 **Example: File Harvesting mode**
 ```sh
-rr --files pdf --progress --limit 1000 --
+rr --files pdf --progress --limit 1000 --out hubspot http://www.hubspot.com
+```
+OR -- SAME COMMAND
+```sh
+rr -f pdf -p -l 100 http://www.hubspot.com
+```
+
+This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 100 pages, and then it would write out a list of filepaths to a csv named hubspot (based on the website host name. Optionally we could have the script then go and autodownload all the files by adding the -a/--auto flag -- however this current example would just dump to stdout a list of all the PDF's found.
+
+**Example: SEO mode**
+```sh
+rr --seo --progress --limit 100 --out cnet-seo http://www.cnet.com
 ```
 OR -- SAME COMMAND
 ```sh
-rr -
+rr -e -p -l 10 -o cnet-seo http://www.cnet.com
 ```
 
-This would go to http://www.
+This would go to http://www.cnet.com and crawl a max of 100 pages, during which it would be collecting the onpage SEO fields on those pages - currently this means [url, page title, meta description, h1 text, h2 text], and then it would write it out to a csv named cnet-seo.
 
 
 command-line arguments
@@ -47,10 +58,11 @@ Usage: rr [MODE FLAG] [OPTIONS] Target_URL
 
 Where MODE FLAG is required, and is either:
   -s, --sitemap FORMAT (only accepts CSV or XML atm)
-  -f, --files FILETYPE
+  -f, --files FILETYPE
+  -e, --seo
 
 and OPTIONS is the applicable:
-  -o, --out FILENAME *Dump
+  -o, --out FILENAME *Dump fetch data as CSV*
   -p, --progress *Outputs a progressbar*
   -v, --verbose *Output more information*
   -l, --limit PAGE_LIMIT_# *set a max on the total number of crawled pages*
data/spec/link_spec.rb
CHANGED
@@ -2,8 +2,8 @@ require 'retriever'
 
 describe "Link" do
 
-
-  let(:links) {
+  t = Retriever::Target.new("http://www.cnet.com/reviews/")
+  let(:links) { Retriever::Page.new(@source,t).links }
 
   it "collects links in anchor tags" do
     @source = (<<SOURCE).strip
data/spec/page_spec.rb
ADDED
@@ -0,0 +1,94 @@
+require 'retriever/page'
+require 'retriever/fetch'
+
+t = Retriever::Target.new("http://www.cnet.com/reviews/",/\.exe\z/)
+
+describe "Page" do
+
+  describe "#links" do
+    let (:links){Retriever::Page.new(@source,t).links}
+    it "collects all unique href links on the page" do
+      @source = (<<SOURCE).strip
+      <a href='www.cnet.com/download.exe'>download</a>
+      <a href='/test.html'>test</a>
+      <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'></a>
+      <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
+      <a href='http://www.yahoo.com/test/'>yahoo</a>
+SOURCE
+
+      expect(links).to have(4).items
+    end
+  end
+
+  describe "#parseInternal" do
+    let (:links){Retriever::Page.new(@source,t).parseInternal}
+    it "filters links by host" do
+      @source = (<<SOURCE).strip
+      <a href='http://www.cnet.com/'>download</a>
+      <a href='http://www.yahoo.com/test/'>yahoo</a>
+SOURCE
+
+      expect(links).to have(1).items
+    end
+  end
+
+  describe "#parseInternalVisitable" do
+    let (:links){Retriever::Page.new(@source,t).parseInternalVisitable}
+    it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
+      @source = (<<SOURCE).strip
+      <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
+SOURCE
+      expect(links).to have(0).items
+    end
+  end
+
+  describe "#parseFiles" do
+    let (:links){Retriever::Page.new(@source,t).parseFiles}
+    it "filters links by filetype" do
+      @source = (<<SOURCE).strip
+      <a href='www.cnet.com/download.exe'>download</a>
+      http://www.google.com
+      <a href='/test.html'>test</a>
+SOURCE
+      expect(links).to have(1).items
+    end
+  end
+
+  describe "#title" do
+    let (:page){Retriever::Page.new(@source,t)}
+    it "returns page title" do
+      @source = (<<SOURCE).strip
+      <title>test</title>
+SOURCE
+      expect(page.title).to eq('test')
+    end
+  end
+  describe "#desc" do
+    let (:page){Retriever::Page.new(@source,t)}
+    it "returns meta description" do
+      @source = (<<SOURCE).strip
+      <meta name='description' content="test2 ">
+SOURCE
+      expect(page.desc).to eq('test2 ')
+    end
+  end
+  describe "#h1" do
+    let (:page){Retriever::Page.new(@source,t)}
+    it "returns h1 text" do
+      @source = (<<SOURCE).strip
+      <h1>test 3</h1>
+SOURCE
+      expect(page.h1).to eq('test 3')
+    end
+  end
+  describe "#h2" do
+    let (:page){Retriever::Page.new(@source,t)}
+    it "returns h2 text" do
+      @source = (<<SOURCE).strip
+      <h2> test 4 </h2>
+SOURCE
+      expect(page.h2).to eq(' test 4 ')
+    end
+  end
+
+end
data/spec/retriever_spec.rb
CHANGED
@@ -1,52 +1,5 @@
 require 'retriever'
 
-r = Retriever::Fetch.new("http://www.cnet.com/reviews/",{:file_ext => "exe",:maxpages => "100"})
-test_html = "<a href='www.cnet.com/download.exe'>download</a>
-http://www.google.com
-<a href='/test.html'>test</a>
-<a href='http://www.cnet.com/products/gadgets#view-comments'>gadgets comments</a>
-<a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'></a>
-<a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
-<a href='http://www.yahoo.com/test/'>yahoo</a>
-test.com
-<link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
-<a href='cpage_18'>about</a>"
-
-links_collection = r.fetchLinks(test_html)
-
 describe "Fetch" do
-
-  describe "#fetchLinks" do
-    it "collects all unique href links on the page" do
-      expect(links_collection).to have(6).items
-    end
-  end
-
-  describe "#parseInternalLinks" do
-    let (:filtered_links) {r.parseInternalLinks(links_collection)}
-    it "filters links by host" do
-      filtered_links.each do |link|
-        expect(link).to include("www.cnet.com")
-      end
-    end
-  end
-
-  describe "#parseInternalVisitableLinks" do
-    let (:filtered_links) {r.parseInternalVisitableLinks(links_collection)}
-    it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
-      filtered_links.each do |link|
-        expect(link).to_not (include(".css",".js",".png",".gif",".jpg"))
-      end
-    end
-  end
-
-  describe "#parseFiles" do
-    let(:file_list) {r.parseFiles(links_collection)}
-    it "filters links by filetype" do
-      file_list.each do |link|
-        expect(link).to include(".exe")
-      end
-    end
-  end
-
+
 end
data/spec/target_spec.rb
CHANGED
@@ -1,7 +1,7 @@
 require 'retriever'
 require 'open-uri'
 
-t = Retriever::Target.new("http://www.cnet.com/reviews/")
+t = Retriever::Target.new("http://www.cnet.com/reviews/",/\.exe\z/)
 
 describe "Target" do
 
@@ -14,7 +14,11 @@ describe "Target" do
   end
 
   it "creates host_re var" do
-    expect(t.host_re).to eq(/
+    expect(t.host_re).to eq(/cnet.com/)
+  end
+
+  it "creates file_re var (when provided)" do
+    expect(t.file_re).to eq(/\.exe\z/)
   end
 
   it "adds protocol to Target URL if none given" do
@@ -34,6 +38,7 @@ describe "Target" do
   it "fails if target redirects to new host" do
     expect{Retriever::Target.new("http://tinyurl.com/nkfkypa").source}.to raise_error
   end
+
 end
 
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 0.1.4
+  version: 1.0.0
 platform: ruby
 authors:
 - Joe Norton
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-
+date: 2014-06-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: em-synchrony
@@ -119,18 +119,23 @@ files:
 - LICENSE
 - bin/rr
 - lib/retriever.rb
+- lib/retriever/cli.rb
 - lib/retriever/fetch.rb
 - lib/retriever/fetchfiles.rb
+- lib/retriever/fetchseo.rb
 - lib/retriever/fetchsitemap.rb
 - lib/retriever/link.rb
+- lib/retriever/openuri-redirect-patch.rb
+- lib/retriever/page.rb
 - lib/retriever/target.rb
 - lib/retriever/version.rb
 - readme.md
 - spec/link_spec.rb
+- spec/page_spec.rb
 - spec/retriever_spec.rb
 - spec/spec_helper.rb
 - spec/target_spec.rb
-homepage: http://
+homepage: http://softwarebyjoe.com/rubyretriever/
 licenses:
 - MIT
 metadata: {}