rubyretriever 1.0.3 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 0aa827221b6c3034f4463c376b29e47b740580e6
- data.tar.gz: c800c5820d62e45c140dea2d94140a3a9636aeff
+ metadata.gz: 02c2b2530b3b83eb12325443c3c9214d977d8a56
+ data.tar.gz: f1a8b163ae3c3caed750eacb7724c2c38693ccd2
  SHA512:
- metadata.gz: fba8ef21412309bdfe3435caf8fe4ec01d197cce5cb1698fc9536bc8127bcd9c45d51c6a908bb642c5bfb3ff9caca1e73f98c78669036dc94c38704412a0461a
- data.tar.gz: 092058e59d4c591be1d5ceab99dfa4219f86929897d2d1ab25859d03bf918622f4289ab8bca5fff4e87d6b9af228af80ac129deec1122e18d2f33db38544dea6
+ metadata.gz: 68c56e76fff7cee17b2e48251413df199cbe46df84bb8b51f0333510d9507627f59714bd7c4fb4a502796cefa76d362b9dbe912ea1a4d419b356f0de403e606a
+ data.tar.gz: 2f149643ba65999c783bf17bbcbffc92648eec3979f610df2fdd101c55a5a58607c2087f47911d65457d41567f5f574b79ad0a25cfac82e61c7255c1170e3e64
data/bin/rr CHANGED
@@ -4,73 +4,73 @@ require 'retriever'
  require 'optparse'

  options = {}
- optparse = OptionParser.new do|opts|
+ optparse = OptionParser.new do |opts|
  # Set a banner, displayed at the top
  # of the help screen.
- opts.banner = "Usage: rr [MODE FLAG] [options] Target_URL"
- options[:sitemap] = false
- opts.on( '-s', '--sitemap [FORMAT]', 'MODE FLAG: Sitemap mode' ) do |output_type|
- options[:sitemap] = output_type||''
+ opts.banner = 'Usage: rr [MODE FLAG] [options] Target_URL'
+ options['sitemap'] = false
+ opts.on('-s', '--sitemap [FORMAT]', 'MODE FLAG: Sitemap mode') do |output_type|
+ options['sitemap'] = output_type || ''
  end
- options[:fileharvest] = false
- opts.on( '-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode' ) do |file_ext|
- options[:fileharvest] = file_ext
+ options['fileharvest'] = false
+ opts.on('-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode') do |file_ext|
+ options['fileharvest'] = file_ext
  end
- options[:seo] = false
- opts.on( '-e', '--seo', 'MODE FLAG: SEO mode' ) do
- options[:seo] = true
+ options['seo'] = false
+ opts.on('-e', '--seo', 'MODE FLAG: SEO mode') do
+ options['seo'] = true
  end
- options[:filename] = nil
- opts.on( '-o', '--out FILENAME', 'Dump output to selected filename' ) do|filename|
- options[:filename] = filename
+ options['filename'] = nil
+ opts.on('-o', '--out FILENAME', 'Dump output to selected filename') do |filename|
+ options['filename'] = filename
  end
  # Define the options, and what they do
- options[:verbose] = false
- opts.on( '-v', '--verbose', 'Output more information' ) do
- options[:verbose] = true
+ options['verbose'] = false
+ opts.on('-v', '--verbose', 'Output more information') do
+ options['verbose'] = true
  end
- options[:progress] = false
- opts.on( '-p', '--progress', 'Output progress bar' ) do
- options[:progress] = true
+ options['progress'] = false
+ opts.on('-p', '--progress', 'Output progress bar') do
+ options['progress'] = true
  end
- options[:maxpages] = false
- opts.on( '-l', '--limit PAGE_LIMIT_#', 'set a max on the total number of crawled pages' ) do |maxpages|
+ options['maxpages'] = false
+ opts.on('-l', '--limit PAGE_LIMIT_#', 'set a max on the total number of crawled pages') do |maxpages|
  options[:maxpages] = maxpages
  end
- options[:autodown] = false
- opts.on( '-a', '--auto', 'Automatically download all files of filetype located' ) do
+ options['autodown'] = false
+ opts.on('-a', '--auto', 'Automatically download all files of filetype located') do
  options[:autodown] = true
  end
  # This displays the help screen, all programs are
  # assumed to have this option.
- opts.on( '-h', '--help', 'Display this screen' ) do
+ opts.on('-h', '--help', 'Display this screen') do
  puts opts
  exit
  end
  end
-
- optparse.parse!
+
+ optparse.parse!
  if ARGV[0].nil?
- abort("###Missing Required Argument\nUsage: rr [mode] [options] Target_URL")
+ abort('###Missing Required Argument\nUsage: rr [mode] [options] Target_URL')
  end

  ARGV.each do|q|
  if options[:verbose]
- puts "###############################"
- puts "### [RubyRetriever]"
- puts "### Creating Sitemap" if options[:sitemap]
- puts "### Outputting in format: #{options[:sitemap]}" if options[:sitemap]
- puts "### Performing File Harvest" if options[:fileharvest]
- puts "### Searching for file extension: #{options[:fileharvest]} pages" if (options[:fileharvest])
- puts "### Performing SEO Scrape" if options[:seo]
- puts "### Writing output to filename: #{options[:filename]}" if options[:filename]
- puts "### Being verbose"
- puts "### Stopping after #{options[:maxpages]} pages"
+ puts '###############################'
+ puts '### [RubyRetriever]'
+ puts '### Creating Sitemap' if options['sitemap']
+ puts "### Outputting in format: #{options['sitemap']}" if options['sitemap']
+ puts '### Performing File Harvest' if options['fileharvest']
+ puts "### Searching for file extension: #{options['fileharvest']} pages" if options['fileharvest']
+ puts '### Performing SEO Scrape' if options['seo']
+ puts "### Writing output to filename: #{options['filename']}" if options['filename']
+ puts '### Being verbose'
+ puts "### Stopping after #{options['maxpages']} pages"
  end
- puts "###############################"
+ puts '###############################'
  puts "### [RubyRetriever] go fetch #{q}"
  Retriever::CLI.new(q, options)
- puts "### [RubyRetriever] is done."
- puts "###############################"
+ puts '### [RubyRetriever] is done.'
+ puts '###############################'
  puts
  end
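
The option parser above defines the full command-line surface of the new bin/rr. For reference, an illustrative invocation of each mode; the target URL and output filename are placeholders:

    rr --sitemap xml --progress http://www.example.com
    rr --files pdf --auto --out example-pdfs http://www.example.com
    rr --seo --verbose http://www.example.com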
data/lib/retriever/cli.rb CHANGED
@@ -1,27 +1,21 @@
  module Retriever
- class CLI
- def initialize(url,options)
-
- #kick off the fetch mode of choice
- if options[:fileharvest]
- @fetch = Retriever::FetchFiles.new(url, options)
- elsif options[:sitemap]
- @fetch = Retriever::FetchSitemap.new(url, options)
- elsif options[:seo]
- @fetch = Retriever::FetchSEO.new(url, options)
- else
- fail "### Error: No Mode Selected"
- end
-
- #all fetch modes
- @fetch.dump
- @fetch.write if options[:filename]
-
- #fileharvest only
- @fetch.autodownload if options[:autodown] && options[:fileharvest]
-
- #sitemap only
- @fetch.gen_xml if /XML/i =~ options[:sitemap].to_s
- end
- end
- end
+ #
+ class CLI
+ def initialize(url, options)
+ # kick off the fetch mode of choice
+ if options['fileharvest']
+ @fetch = Retriever::FetchFiles.new(url, options)
+ elsif options['sitemap']
+ @fetch = Retriever::FetchSitemap.new(url, options)
+ elsif options['seo']
+ @fetch = Retriever::FetchSEO.new(url, options)
+ else
+ fail '### Error: No Mode Selected'
+ end
+ @fetch.dump
+ @fetch.write if options['filename']
+ @fetch.autodownload if options['autodown'] && options['fileharvest']
+ @fetch.gen_xml if /XML/i =~ options['sitemap'].to_s
+ end
+ end
+ end
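
The rewritten CLI keys the options hash with strings rather than symbols. A minimal programmatic sketch of the same dispatch, assuming the API exactly as it appears in this diff; the URL and option values are placeholders:

    require 'retriever'

    # Equivalent of `rr --seo --verbose http://www.example.com`:
    # 'seo' selects Retriever::FetchSEO, then dump prints the results.
    options = { 'seo' => true, 'verbose' => true, 'filename' => nil }
    Retriever::CLI.new('http://www.example.com', options)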
@@ -7,191 +7,214 @@ require 'csv'
  require 'bloomfilter-rb'

  module Retriever
- class Fetch
- attr_reader :maxPages, :t
+ #
+ class Fetch
+ attr_reader :max_pages, :t
+ # given target URL and RR options, creates a fetch object.
+ # There is no direct output
+ # this is a parent class that the other fetch classes build off of.
+ def initialize(url, options)
+ @connection_tally = {
+ :success => 0,
+ :error => 0,
+ :error_client => 0,
+ :error_server => 0
+ }
+ # OPTIONS
+ @prgrss = options['progress']
+ @max_pages = options['maxpages'] ? options['maxpages'].to_i : 100
+ @v = options['verbose']
+ @output = options['filename']
+ @fh = options['fileharvest']
+ @file_ext = @fh.to_s
+ @s = options['sitemap']
+ @seo = options['seo']
+ @autodown = options['autodown']
+ #
+ if @fh
+ temp_ext_str = '.' + @file_ext + '\z'
+ @file_re = Regexp.new(temp_ext_str).freeze
+ else
+ # when FH is not true, and autodown is true
+ errlog('Cannot AUTODOWNLOAD when not in FILEHARVEST MODE') if @autodown
+ end
+ if @prgrss
+ # verbose & progressbar conflict
+ errlog('CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p') if @v
+ prgress_vars = {
+ :title => 'Pages',
+ :starting_at => 1,
+ :total => @max_pages,
+ :format => '%a |%b>%i| %c/%C %t'
+ }
+ @progressbar = ProgressBar.create(prgress_vars)
+ end
+ @t = Retriever::Target.new(url, @file_re)
+ @output = "rr-#{@t.host.split('.')[1]}" if @fh && !@output
+ @already_crawled = BloomFilter::Native.new(
+ :size => 1_000_000,
+ :hashes => 5,
+ :seed => 1,
+ :bucket => 8,
+ :raise => false
+ )
+ @already_crawled.insert(@t.target)
+ end

- def initialize(url,options) #given target URL and RR options, creates a fetch object. There is no direct output, this is a parent class that the other fetch classes build off of.
- @connection_tally = {
- :success => 0,
- :error => 0,
- :error_client => 0,
- :error_server => 0
- }
- #OPTIONS
- @prgrss = options[:progress] ? options[:progress] : false
- @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
- @v= options[:verbose] ? true : false
- @output=options[:filename] ? options[:filename] : false
- @fh = options[:fileharvest] ? options[:fileharvest] : false
- @file_ext = @fh.to_s
- @s = options[:sitemap] ? options[:sitemap] : false
- @seo = options[:seo] ? true : false
- @autodown = options[:autodown] ? true : false
- #
- if @fh
- tempExtStr = "."+@file_ext+'\z'
- @file_re = Regexp.new(tempExtStr).freeze
- else
- errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
- end
- if @prgrss
- errlog("CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p") if @v #verbose & progressbar conflict
- prgressVars = {
- :title => "Pages Crawled",
- :starting_at => 1,
- :total => @maxPages,
- :format => '%a |%b>%i| %c/%C %t',
- }
- @progressbar = ProgressBar.create(prgressVars)
- end
- @t = Retriever::Target.new(url,@file_re)
- @already_crawled = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
- @already_crawled.insert(@t.target)
- if (@fh && !@output)
- @output = "rr-#{@t.host.split('.')[1]}"
- end
- fail "bad page source on target -- try HTTPS?" if !@t.source
- end
- def errlog(msg)
- raise "ERROR: #{msg}"
- end
- def lg(msg)
- puts "### #{msg}" if @v
- end
- def dump #prints current data collection to STDOUT, meant for CLI use.
- puts "###############################"
- if @v
- puts "Connection Tally:"
- puts @connection_tally.to_s
- puts "###############################"
- end
- if @s
- puts "#{@t.target} Sitemap"
- puts "Page Count: #{@data.size}"
- elsif @fh
- puts "Target URL: #{@t.target}"
- puts "Filetype: #{@file_ext}"
- puts "File Count: #{@data.size}"
- elsif @seo
- puts "#{@t.target} SEO Metrics"
- puts "Page Count: #{@data.size}"
- else
- fail "ERROR - Cannot dump - Mode Not Found"
- end
- puts "###############################"
- @data.each do |line|
- puts line
- end
- puts "###############################"
- puts
- end
- def write #writes current data collection out to CSV in current directory
- if @output
- i = 0
- CSV.open("#{@output}.csv", "w") do |csv|
- if ((i == 0) && @seo)
- csv << ['URL','Page Title','Meta Description','H1','H2']
- i +=1
- end
- @data.each do |entry|
- csv << entry
- end
- end
- puts "###############################"
- puts "File Created: #{@output}.csv"
- puts "Object Count: #{@data.size}"
- puts "###############################"
- puts
- end
- end
- def async_crawl_and_collect() #iterates over the excisting @linkStack, running asyncGetWave on it until we reach the @maxPages value.
- while (@already_crawled.size < @maxPages)
- if @linkStack.empty?
- if @prgrss
- @progressbar.log("Can't find any more links. Site might be completely mapped.")
- else
- lg("Can't find any more links. Site might be completely mapped.")
- end
- break;
- end
- new_links_arr = self.asyncGetWave()
- next if (new_links_arr.nil? || new_links_arr.empty?)
- new_link_arr = new_links_arr-@linkStack #set operations to see are these in our previous visited pages arr?
- @linkStack.concat(new_links_arr).uniq!
- @data.concat(new_links_arr) if @s
- end
- @progressbar.finish if @prgrss #if we are done, let's make sure progress bar says we are done
- end
- def good_response?(resp, url) #returns true is resp is ok to continue process, false is we need to 'next' it
- return false if !resp
- if resp.response_header.redirection? #we got redirected
- loc = resp.response_header.location
- lg("#{url} Redirected to #{loc}")
- if t.host_re =~ loc #if being redirected to same host, let's add to linkstack
- @linkStack.push(loc) if !@already_crawled.include?(loc) #but only if we haven't already crawled it
- lg("--Added to linkStack for later")
- return false
- end
- lg("Redirection outside of target host. No - go. #{loc}")
- return false
- end
- if (!resp.response_header.successful?) #if webpage is not text/html, let's not continue and lets also make sure we dont re-queue it
- lg("UNSUCCESSFUL CONNECTION -- #{url}")
- @connection_tally[:error] += 1
- @connection_tally[:error_server] += 1 if resp.response_header.server_error?
- @connection_tally[:error_client] += 1 if resp.response_header.client_error?
- return false
- end
- if (!(resp.response_header['CONTENT_TYPE'].include?("text/html"))) #if webpage is not text/html, let's not continue and lets also make sure we dont re-queue it
- @already_crawled.insert(url)
- @linkStack.delete(url)
- lg("Page Not text/html -- #{url}")
- return false
- end
- @connection_tally[:success] += 1
- return true
- end
+ def errlog(msg)
+ fail "ERROR: #{msg}"
+ end

- def asyncGetWave() #send a new wave of GET requests, using current @linkStack
- new_stuff = []
- EM.synchrony do
- lenny = 0
- concurrency = 10
- EM::Synchrony::FiberIterator.new(@linkStack, concurrency).each do |url|
- next if (@already_crawled.size >= @maxPages)
- if @already_crawled.include?(url)
- @linkStack.delete(url)
- next
- end
- resp = EventMachine::HttpRequest.new(url).get
- next if !good_response?(resp,url)
- new_page = Retriever::Page.new(resp.response,@t)
- lg("Page Fetched: #{url}")
- @already_crawled.insert(url)
- if @prgrss
- @progressbar.increment if @already_crawled.size < @maxPages
- end
- if @seo
- seos = [url]
- seos.concat(new_page.parseSEO)
- @data.push(seos)
- lg("--page SEO scraped")
- end
- if new_page.links
- lg("--#{new_page.links.size} links found")
- internal_links_arr = new_page.parseInternalVisitable
- new_stuff.push(internal_links_arr)
- if @fh
- filez = new_page.parseFiles
- @data.concat(filez) if !filez.empty?
- lg("--#{filez.size} files found")
- end
- end
- end
- new_stuff = new_stuff.flatten # all completed requests
- EventMachine.stop
- end
- new_stuff.uniq!
- end
- end
- end
+ def lg(msg)
+ puts "### #{msg}" if @v
+ end
+
+ # prints current data collection to STDOUT
+ def dump
+ puts '###############################'
+ if @v
+ puts 'Connection Tally:'
+ puts @connection_tally.to_s
+ puts '###############################'
+ end
+ if @s
+ puts "#{@t.target} Sitemap"
+ puts "Page Count: #{@data.size}"
+ elsif @fh
+ puts "Target URL: #{@t.target}"
+ puts "Filetype: #{@file_ext}"
+ puts "File Count: #{@data.size}"
+ elsif @seo
+ puts "#{@t.target} SEO Metrics"
+ puts "Page Count: #{@data.size}"
+ else
+ fail 'ERROR - Cannot dump - Mode Not Found'
+ end
+ puts '###############################'
+ @data.each do |line|
+ puts line
+ end
+ puts '###############################'
+ puts
+ end
+
+ # writes current data collection out to CSV in current directory
+ def write
+ return false unless @output
+ i = 0
+ CSV.open("#{@output}.csv", 'w') do |csv|
+ if (i == 0) && @seo
+ csv << ['URL', 'Page Title', 'Meta Description', 'H1', 'H2']
+ i += 1
+ end
+ @data.each do |entry|
+ csv << entry
+ end
+ end
+ puts '###############################'
+ puts "File Created: #{@output}.csv"
+ puts "Object Count: #{@data.size}"
+ puts '###############################'
+ puts
+ end
+
+ # iterates over the existing @link_stack
+ # running until we reach the @max_pages value.
+ def async_crawl_and_collect
+ while @already_crawled.size < @max_pages
+ if @link_stack.empty?
+ if @prgrss
+ @progressbar.log("Can't find any more links.")
+ else
+ lg("Can't find any more links.")
+ end
+ break
+ end
+ new_links_arr = process_link_stack
+ next if new_links_arr.nil? || new_links_arr.empty?
+ # set operations to see are these in our previous visited pages arr
+ new_links_arr -= @link_stack
+ @link_stack.concat(new_links_arr).uniq!
+ @data.concat(new_links_arr) if @s
+ end
+ # done, make sure progress bar says we are done
+ @progressbar.finish if @prgrss
+ end
+
+ # returns true is resp is ok to continue
+ def good_response?(resp, url)
+ return false unless resp
+ hdr = resp.response_header
+ if hdr.redirection?
+ loc = hdr.location
+ lg("#{url} Redirected to #{loc}")
+ if t.host_re =~ loc
+ @link_stack.push(loc) unless @already_crawled.include?(loc)
+ lg('--Added to linkStack for later')
+ return false
+ end
+ lg("Redirection outside of target host. No - go. #{loc}")
+ return false
+ end
+ # lets not continue if unsuccessful connection
+ unless hdr.successful?
+ lg("UNSUCCESSFUL CONNECTION -- #{url}")
+
+ @connection_tally[:error] += 1
+ @connection_tally[:error_server] += 1 if hdr.server_error?
+ @connection_tally[:error_client] += 1 if hdr.client_error?
+ return false
+ end
+ # let's not continue if not text/html
+ unless hdr['CONTENT_TYPE'].include?('text/html')
+ @already_crawled.insert(url)
+ @link_stack.delete(url)
+ lg("Page Not text/html -- #{url}")
+ return false
+ end
+ @connection_tally[:success] += 1
+ true
+ end
+
+ # send a new wave of GET requests, using current @link_stack
+ def process_link_stack
+ new_stuff = []
+ EM.synchrony do
+ concurrency = 10
+ EM::Synchrony::FiberIterator.new(@link_stack, concurrency).each do |url|
+ next if @already_crawled.size >= @max_pages
+ next if @already_crawled.include?(url)
+
+ resp = EventMachine::HttpRequest.new(url).get
+
+ next unless good_response?(resp, url)
+ lg("Page Fetched: #{url}")
+ @already_crawled.insert(url)
+
+ new_page = Retriever::Page.new(resp.response, @t)
+ if @prgrss
+ @progressbar.increment if @already_crawled.size < @max_pages
+ end
+ if @seo
+ seos = [url]
+ seos.concat(new_page.parse_seo)
+ @data.push(seos)
+ lg('--page SEO scraped')
+ end
+ next if new_page.links.size == 0
+ lg("--#{new_page.links.size} links found")
+ internal_links_arr = new_page.parse_internal_visitable
+ new_stuff.push(internal_links_arr)
+ if @fh
+ filez = new_page.parse_files
+ @data.concat(filez) unless filez.empty?
+ lg("--#{filez.size} files found")
+ end
+ end
+ new_stuff = new_stuff.flatten # all completed requests
+ EventMachine.stop
+ end
+ new_stuff.uniq!
+ end
+ end
+ end
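
The crawl bookkeeping above now reads its options with string keys and builds @already_crawled from bloomfilter-rb. A standalone sketch of how that filter behaves with the same parameters; the URLs are placeholders:

    require 'bloomfilter-rb'

    # Same configuration Fetch#initialize passes to BloomFilter::Native.
    already_crawled = BloomFilter::Native.new(
      :size   => 1_000_000,
      :hashes => 5,
      :seed   => 1,
      :bucket => 8,
      :raise  => false
    )
    already_crawled.insert('http://www.example.com/')
    already_crawled.include?('http://www.example.com/')      # => true
    already_crawled.include?('http://www.example.com/about') # => false (very likely; false positives are possible)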
@@ -1,65 +1,70 @@
  module Retriever
- class FetchFiles < Fetch
- def initialize(url,options) #recieves target url and RR options, returns an array of all unique files (based on given filetype) found on the site
- super
- @data = []
- page_one = Retriever::Page.new(@t.source,@t)
- @linkStack = page_one.parseInternalVisitable
- lg("URL Crawled: #{@t.target}")
- lg("#{@linkStack.size-1} new links found")
+ # recieves target url and RR options
+ # returns an array of all unique files (based on given filetype)
+ # found on the target site
+ class FetchFiles < Fetch
+ def initialize(url, options)
+ super
+ @data = []
+ page_one = Retriever::Page.new(@t.source, @t)
+ @link_stack = page_one.parse_internal_visitable
+ lg("URL Crawled: #{@t.target}")
+ lg("#{@link_stack.size - 1} new links found")

- tempFileCollection = page_one.parseFiles
- @data.concat(tempFileCollection) if tempFileCollection.size>0
- lg("#{@data.size} new files found")
- errlog("Bad URL -- #{@t.target}") if !@linkStack
+ temp_file_collection = page_one.parse_files
+ @data.concat(tempFileCollection) if temp_file_collection.size > 0
+ lg("#{@data.size} new files found")
+ errlog("Bad URL -- #{@t.target}") unless @link_stack
+ @link_stack.delete(@t.target)

- @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
- @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
+ async_crawl_and_collect

- self.async_crawl_and_collect()
+ @data.sort_by! { |x| x.length }
+ @data.uniq!
+ end

- @data.sort_by! {|x| x.length}
- @data.uniq!
- end
- def download_file(path) #given valid url, downloads file to current directory in /rr-downloads/
- arr = path.split('/')
- shortname = arr.pop
- puts "Initiating Download to: #{'/rr-downloads/' + shortname}"
- File.open(shortname, "wb") do |saved_file|
- open(path) do |read_file|
- saved_file.write(read_file.read)
- end
- end
- puts " SUCCESS: Download Complete"
- end
- def autodownload() #when autodownload option is true, this will automatically go through the fetched file URL collection and download each one.
- lenny = @data.count
- puts "###################"
- puts "### Initiating Autodownload..."
- puts "###################"
- puts "#{lenny} - #{@file_ext}'s Located"
- puts "###################"
- if File::directory?("rr-downloads")
- Dir.chdir("rr-downloads")
- else
- puts "creating rr-downloads Directory"
- Dir.mkdir("rr-downloads")
- Dir.chdir("rr-downloads")
- end
- file_counter = 0
- @data.each do |entry|
- begin
- self.download_file(entry)
- file_counter+=1
- lg(" File [#{file_counter} of #{lenny}]")
- puts
- rescue StandardError => e
- puts "ERROR: failed to download - #{entry}"
- puts e.message
- puts
- end
- end
- Dir.chdir("..")
- end
- end
- end
+ def download_file(path)
+ # given valid url, downloads file to current directory in /rr-downloads/
+ arr = path.split('/')
+ shortname = arr.pop
+ puts "Initiating Download to: '/rr-downloads/' + #{shortname}"
+ File.open(shortname, 'wb') do |saved_file|
+ open(path) do |read_file|
+ saved_file.write(read_file.read)
+ end
+ end
+ puts ' SUCCESS: Download Complete'
+ end
+
+ def autodownload
+ # go through the fetched file URL collection and download each one.
+ lenny = @data.count
+ puts '###################'
+ puts '### Initiating Autodownload...'
+ puts '###################'
+ puts "#{lenny} - #{@file_ext}'s Located"
+ puts '###################'
+ if File.directory?('rr-downloads')
+ Dir.chdir('rr-downloads')
+ else
+ puts 'creating rr-downloads Directory'
+ Dir.mkdir('rr-downloads')
+ Dir.chdir('rr-downloads')
+ end
+ file_counter = 0
+ @data.each do |entry|
+ begin
+ download_file(entry)
+ file_counter += 1
+ lg(' File [#{file_counter} of #{lenny}]')
+ puts
+ rescue StandardError => e
+ puts 'ERROR: failed to download - #{entry}'
+ puts e.message
+ puts
+ end
+ end
+ Dir.chdir('..')
+ end
+ end
+ end
@@ -1,23 +1,25 @@
  module Retriever
- class FetchSEO < Fetch
- def initialize(url,options) #recieves target url and RR options, returns an array of onpage SEO related fields on all unique pages found on the site
- super
- @data = []
- page_one = Retriever::Page.new(@t.source,@t)
- @linkStack = page_one.parseInternalVisitable
- lg("URL Crawled: #{@t.target}")
- lg("#{@linkStack.size-1} new links found")
+ #
+ class FetchSEO < Fetch
+ # recieves target url and RR options
+ # returns an array of onpage SEO related fields
+ # on all unique pages found on the site
+ def initialize(url, options)
+ super
+ @data = []
+ page_one = Retriever::Page.new(@t.source, @t)
+ lg("URL Crawled: #{@t.target}")

- @data.push(page_one.parseSEO)
- lg("#{@data.size} pages scraped")
- errlog("Bad URL -- #{@t.target}") if !@linkStack
+ @link_stack = page_one.parse_internal_visitable
+ errlog("Bad URL -- #{@t.target}") unless @link_stack
+ lg("#{@link_stack.size - 1} links found")
+ @link_stack.delete(@t.target)

- @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
- @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
+ @data.push(page_one.parse_seo)

- self.async_crawl_and_collect()
+ async_crawl_and_collect

- @data.sort_by! {|x| x[0].length}
- end
- end
- end
+ @data.sort_by! { |x| x[0].length }
+ end
+ end
+ end
@@ -1,36 +1,41 @@
  module Retriever
- class FetchSitemap < Fetch
- def initialize(url,options) #recieves target URL and RR options, returns an array of all unique pages found on the site
- super
- @data = [@t.target]
- page_one = Retriever::Page.new(@t.source,@t)
- @linkStack = page_one.parseInternalVisitable
- lg("URL Crawled: #{@t.target}")
- lg("#{@linkStack.size-1} new links found")
- errlog("Bad URL -- #{@t.target}") if !@linkStack
+ #
+ class FetchSitemap < Fetch
+ # recieves target URL and RR options
+ # returns an array of all unique pages found on the site
+ def initialize(url, options)
+ super
+ @data = [@t.target]
+ page_one = Retriever::Page.new(@t.source, @t)
+ lg("URL Crawled: #{@t.target}")
+ @link_stack = page_one.parse_internal_visitable
+ errlog("Bad URL -- #{@t.target}") unless @link_stack
+ lg("#{@link_stack.size - 1} links found")

- @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
- @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
- @data.concat(@linkStack)
+ @link_stack.delete(@t.target)
+ @data.concat(@link_stack)

- self.async_crawl_and_collect()
+ async_crawl_and_collect

- @data.sort_by! {|x| x.length} if @data.size>1
- @data.uniq!
- end
- def gen_xml #produces valid XML sitemap based on page collection fetched. Writes to current directory.
- f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
- f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
- @data.each do |url|
- f << "<url><loc>#{url}</loc></url>"
- end
- f << "</urlset>"
- f.close
- puts "###############################"
- puts "File Created: sitemap-#{@t.host.split('.')[1]}.xml"
- puts "Object Count: #{@data.size}"
- puts "###############################"
- puts
- end
- end
- end
+ @data.sort_by! { |x| x.length } if @data.size > 1
+ @data.uniq!
+ end
+
+ # produces valid XML sitemap based on page collection fetched.
+ # Writes to current directory.
+ def gen_xml
+ f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
+ f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
+ @data.each do |url|
+ f << "<url><loc>#{url}</loc></url>"
+ end
+ f << '</urlset>'
+ f.close
+ puts '###############################'
+ puts "File Created: sitemap-#{@t.host.split('.')[1]}.xml"
+ puts "Object Count: #{@data.size}"
+ puts '###############################'
+ puts
+ end
+ end
+ end
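
Per gen_xml above, the sitemap is written to the current directory as a single line of XML. A hedged usage sketch, assuming the constructor and option keys exactly as shown in this diff; the domain is a placeholder:

    require 'retriever'

    # Crawls the placeholder site, then writes sitemap-example.xml, i.e.
    # <?xml version='1.0' encoding='UTF-8'?><urlset ...><url><loc>...</loc></url>...</urlset>
    fetch = Retriever::FetchSitemap.new('http://www.example.com', 'sitemap' => 'xml')
    fetch.gen_xml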
@@ -1,4 +1,5 @@
  module Retriever
+ #
  class Link
  HTTP_RE = Regexp.new(/^http/i).freeze
  SINGLE_SLASH_RE = Regexp.new(/^\/{1}[^\/]/).freeze
@@ -18,12 +19,15 @@ module Retriever

  return "http://#{host}#{link}" if SINGLE_SLASH_RE =~ link

- return "http:#{link}" if DOUBLE_SLASH_RE =~ link #link begins with '//' (maybe a messed up link?)
+ # link begins with '//'
+ return "http:#{link}" if DOUBLE_SLASH_RE =~ link

- return "http://#{host}/#{link}" if NO_SLASH_PAGE_RE =~ link #link uses relative path with no slashes at all, people actually this - imagine that.
+ # link uses relative path with no slashes at all
+ return "http://#{host}/#{link}" if NO_SLASH_PAGE_RE =~ link
  end

  private
+
  attr_reader :host, :link
  end
  end
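
The branches above resolve partial hrefs against the page's host. An illustrative set of inputs and outputs, assuming the regex constants not shown in this hunk behave as their names suggest and that Link is constructed the way Page#links does (Link.new(host, link).path); the host and paths are made up:

    host = 'www.example.com'

    Retriever::Link.new(host, '/about').path
    # => "http://www.example.com/about"          (single leading slash)
    Retriever::Link.new(host, '//cdn.example.com/app.js').path
    # => "http://cdn.example.com/app.js"         (protocol-relative '//')
    Retriever::Link.new(host, 'contact.html').path
    # => "http://www.example.com/contact.html"   (relative path, no slash)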
@@ -1,6 +1,7 @@
  module OpenURI
- def OpenURI.redirectable?(uri1, uri2) #nesc patch otherwise OPENURI blocks redirects to and from https
+ # nesc patch otherwise OPENURI blocks redirects to and from https
+ def OpenURI.redirectable?(uri1, uri2)
  uri1.scheme.downcase == uri2.scheme.downcase ||
  (/\A(?:http|https)\z/i =~ uri1.scheme && /\A(?:http|https)\z/i =~ uri2.scheme)
  end
- end
+ end
@@ -1,7 +1,6 @@
  module Retriever
-
+ #
  class Page
-
  HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
  NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico|xml)/ix).freeze
  HTTP_RE = Regexp.new(/^http/i).freeze
@@ -14,55 +13,55 @@ module Retriever

  attr_reader :links, :source, :t

- def initialize(source,t)
+ def initialize(source, t)
  @t = t
  @source = source.encode('UTF-8', :invalid => :replace, :undef => :replace)
  @links = nil
  end

- #recieves page source as string
- #returns array of unique href links
+ # recieves page source as string
+ # returns array of unique href links
  def links
  return @links if @links
- return false if !@source
- @links = @source.scan(HREF_CONTENTS_RE).map do |match| #filter some malformed URLS that come in, this is meant to be a loose filter to catch all reasonable HREF attributes.
+ return false unless @source
+ @links = @source.scan(HREF_CONTENTS_RE).map do |match|
+ # filter some malformed URLS that come in
+ # meant to be a loose filter to catch all reasonable HREF attributes.
  link = match[0]
  Link.new(@t.host, link).path
  end.uniq
  end

- def parseInternal
- links.select{ |linky| (@t.host_re =~ linky) }
+ def parse_internal
+ links.select { |linky| (@t.host_re =~ linky) }
  end

- def parseInternalVisitable
- parseInternal.select{ |linky| (!(NONPAGE_EXT_RE =~linky)) }
+ def parse_internal_visitable
+ parse_internal.select { |linky| (!(NONPAGE_EXT_RE =~ linky)) }
  end

- def parseFiles
- links.select{ |linky| (@t.file_re =~ linky)}
+ def parse_files
+ links.select { |linky| (@t.file_re =~ linky) }
  end

  def title
- TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : ""
+ TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : ''
  end

  def desc
- DESC_RE =~ @source ? @source.match(DESC_RE)[1] : ""
+ DESC_RE =~ @source ? @source.match(DESC_RE)[1] : ''
  end

  def h1
- H1_RE =~ @source ? @source.match(H1_RE)[1] : ""
+ H1_RE =~ @source ? @source.match(H1_RE)[1] : ''
  end

  def h2
- H2_RE =~ @source ? @source.match(H2_RE)[1] : ""
+ H2_RE =~ @source ? @source.match(H2_RE)[1] : ''
  end

- def parseSEO
- return [title,desc,h1,h2]
+ def parse_seo
+ [title, desc, h1, h2]
  end
-
  end
-
  end
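
A short sketch of how the renamed Page API reads after this change; the HTML string and target URL are placeholders, and the annotations describe intent rather than exact return values:

    require 'retriever'

    html = "<html><head><title>Example</title></head>" \
           "<body> <a href='http://www.example.com/about'>about</a></body></html>"
    t    = Retriever::Target.new('http://www.example.com')
    page = Retriever::Page.new(html, t)

    page.links                    # every href found in the source, resolved via Link#path
    page.parse_internal_visitable # same-host links, minus css/js/image/etc. URLs
    page.parse_seo                # => [title, desc, h1, h2]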
@@ -1,52 +1,44 @@
  require 'open-uri'

  module Retriever
-
+ #
  class Target
-
  HTTP_RE = Regexp.new(/^http/i).freeze
  DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
-
+
  attr_reader :host, :target, :host_re, :source, :file_re

- def initialize(url,file_re=nil)
- url = "http://#{url}" if (!(HTTP_RE =~ url))
- fail "Bad URL" if (!(/\./ =~ url))
+ def initialize(url, file_re = nil)
+ url = "http://#{url}" unless HTTP_RE =~ url
+ fail 'Bad URL' unless /\./ =~ url
  new_uri = URI(url)
  @target = new_uri.to_s
  @host = new_uri.host
- @host_re = Regexp.new(@host.sub('www.',''))
+ @host_re = Regexp.new(@host.sub('www.', ''))
  @file_re ||= file_re
  end

  def source
- resp = false
- begin
- resp = open(@target)
- rescue StandardError => e
- trap("ABRT"){
- puts "#{@target} failed SSL Certification Verification"
- }
- return false
- end
+ resp = open(@target)
  resp_url = resp.base_uri.to_s
- if (@target != resp_url)
- if @host_re =~ resp_url #if redirect URL is same hose, we want to re-sync our target with the right URL
- new_t = Retriever::Target.new(resp_url)
- @target = new_t.target
- @host = new_t.host
- return new_t.source
- end
- fail "Domain redirecting to new host: #{resp.base_uri.to_s}" #if it's not same host, we want to fail
+ if @target != resp_url
+ fail "Domain redirecting: #{resp_url}" unless @host_re =~ resp_url
+ # if redirect URL is same host, we want to re-sync @target
+ return resync_target_and_return_source(resp_url)
  end
  resp = resp.read
- if resp == ""
- fail "Domain is not working. Try the non-WWW version."
- end
- fail "Domain not working. Try HTTPS???" if !resp
- return resp.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace) #consider using scrub from ruby 2.1? this misses some things
+ #
+ fail 'Domain is not working. Try the non-WWW version.' if resp == ''
+ fail 'Domain not working. Try HTTPS???' unless resp
+ # consider using scrub from ruby 2.1? this misses some things
+ resp.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace)
  end

+ def resync_target_and_return_source(url)
+ new_t = Retriever::Target.new(url)
+ @target = new_t.target
+ @host = new_t.host
+ new_t.source
+ end
  end
-
  end
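
How the reworked Target behaves, per the code above; the domain is a placeholder and calling source performs a real open-uri request:

    require 'retriever'

    t = Retriever::Target.new('www.example.com') # scheme is added when missing
    t.target  # => "http://www.example.com"
    t.host    # => "www.example.com"
    t.host_re # => /example.com/
    html = t.source # GET via open-uri; a same-host redirect re-syncs @target and @host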
@@ -1,3 +1,3 @@
  module Retriever
- VERSION = '1.0.3'
+ VERSION = '1.1.0'
  end
data/lib/retriever.rb CHANGED
@@ -8,6 +8,6 @@ require 'retriever/target'
  require 'retriever/page'
  require 'retriever/openuri-redirect-patch'

+ #
  module Retriever
-
- end
+ end
data/readme.md CHANGED
@@ -4,13 +4,14 @@

  By Joe Norton

- RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader, and all around nice buddy to have around.
+ RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader.

- RubyRetriever uses aynchronous HTTP requests, thanks to eventmachine and Synchrony fibers, to crawl webpages *very quickly*.
+ RubyRetriever (RR) uses asynchronous HTTP requests, thanks to [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony), to crawl webpages *very quickly*. Another neat thing about RR, is it uses a ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of page's it has already crawled.

- RubyRetriever does NOT respect robots.txt, and RubyRetriever currently - by default - launches up to 10 parallel GET requests at once. This is a feature, do not abuse it. Use at own risk.
+ **Use at Own Risk**
+ RR does NOT respect robots.txt, and RR currently - by default - launches up to 10 parallel GET requests at once. This is a feature, do not abuse it.

- v1.0 Update 6/07/2014 - Includes major code changes, a lot of bug fixes. Much better in dealing with redirects, and issues with the host changing, etc. Also, added the SEO mode -- which grabs a number of key SEO components from every page on a site. Lastly, this upate was so extensive that I could not ensure backward compatibility -- and thus, this was update 1.0!
+ **v1.0 Update (6/07/2014)** - Includes major code changes, a lot of bug fixes. Much better in dealing with redirects, and issues with the host changing, etc. Also, added the SEO mode -- which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility -- and thus, this was update 1.0!


  getting started
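
The crawl pattern the readme describes (EventMachine plus Synchrony fibers, up to 10 GETs in flight) reduces to the loop below. A self-contained sketch with placeholder URLs, mirroring process_link_stack in this diff rather than reproducing it:

    require 'em-synchrony'
    require 'em-synchrony/em-http'
    require 'em-synchrony/fiber_iterator'

    urls = ['http://www.example.com/', 'http://www.example.com/about']

    EM.synchrony do
      # Up to 10 URLs are fetched at a time, each GET running in its own fiber.
      EM::Synchrony::FiberIterator.new(urls, 10).each do |url|
        resp = EventMachine::HttpRequest.new(url).get
        puts "#{url} -> #{resp.response_header.status}"
      end
      EventMachine.stop
    end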
data/spec/page_spec.rb CHANGED
@@ -20,8 +20,8 @@ SOURCE
  end
  end

- describe "#parseInternal" do
- let (:links){Retriever::Page.new(@source,t).parseInternal}
+ describe "#parse_internal" do
+ let (:links){Retriever::Page.new(@source,t).parse_internal}
  it "filters links by host" do
  @source = (<<SOURCE).strip
  <a href='http://www.cnet.com/'>download</a>
@@ -32,8 +32,8 @@ SOURCE
  end
  end

- describe "#parseInternalVisitable" do
- let (:links){Retriever::Page.new(@source,t).parseInternalVisitable}
+ describe "#parse_internal_visitable" do
+ let (:links){Retriever::Page.new(@source,t).parse_internal_visitable}
  it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
  @source = (<<SOURCE).strip
  <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
@@ -43,7 +43,7 @@ SOURCE
  end

  describe "#parseFiles" do
- let (:links){Retriever::Page.new(@source,t).parseFiles}
+ let (:links){Retriever::Page.new(@source,t).parse_files}
  it "filters links by filetype" do
  @source = (<<SOURCE).strip
  <a href='www.cnet.com/download.exe'>download</a>
@@ -90,5 +90,4 @@ SOURCE
  expect(page.h2).to eq(' test 4 ')
  end
  end
-
- end
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: rubyretriever
  version: !ruby/object:Gem::Version
- version: 1.0.3
+ version: 1.1.0
  platform: ruby
  authors:
  - Joe Norton
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2014-06-08 00:00:00.000000000 Z
+ date: 2014-06-10 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: em-synchrony