rubyretriever 1.0.3 → 1.1.0
- checksums.yaml +4 -4
- data/bin/rr +41 -41
- data/lib/retriever/cli.rb +20 -26
- data/lib/retriever/fetch.rb +209 -186
- data/lib/retriever/fetchfiles.rb +65 -60
- data/lib/retriever/fetchseo.rb +20 -18
- data/lib/retriever/fetchsitemap.rb +37 -32
- data/lib/retriever/link.rb +6 -2
- data/lib/retriever/openuri-redirect-patch.rb +3 -2
- data/lib/retriever/page.rb +20 -21
- data/lib/retriever/target.rb +22 -30
- data/lib/retriever/version.rb +1 -1
- data/lib/retriever.rb +2 -2
- data/readme.md +5 -4
- data/spec/page_spec.rb +6 -7
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 02c2b2530b3b83eb12325443c3c9214d977d8a56
+  data.tar.gz: f1a8b163ae3c3caed750eacb7724c2c38693ccd2
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 68c56e76fff7cee17b2e48251413df199cbe46df84bb8b51f0333510d9507627f59714bd7c4fb4a502796cefa76d362b9dbe912ea1a4d419b356f0de403e606a
+  data.tar.gz: 2f149643ba65999c783bf17bbcbffc92648eec3979f610df2fdd101c55a5a58607c2087f47911d65457d41567f5f574b79ad0a25cfac82e61c7255c1170e3e64
data/bin/rr
CHANGED
@@ -4,73 +4,73 @@ require 'retriever'
 require 'optparse'
 
 options = {}
-optparse = OptionParser.new do|opts|
+optparse = OptionParser.new do |opts|
   # Set a banner, displayed at the top
   # of the help screen.
-  opts.banner =
-  opts.on(
-  options[
+  opts.banner = 'Usage: rr [MODE FLAG] [options] Target_URL'
+  options['sitemap'] = false
+  opts.on('-s', '--sitemap [FORMAT]', 'MODE FLAG: Sitemap mode') do |output_type|
+    options['sitemap'] = output_type || ''
   end
-  opts.on(
-  options[
+  options['fileharvest'] = false
+  opts.on('-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode') do |file_ext|
+    options['fileharvest'] = file_ext
   end
-  options[
-  opts.on(
-  options[
+  options['seo'] = false
+  opts.on('-e', '--seo', 'MODE FLAG: SEO mode') do
+    options['seo'] = true
   end
-  opts.on(
-  options[
+  options['filename'] = nil
+  opts.on('-o', '--out FILENAME', 'Dump output to selected filename') do |filename|
+    options['filename'] = filename
   end
   # Define the options, and what they do
-  options[
-  opts.on(
-  options[
+  options['verbose'] = false
+  opts.on('-v', '--verbose', 'Output more information') do
+    options['verbose'] = true
   end
-  options[
-  opts.on(
-  options[
+  options['progress'] = false
+  opts.on('-p', '--progress', 'Output progress bar') do
+    options['progress'] = true
  end
-  opts.on(
+  options['maxpages'] = false
+  opts.on('-l', '--limit PAGE_LIMIT_#', 'set a max on the total number of crawled pages') do |maxpages|
    options[:maxpages] = maxpages
  end
-  options[
-  opts.on(
+  options['autodown'] = false
+  opts.on('-a', '--auto', 'Automatically download all files of filetype located') do
    options[:autodown] = true
  end
  # This displays the help screen, all programs are
  # assumed to have this option.
-  opts.on(
+  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
 end
-
+
+optparse.parse!
 if ARGV[0].nil?
-
+  abort('###Missing Required Argument\nUsage: rr [mode] [options] Target_URL')
 end
 
 ARGV.each do|q|
   if options[:verbose]
-    puts
-    puts
-    puts
-    puts "### Outputting in format: #{options[
-    puts
-    puts "### Searching for file extension: #{options[
-    puts
-    puts "### Writing output to filename: #{options[
-    puts
-    puts "### Stopping after #{options[
+    puts '###############################'
+    puts '### [RubyRetriever]'
+    puts '### Creating Sitemap' if options['sitemap']
+    puts "### Outputting in format: #{options['sitemap']}" if options['sitemap']
+    puts '### Performing File Harvest' if options['fileharvest']
+    puts "### Searching for file extension: #{options['fileharvest']} pages" if options['fileharvest']
+    puts '### Performing SEO Scrape' if options['seo']
+    puts "### Writing output to filename: #{options['filename']}" if options['filename']
+    puts '### Being verbose'
+    puts "### Stopping after #{options['maxpages']} pages"
   end
-  puts
+  puts '###############################'
   puts "### [RubyRetriever] go fetch #{q}"
   Retriever::CLI.new(q, options)
-  puts
-  puts
+  puts '### [RubyRetriever] is done.'
+  puts '###############################'
   puts
 end
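For context, the mode flags and options defined above combine into invocations like the following (the target URLs, output filename, and page limit are illustrative, not defaults):

    # sitemap mode with XML output, progress bar, capped at 100 pages
    rr --sitemap xml --progress --limit 100 http://www.example.com
    # file-harvest mode: locate PDFs and download them automatically
    rr --files pdf --auto --verbose http://www.example.com
    # SEO mode, writing results to seo-report.csv
    rr --seo --out seo-report http://www.example.com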
data/lib/retriever/cli.rb
CHANGED
@@ -1,27 +1,21 @@
 module Retriever
-      #sitemap only
-      @fetch.gen_xml if /XML/i =~ options[:sitemap].to_s
-    end
-  end
-end
+  #
+  class CLI
+    def initialize(url, options)
+      # kick off the fetch mode of choice
+      if options['fileharvest']
+        @fetch = Retriever::FetchFiles.new(url, options)
+      elsif options['sitemap']
+        @fetch = Retriever::FetchSitemap.new(url, options)
+      elsif options['seo']
+        @fetch = Retriever::FetchSEO.new(url, options)
+      else
+        fail '### Error: No Mode Selected'
+      end
+      @fetch.dump
+      @fetch.write if options['filename']
+      @fetch.autodownload if options['autodown'] && options['fileharvest']
+      @fetch.gen_xml if /XML/i =~ options['sitemap'].to_s
+    end
+  end
+end
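As a rough sketch (not something documented by the gem), the same entry point can be driven from Ruby; the new CLI keys its options by string, and the crawl runs inside the constructor:

    require 'retriever'
    # roughly equivalent to `rr --sitemap xml www.example.com`; the host is a placeholder
    Retriever::CLI.new('www.example.com', 'sitemap' => 'xml')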
data/lib/retriever/fetch.rb
CHANGED
@@ -7,191 +7,214 @@ require 'csv'
 require 'bloomfilter-rb'
 
 module Retriever
+  #
+  class Fetch
+    attr_reader :max_pages, :t
+    # given target URL and RR options, creates a fetch object.
+    # There is no direct output
+    # this is a parent class that the other fetch classes build off of.
+    def initialize(url, options)
+      @connection_tally = {
+        :success => 0,
+        :error => 0,
+        :error_client => 0,
+        :error_server => 0
+      }
+      # OPTIONS
+      @prgrss = options['progress']
+      @max_pages = options['maxpages'] ? options['maxpages'].to_i : 100
+      @v = options['verbose']
+      @output = options['filename']
+      @fh = options['fileharvest']
+      @file_ext = @fh.to_s
+      @s = options['sitemap']
+      @seo = options['seo']
+      @autodown = options['autodown']
+      #
+      if @fh
+        temp_ext_str = '.' + @file_ext + '\z'
+        @file_re = Regexp.new(temp_ext_str).freeze
+      else
+        # when FH is not true, and autodown is true
+        errlog('Cannot AUTODOWNLOAD when not in FILEHARVEST MODE') if @autodown
+      end
+      if @prgrss
+        # verbose & progressbar conflict
+        errlog('CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p') if @v
+        prgress_vars = {
+          :title => 'Pages',
+          :starting_at => 1,
+          :total => @max_pages,
+          :format => '%a |%b>%i| %c/%C %t'
+        }
+        @progressbar = ProgressBar.create(prgress_vars)
+      end
+      @t = Retriever::Target.new(url, @file_re)
+      @output = "rr-#{@t.host.split('.')[1]}" if @fh && !@output
+      @already_crawled = BloomFilter::Native.new(
+        :size => 1_000_000,
+        :hashes => 5,
+        :seed => 1,
+        :bucket => 8,
+        :raise => false
+      )
+      @already_crawled.insert(@t.target)
+    end
 
-        :error => 0,
-        :error_client => 0,
-        :error_server => 0
-      }
-      #OPTIONS
-      @prgrss = options[:progress] ? options[:progress] : false
-      @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
-      @v= options[:verbose] ? true : false
-      @output=options[:filename] ? options[:filename] : false
-      @fh = options[:fileharvest] ? options[:fileharvest] : false
-      @file_ext = @fh.to_s
-      @s = options[:sitemap] ? options[:sitemap] : false
-      @seo = options[:seo] ? true : false
-      @autodown = options[:autodown] ? true : false
-      #
-      if @fh
-        tempExtStr = "."+@file_ext+'\z'
-        @file_re = Regexp.new(tempExtStr).freeze
-      else
-        errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
-      end
-      if @prgrss
-        errlog("CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p") if @v #verbose & progressbar conflict
-        prgressVars = {
-          :title => "Pages Crawled",
-          :starting_at => 1,
-          :total => @maxPages,
-          :format => '%a |%b>%i| %c/%C %t',
-        }
-        @progressbar = ProgressBar.create(prgressVars)
-      end
-      @t = Retriever::Target.new(url,@file_re)
-      @already_crawled = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
-      @already_crawled.insert(@t.target)
-      if (@fh && !@output)
-        @output = "rr-#{@t.host.split('.')[1]}"
-      end
-      fail "bad page source on target -- try HTTPS?" if !@t.source
-    end
-    def errlog(msg)
-      raise "ERROR: #{msg}"
-    end
-    def lg(msg)
-      puts "### #{msg}" if @v
-    end
-    def dump #prints current data collection to STDOUT, meant for CLI use.
-      puts "###############################"
-      if @v
-        puts "Connection Tally:"
-        puts @connection_tally.to_s
-        puts "###############################"
-      end
-      if @s
-        puts "#{@t.target} Sitemap"
-        puts "Page Count: #{@data.size}"
-      elsif @fh
-        puts "Target URL: #{@t.target}"
-        puts "Filetype: #{@file_ext}"
-        puts "File Count: #{@data.size}"
-      elsif @seo
-        puts "#{@t.target} SEO Metrics"
-        puts "Page Count: #{@data.size}"
-      else
-        fail "ERROR - Cannot dump - Mode Not Found"
-      end
-      puts "###############################"
-      @data.each do |line|
-        puts line
-      end
-      puts "###############################"
-      puts
-    end
-    def write #writes current data collection out to CSV in current directory
-      if @output
-        i = 0
-        CSV.open("#{@output}.csv", "w") do |csv|
-          if ((i == 0) && @seo)
-            csv << ['URL','Page Title','Meta Description','H1','H2']
-            i +=1
-          end
-          @data.each do |entry|
-            csv << entry
-          end
-        end
-        puts "###############################"
-        puts "File Created: #{@output}.csv"
-        puts "Object Count: #{@data.size}"
-        puts "###############################"
-        puts
-      end
-    end
-    def async_crawl_and_collect() #iterates over the excisting @linkStack, running asyncGetWave on it until we reach the @maxPages value.
-      while (@already_crawled.size < @maxPages)
-        if @linkStack.empty?
-          if @prgrss
-            @progressbar.log("Can't find any more links. Site might be completely mapped.")
-          else
-            lg("Can't find any more links. Site might be completely mapped.")
-          end
-          break;
-        end
-        new_links_arr = self.asyncGetWave()
-        next if (new_links_arr.nil? || new_links_arr.empty?)
-        new_link_arr = new_links_arr-@linkStack #set operations to see are these in our previous visited pages arr?
-        @linkStack.concat(new_links_arr).uniq!
-        @data.concat(new_links_arr) if @s
-      end
-      @progressbar.finish if @prgrss #if we are done, let's make sure progress bar says we are done
-    end
-    def good_response?(resp, url) #returns true is resp is ok to continue process, false is we need to 'next' it
-      return false if !resp
-      if resp.response_header.redirection? #we got redirected
-        loc = resp.response_header.location
-        lg("#{url} Redirected to #{loc}")
-        if t.host_re =~ loc #if being redirected to same host, let's add to linkstack
-          @linkStack.push(loc) if !@already_crawled.include?(loc) #but only if we haven't already crawled it
-          lg("--Added to linkStack for later")
-          return false
-        end
-        lg("Redirection outside of target host. No - go. #{loc}")
-        return false
-      end
-      if (!resp.response_header.successful?) #if webpage is not text/html, let's not continue and lets also make sure we dont re-queue it
-        lg("UNSUCCESSFUL CONNECTION -- #{url}")
-        @connection_tally[:error] += 1
-        @connection_tally[:error_server] += 1 if resp.response_header.server_error?
-        @connection_tally[:error_client] += 1 if resp.response_header.client_error?
-        return false
-      end
-      if (!(resp.response_header['CONTENT_TYPE'].include?("text/html"))) #if webpage is not text/html, let's not continue and lets also make sure we dont re-queue it
-        @already_crawled.insert(url)
-        @linkStack.delete(url)
-        lg("Page Not text/html -- #{url}")
-        return false
-      end
-      @connection_tally[:success] += 1
-      return true
-    end
+    def errlog(msg)
+      fail "ERROR: #{msg}"
+    end
 
+    def lg(msg)
+      puts "### #{msg}" if @v
+    end
+
+    # prints current data collection to STDOUT
+    def dump
+      puts '###############################'
+      if @v
+        puts 'Connection Tally:'
+        puts @connection_tally.to_s
+        puts '###############################'
+      end
+      if @s
+        puts "#{@t.target} Sitemap"
+        puts "Page Count: #{@data.size}"
+      elsif @fh
+        puts "Target URL: #{@t.target}"
+        puts "Filetype: #{@file_ext}"
+        puts "File Count: #{@data.size}"
+      elsif @seo
+        puts "#{@t.target} SEO Metrics"
+        puts "Page Count: #{@data.size}"
+      else
+        fail 'ERROR - Cannot dump - Mode Not Found'
+      end
+      puts '###############################'
+      @data.each do |line|
+        puts line
+      end
+      puts '###############################'
+      puts
+    end
+
+    # writes current data collection out to CSV in current directory
+    def write
+      return false unless @output
+      i = 0
+      CSV.open("#{@output}.csv", 'w') do |csv|
+        if (i == 0) && @seo
+          csv << ['URL', 'Page Title', 'Meta Description', 'H1', 'H2']
+          i += 1
+        end
+        @data.each do |entry|
+          csv << entry
+        end
+      end
+      puts '###############################'
+      puts "File Created: #{@output}.csv"
+      puts "Object Count: #{@data.size}"
+      puts '###############################'
+      puts
+    end
+
+    # iterates over the existing @link_stack
+    # running until we reach the @max_pages value.
+    def async_crawl_and_collect
+      while @already_crawled.size < @max_pages
+        if @link_stack.empty?
+          if @prgrss
+            @progressbar.log("Can't find any more links.")
+          else
+            lg("Can't find any more links.")
+          end
+          break
+        end
+        new_links_arr = process_link_stack
+        next if new_links_arr.nil? || new_links_arr.empty?
+        # set operations to see are these in our previous visited pages arr
+        new_links_arr -= @link_stack
+        @link_stack.concat(new_links_arr).uniq!
+        @data.concat(new_links_arr) if @s
+      end
+      # done, make sure progress bar says we are done
+      @progressbar.finish if @prgrss
+    end
+
+    # returns true is resp is ok to continue
+    def good_response?(resp, url)
+      return false unless resp
+      hdr = resp.response_header
+      if hdr.redirection?
+        loc = hdr.location
+        lg("#{url} Redirected to #{loc}")
+        if t.host_re =~ loc
+          @link_stack.push(loc) unless @already_crawled.include?(loc)
+          lg('--Added to linkStack for later')
+          return false
+        end
+        lg("Redirection outside of target host. No - go. #{loc}")
+        return false
+      end
+      # lets not continue if unsuccessful connection
+      unless hdr.successful?
+        lg("UNSUCCESSFUL CONNECTION -- #{url}")
+        @connection_tally[:error] += 1
+        @connection_tally[:error_server] += 1 if hdr.server_error?
+        @connection_tally[:error_client] += 1 if hdr.client_error?
+        return false
+      end
+      # let's not continue if not text/html
+      unless hdr['CONTENT_TYPE'].include?('text/html')
+        @already_crawled.insert(url)
+        @link_stack.delete(url)
+        lg("Page Not text/html -- #{url}")
+        return false
+      end
+      @connection_tally[:success] += 1
+      true
+    end
+
+    # send a new wave of GET requests, using current @link_stack
+    def process_link_stack
+      new_stuff = []
+      EM.synchrony do
+        concurrency = 10
+        EM::Synchrony::FiberIterator.new(@link_stack, concurrency).each do |url|
+          next if @already_crawled.size >= @max_pages
+          next if @already_crawled.include?(url)
+          resp = EventMachine::HttpRequest.new(url).get
+          next unless good_response?(resp, url)
+          lg("Page Fetched: #{url}")
+          @already_crawled.insert(url)
+          new_page = Retriever::Page.new(resp.response, @t)
+          if @prgrss
+            @progressbar.increment if @already_crawled.size < @max_pages
+          end
+          if @seo
+            seos = [url]
+            seos.concat(new_page.parse_seo)
+            @data.push(seos)
+            lg('--page SEO scraped')
+          end
+          next if new_page.links.size == 0
+          lg("--#{new_page.links.size} links found")
+          internal_links_arr = new_page.parse_internal_visitable
+          new_stuff.push(internal_links_arr)
+          if @fh
+            filez = new_page.parse_files
+            @data.concat(filez) unless filez.empty?
+            lg("--#{filez.size} files found")
+          end
+        end
+        new_stuff = new_stuff.flatten # all completed requests
+        EventMachine.stop
+      end
+      new_stuff.uniq!
+    end
+  end
+end
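Two ideas carry this class: a Bloom filter (bloomfilter-rb) remembers which URLs have already been crawled without storing them all, and EM-Synchrony's FiberIterator fires each wave of GET requests concurrently while the code reads sequentially. A stripped-down sketch of that pattern, outside the gem, assuming the em-http-request, em-synchrony and bloomfilter-rb gems and placeholder URLs:

    require 'em-synchrony'
    require 'em-synchrony/em-http'
    require 'em-synchrony/fiber_iterator'
    require 'bloomfilter-rb'

    # same Bloom filter settings the Fetch class uses above
    seen = BloomFilter::Native.new(:size => 1_000_000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
    urls = ['http://www.example.com/a', 'http://www.example.com/b']

    EM.synchrony do
      # each fiber issues a blocking-looking GET; up to 10 run at once
      EM::Synchrony::FiberIterator.new(urls, 10).each do |url|
        next if seen.include?(url)
        resp = EventMachine::HttpRequest.new(url).get
        seen.insert(url)
        puts "#{url} -> #{resp.response_header.status}"
      end
      EventMachine.stop
    end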
data/lib/retriever/fetchfiles.rb
CHANGED
@@ -1,65 +1,70 @@
 module Retriever
+  # recieves target url and RR options
+  # returns an array of all unique files (based on given filetype)
+  # found on the target site
+  class FetchFiles < Fetch
+    def initialize(url, options)
+      super
+      @data = []
+      page_one = Retriever::Page.new(@t.source, @t)
+      @link_stack = page_one.parse_internal_visitable
+      lg("URL Crawled: #{@t.target}")
+      lg("#{@link_stack.size - 1} new links found")
 
+      temp_file_collection = page_one.parse_files
+      @data.concat(tempFileCollection) if temp_file_collection.size > 0
+      lg("#{@data.size} new files found")
+      errlog("Bad URL -- #{@t.target}") unless @link_stack
+      @link_stack.delete(@t.target)
 
-      @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
+      async_crawl_and_collect
 
+      @data.sort_by! { |x| x.length }
+      @data.uniq!
+    end
 
-    end
+    def download_file(path)
+      # given valid url, downloads file to current directory in /rr-downloads/
+      arr = path.split('/')
+      shortname = arr.pop
+      puts "Initiating Download to: '/rr-downloads/' + #{shortname}"
+      File.open(shortname, 'wb') do |saved_file|
+        open(path) do |read_file|
+          saved_file.write(read_file.read)
+        end
+      end
+      puts ' SUCCESS: Download Complete'
+    end
+
+    def autodownload
+      # go through the fetched file URL collection and download each one.
+      lenny = @data.count
+      puts '###################'
+      puts '### Initiating Autodownload...'
+      puts '###################'
+      puts "#{lenny} - #{@file_ext}'s Located"
+      puts '###################'
+      if File.directory?('rr-downloads')
+        Dir.chdir('rr-downloads')
+      else
+        puts 'creating rr-downloads Directory'
+        Dir.mkdir('rr-downloads')
+        Dir.chdir('rr-downloads')
+      end
+      file_counter = 0
+      @data.each do |entry|
+        begin
+          download_file(entry)
+          file_counter += 1
+          lg(' File [#{file_counter} of #{lenny}]')
+          puts
+        rescue StandardError => e
+          puts 'ERROR: failed to download - #{entry}'
+          puts e.message
+          puts
+        end
+      end
+      Dir.chdir('..')
+    end
+  end
+end
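A minimal sketch of driving this class directly, mirroring what the CLI does in file-harvest mode (the host and the 'pdf' extension are placeholders; the crawl and the directory changes happen for real):

    fetch = Retriever::FetchFiles.new('www.example.com', 'fileharvest' => 'pdf')
    fetch.dump          # print the collected file URLs
    fetch.autodownload  # pull each one into ./rr-downloads/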
data/lib/retriever/fetchseo.rb
CHANGED
@@ -1,23 +1,25 @@
 module Retriever
+  #
+  class FetchSEO < Fetch
+    # recieves target url and RR options
+    # returns an array of onpage SEO related fields
+    # on all unique pages found on the site
+    def initialize(url, options)
+      super
+      @data = []
+      page_one = Retriever::Page.new(@t.source, @t)
+      lg("URL Crawled: #{@t.target}")
 
+      @link_stack = page_one.parse_internal_visitable
+      errlog("Bad URL -- #{@t.target}") unless @link_stack
+      lg("#{@link_stack.size - 1} links found")
+      @link_stack.delete(@t.target)
 
-      @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
+      @data.push(page_one.parse_seo)
 
+      async_crawl_and_collect
 
-    end
+      @data.sort_by! { |x| x[0].length }
+    end
+  end
+end
data/lib/retriever/fetchsitemap.rb
CHANGED
@@ -1,36 +1,41 @@
 module Retriever
+  #
+  class FetchSitemap < Fetch
+    # recieves target URL and RR options
+    # returns an array of all unique pages found on the site
+    def initialize(url, options)
+      super
+      @data = [@t.target]
+      page_one = Retriever::Page.new(@t.source, @t)
+      lg("URL Crawled: #{@t.target}")
+      @link_stack = page_one.parse_internal_visitable
+      errlog("Bad URL -- #{@t.target}") unless @link_stack
+      lg("#{@link_stack.size - 1} links found")
 
-      @data.concat(@linkStack)
+      @link_stack.delete(@t.target)
+      @data.concat(@link_stack)
 
+      async_crawl_and_collect
 
-    end
+      @data.sort_by! { |x| x.length } if @data.size > 1
+      @data.uniq!
+    end
+
+    # produces valid XML sitemap based on page collection fetched.
+    # Writes to current directory.
+    def gen_xml
+      f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
+      f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
+      @data.each do |url|
+        f << "<url><loc>#{url}</loc></url>"
+      end
+      f << '</urlset>'
+      f.close
+      puts '###############################'
+      puts "File Created: sitemap-#{@t.host.split('.')[1]}.xml"
+      puts "Object Count: #{@data.size}"
+      puts '###############################'
+      puts
+    end
+  end
+end
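A minimal sketch of driving this class directly, mirroring the CLI's sitemap mode (placeholder host); gen_xml writes a single-line urlset document with one <url><loc>...</loc></url> entry per crawled page:

    fetch = Retriever::FetchSitemap.new('www.example.com', 'sitemap' => 'XML')
    fetch.dump     # print the page collection
    fetch.gen_xml  # writes sitemap-example.xml to the current directory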
data/lib/retriever/link.rb
CHANGED
@@ -1,4 +1,5 @@
 module Retriever
+  #
   class Link
     HTTP_RE = Regexp.new(/^http/i).freeze
     SINGLE_SLASH_RE = Regexp.new(/^\/{1}[^\/]/).freeze
@@ -18,12 +19,15 @@ module Retriever
 
       return "http://#{host}#{link}" if SINGLE_SLASH_RE =~ link
 
-
+      # link begins with '//'
+      return "http:#{link}" if DOUBLE_SLASH_RE =~ link
 
-
+      # link uses relative path with no slashes at all
+      return "http://#{host}/#{link}" if NO_SLASH_PAGE_RE =~ link
     end
 
     private
+
     attr_reader :host, :link
   end
 end
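Based on the branches shown above, Link#path normalizes an href against a host roughly like this (host and hrefs are made up, and the exact regex behaviour is inferred from the constant names):

    Retriever::Link.new('www.example.com', '/about').path
    # => "http://www.example.com/about"
    Retriever::Link.new('www.example.com', '//cdn.example.com/app.js').path
    # => "http://cdn.example.com/app.js"
    Retriever::Link.new('www.example.com', 'contact.html').path
    # => "http://www.example.com/contact.html"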
data/lib/retriever/openuri-redirect-patch.rb
CHANGED
@@ -1,6 +1,7 @@
 module OpenURI
-
+  # nesc patch otherwise OPENURI blocks redirects to and from https
+  def OpenURI.redirectable?(uri1, uri2)
     uri1.scheme.downcase == uri2.scheme.downcase ||
       (/\A(?:http|https)\z/i =~ uri1.scheme && /\A(?:http|https)\z/i =~ uri2.scheme)
   end
-end
+end
data/lib/retriever/page.rb
CHANGED
@@ -1,7 +1,6 @@
 module Retriever
-
+  #
   class Page
-
     HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
     NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico|xml)/ix).freeze
     HTTP_RE = Regexp.new(/^http/i).freeze
@@ -14,55 +13,55 @@ module Retriever
 
     attr_reader :links, :source, :t
 
-    def initialize(source,t)
+    def initialize(source, t)
       @t = t
       @source = source.encode('UTF-8', :invalid => :replace, :undef => :replace)
       @links = nil
     end
 
-    #recieves page source as string
-    #returns array of unique href links
+    # recieves page source as string
+    # returns array of unique href links
     def links
       return @links if @links
-      return false
-      @links = @source.scan(HREF_CONTENTS_RE).map do |match|
+      return false unless @source
+      @links = @source.scan(HREF_CONTENTS_RE).map do |match|
+        # filter some malformed URLS that come in
+        # meant to be a loose filter to catch all reasonable HREF attributes.
         link = match[0]
         Link.new(@t.host, link).path
       end.uniq
     end
 
-    def
-      links.select{ |linky| (@t.host_re =~ linky) }
+    def parse_internal
+      links.select { |linky| (@t.host_re =~ linky) }
     end
 
-    def
-
+    def parse_internal_visitable
+      parse_internal.select { |linky| (!(NONPAGE_EXT_RE =~ linky)) }
     end
 
-    def
-      links.select{ |linky| (@t.file_re =~ linky)}
+    def parse_files
+      links.select { |linky| (@t.file_re =~ linky) }
     end
 
     def title
-      TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] :
+      TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : ''
     end
 
     def desc
-      DESC_RE =~ @source ? @source.match(DESC_RE)[1] :
+      DESC_RE =~ @source ? @source.match(DESC_RE)[1] : ''
     end
 
     def h1
-      H1_RE =~ @source ? @source.match(H1_RE)[1] :
+      H1_RE =~ @source ? @source.match(H1_RE)[1] : ''
    end
 
    def h2
-      H2_RE =~ @source ? @source.match(H2_RE)[1] :
+      H2_RE =~ @source ? @source.match(H2_RE)[1] : ''
    end
 
-    def
-
+    def parse_seo
+      [title, desc, h1, h2]
    end
  end
 end
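A small sketch of how the Page methods fit together (the HTML string and host are made up):

    html = "<html><head><title>Widgets</title></head>" \
           "<body><h1>All widgets</h1> <a href='http://www.example.com/a.pdf'>a</a></body></html>"
    t    = Retriever::Target.new('www.example.com', /\.pdf\z/)
    page = Retriever::Page.new(html, t)
    page.links                     # every href found in the source
    page.parse_internal_visitable  # same-host links that are not css/js/images/etc.
    page.parse_files               # links matching the target's file regex
    page.parse_seo                 # => [title, meta description, h1, h2]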
data/lib/retriever/target.rb
CHANGED
@@ -1,52 +1,44 @@
 require 'open-uri'
 
 module Retriever
-
+  #
   class Target
-
     HTTP_RE = Regexp.new(/^http/i).freeze
     DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
 
     attr_reader :host, :target, :host_re, :source, :file_re
 
-    def initialize(url,file_re=nil)
-      url = "http://#{url}"
-      fail
+    def initialize(url, file_re = nil)
+      url = "http://#{url}" unless HTTP_RE =~ url
+      fail 'Bad URL' unless /\./ =~ url
       new_uri = URI(url)
       @target = new_uri.to_s
       @host = new_uri.host
-      @host_re = Regexp.new(@host.sub('www.',''))
+      @host_re = Regexp.new(@host.sub('www.', ''))
       @file_re ||= file_re
     end
 
     def source
-      resp =
-      begin
-        resp = open(@target)
-      rescue StandardError => e
-        trap("ABRT"){
-          puts "#{@target} failed SSL Certification Verification"
-        }
-        return false
-      end
+      resp = open(@target)
       resp_url = resp.base_uri.to_s
-      if
-        @host = new_t.host
-        return new_t.source
-      end
-      fail "Domain redirecting to new host: #{resp.base_uri.to_s}" #if it's not same host, we want to fail
+      if @target != resp_url
+        fail "Domain redirecting: #{resp_url}" unless @host_re =~ resp_url
+        # if redirect URL is same host, we want to re-sync @target
+        return resync_target_and_return_source(resp_url)
       end
       resp = resp.read
-
+      #
+      fail 'Domain is not working. Try the non-WWW version.' if resp == ''
+      fail 'Domain not working. Try HTTPS???' unless resp
+      # consider using scrub from ruby 2.1? this misses some things
+      resp.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace)
     end
 
+    def resync_target_and_return_source(url)
+      new_t = Retriever::Target.new(url)
+      @target = new_t.target
+      @host = new_t.host
+      new_t.source
+    end
   end
 end
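In use, Target turns a bare host into a canonical URL and fetches the page body, re-syncing itself when the site redirects within the same host (example.com is a placeholder):

    t = Retriever::Target.new('www.example.com')
    t.target        # => "http://www.example.com"
    t.host          # => "www.example.com"
    html = t.source # GET via open-uri; fails if the domain redirects to a different host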
data/lib/retriever/version.rb
CHANGED
data/lib/retriever.rb
CHANGED
data/readme.md
CHANGED
@@ -4,13 +4,14 @@
 
 By Joe Norton
 
-RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader
+RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader.
 
-RubyRetriever uses
+RubyRetriever (RR) uses asynchronous HTTP requests, thanks to [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony), to crawl webpages *very quickly*. Another neat thing about RR, is it uses a ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of page's it has already crawled.
 
-
+**Use at Own Risk**
+RR does NOT respect robots.txt, and RR currently - by default - launches up to 10 parallel GET requests at once. This is a feature, do not abuse it.
 
-v1.0 Update 6/07/2014 - Includes major code changes, a lot of bug fixes. Much better in dealing with redirects, and issues with the host changing, etc. Also, added the SEO mode -- which grabs a number of key SEO components from every page on a site. Lastly, this
+**v1.0 Update (6/07/2014)** - Includes major code changes, a lot of bug fixes. Much better in dealing with redirects, and issues with the host changing, etc. Also, added the SEO mode -- which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility -- and thus, this was update 1.0!
 
 
 getting started
CHANGED
@@ -20,8 +20,8 @@ SOURCE
|
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
|
-
describe "#
|
24
|
-
let (:links){Retriever::Page.new(@source,t).
|
23
|
+
describe "#parse_internal" do
|
24
|
+
let (:links){Retriever::Page.new(@source,t).parse_internal}
|
25
25
|
it "filters links by host" do
|
26
26
|
@source = (<<SOURCE).strip
|
27
27
|
<a href='http://www.cnet.com/'>download</a>
|
@@ -32,8 +32,8 @@ SOURCE
|
|
32
32
|
end
|
33
33
|
end
|
34
34
|
|
35
|
-
describe "#
|
36
|
-
let (:links){Retriever::Page.new(@source,t).
|
35
|
+
describe "#parse_internal_visitable" do
|
36
|
+
let (:links){Retriever::Page.new(@source,t).parse_internal_visitable}
|
37
37
|
it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
|
38
38
|
@source = (<<SOURCE).strip
|
39
39
|
<link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
|
@@ -43,7 +43,7 @@ SOURCE
|
|
43
43
|
end
|
44
44
|
|
45
45
|
describe "#parseFiles" do
|
46
|
-
let (:links){Retriever::Page.new(@source,t).
|
46
|
+
let (:links){Retriever::Page.new(@source,t).parse_files}
|
47
47
|
it "filters links by filetype" do
|
48
48
|
@source = (<<SOURCE).strip
|
49
49
|
<a href='www.cnet.com/download.exe'>download</a>
|
@@ -90,5 +90,4 @@ SOURCE
|
|
90
90
|
expect(page.h2).to eq(' test 4 ')
|
91
91
|
end
|
92
92
|
end
|
93
|
-
|
94
|
-
end
|
93
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 1.0.3
+  version: 1.1.0
 platform: ruby
 authors:
 - Joe Norton
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-06-
+date: 2014-06-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: em-synchrony