rubyretriever 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/rr +51 -48
- data/lib/retriever/cli.rb +11 -7
- data/lib/retriever/fetch.rb +134 -105
- data/lib/retriever/fetchfiles.rb +32 -34
- data/lib/retriever/fetchseo.rb +3 -11
- data/lib/retriever/fetchsitemap.rb +19 -18
- data/lib/retriever/link.rb +17 -15
- data/lib/retriever/{openuri-redirect-patch.rb → openuri_redirect_patch.rb} +2 -1
- data/lib/retriever/page.rb +35 -16
- data/lib/retriever/target.rb +15 -13
- data/lib/retriever/version.rb +3 -2
- data/lib/retriever.rb +1 -1
- data/readme.md +19 -5
- data/spec/link_spec.rb +37 -35
- data/spec/page_spec.rb +48 -44
- data/spec/retriever_spec.rb +2 -3
- data/spec/target_spec.rb +28 -28
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 79f0b251e367f085f7b84dd83a10f6a1dfcddd3c
+  data.tar.gz: 0e9b6bc8f66b9efd14921d8f0fc5fddc45042b5f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: fe1a6c8e118378513c4a4e72adeccc94e212fd5e0d4244f56240830155e42f7b7bde80acdfabbc9fe9ee5b46687bc33d089d73c57690beadee3d04804a9435ac
+  data.tar.gz: 4e6bec31d3416293f2fb72b39cfab5602692a2b9d94cbfb6fba9a653afc6bf718e5eebf84b4a5745d01d2e55959df4e489342b0deca2060e2653bb5b31f4731e
data/bin/rr
CHANGED
@@ -1,57 +1,58 @@
 #! /usr/bin/env ruby
 require 'retriever'
 require 'optparse'

 options = {}
+optparse = OptionParser.new do |opts|
+  # Set a banner, displayed at the top
+  # of the help screen.
+  opts.banner = 'Usage: rr [MODE FLAG] [options] Target_URL'
+  options['sitemap'] = false
+  opts.on('-s', '--sitemap [FORMAT]', 'MODE FLAG: Sitemap mode') do |type|
+    options['sitemap'] = type || ''
+  end
+  options['fileharvest'] = false
+  opts.on('-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode') do |file_e|
+    options['fileharvest'] = file_e
+  end
+  options['seo'] = false
+  opts.on('-e', '--seo', 'MODE FLAG: SEO mode') do
+    options['seo'] = true
+  end
+  options['filename'] = nil
+  opts.on('-o', '--out FILENAME', 'Dump output to file') do |file|
+    options['filename'] = file
+  end
+  # Define the options, and what they do
+  options['verbose'] = false
+  opts.on('-v', '--verbose', 'Output more information') do
+    options['verbose'] = true
+  end
+  options['progress'] = false
+  opts.on('-p', '--progress', 'Output progress bar') do
+    options['progress'] = true
+  end
+  options['maxpages'] = false
+  opts.on('-l',
+          '--limit PAGE_LIMIT_#',
+          'set a max on the total number of crawled pages') do |maxp|
+    options['maxpages'] = maxp
+  end
+  options['autodown'] = false
+  opts.on('-a', '--auto', 'Automatically download all files located') do
+    options['autodown'] = true
+  end
+  # This displays the help screen, all programs are
+  # assumed to have this option.
+  opts.on('-h', '--help', 'Display this screen') do
+    puts opts
+    exit
+  end
+end

 optparse.parse!
 if ARGV[0].nil?
-  abort(
+  abort("###Missing Required Argument\nUsage: rr [mode] [options] Target_URL")
 end

 ARGV.each do|q|
@@ -61,9 +62,11 @@ ARGV.each do|q|
   puts '### Creating Sitemap' if options['sitemap']
   puts "### Outputting in format: #{options['sitemap']}" if options['sitemap']
   puts '### Performing File Harvest' if options['fileharvest']
+  if options['fileharvest']
+    puts "### Searching for filetype: #{options['fileharvest']}"
+  end
   puts '### Performing SEO Scrape' if options['seo']
-  puts "### Writing
+  puts "### Writing to file: #{options['filename']}" if options['filename']
   puts '### Being verbose'
   puts "### Stopping after #{options['maxpages']} pages"
 end
data/lib/retriever/cli.rb
CHANGED
@@ -3,19 +3,23 @@ module Retriever
   class CLI
     def initialize(url, options)
       # kick off the fetch mode of choice
+      @fetch = choose_fetch_mode(url, options)
+      @fetch.dump
+      @fetch.write if options['filename']
+      @fetch.autodownload if options['autodown'] && options['fileharvest']
+      @fetch.gen_xml if /XML/i =~ options['sitemap'].to_s
+    end
+
+    def choose_fetch_mode(url, options)
       if options['fileharvest']
+        Retriever::FetchFiles.new(url, options)
       elsif options['sitemap']
+        Retriever::FetchSitemap.new(url, options)
       elsif options['seo']
+        Retriever::FetchSEO.new(url, options)
       else
         fail '### Error: No Mode Selected'
       end
-      @fetch.dump
-      @fetch.write if options['filename']
-      @fetch.autodownload if options['autodown'] && options['fileharvest']
-      @fetch.gen_xml if /XML/i =~ options['sitemap'].to_s
     end
   end
 end
data/lib/retriever/fetch.rb
CHANGED
@@ -9,56 +9,27 @@ require 'bloomfilter-rb'
 module Retriever
   #
   class Fetch
+    HR = '###############################'
     attr_reader :max_pages, :t
     # given target URL and RR options, creates a fetch object.
     # There is no direct output
     # this is a parent class that the other fetch classes build off of.
     def initialize(url, options)
+      @data = []
       @connection_tally = {
+        success: 0,
+        error: 0,
+        error_client: 0,
+        error_server: 0
       }
-      @max_pages = options['maxpages'] ? options['maxpages'].to_i : 100
-      @v = options['verbose']
-      @output = options['filename']
-      @fh = options['fileharvest']
-      @file_ext = @fh.to_s
-      @s = options['sitemap']
-      @seo = options['seo']
-      @autodown = options['autodown']
-      #
-      if @fh
-        temp_ext_str = '.' + @file_ext + '\z'
-        @file_re = Regexp.new(temp_ext_str).freeze
-      else
-        # when FH is not true, and autodown is true
-        errlog('Cannot AUTODOWNLOAD when not in FILEHARVEST MODE') if @autodown
-      end
-      if @prgrss
-        # verbose & progressbar conflict
-        errlog('CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p') if @v
-        prgress_vars = {
-          :title => 'Pages',
-          :starting_at => 1,
-          :total => @max_pages,
-          :format => '%a |%b>%i| %c/%C %t'
-        }
-        @progressbar = ProgressBar.create(prgress_vars)
-      end
+      setup_options(options)
+      setup_progress_bar if @progress
       @t = Retriever::Target.new(url, @file_re)
-      @output = "rr-#{@t.host.split('.')[1]}" if @
-      @already_crawled =
-        :bucket => 8,
-        :raise => false
-      )
-      @already_crawled.insert(@t.target)
+      @output = "rr-#{@t.host.split('.')[1]}" if @fileharvest && !@output
+      @already_crawled = setup_bloom_filter
+      @page_one = crawl_page_one
+      @link_stack = create_link_stack
+      @temp_link_stack = []
     end

     def errlog(msg)
@@ -66,35 +37,26 @@ module Retriever
     end

     def lg(msg)
-      puts "### #{msg}" if @
+      puts "### #{msg}" if @verbose
     end

     # prints current data collection to STDOUT
     def dump
-      puts
-      if @
-        puts '
-        puts "#{@t.target} Sitemap"
-        puts "Page Count: #{@data.size}"
-      elsif @fh
-        puts "Target URL: #{@t.target}"
-        puts "Filetype: #{@file_ext}"
-        puts "File Count: #{@data.size}"
+      puts HR
+      puts "Connection Tally:\n#{@connection_tally}\n#{HR}" if @verbose
+      puts "Target URL: #{@t.target}"
+      if @sitemap
+        puts 'Sitemap'
+      elsif @fileharvest
+        puts "File harvest by type: #{@fileharvest}"
       elsif @seo
-        puts
-        puts "Page Count: #{@data.size}"
-      else
-        fail 'ERROR - Cannot dump - Mode Not Found'
+        puts 'SEO Metrics'
       end
-      puts
+      puts "Data Dump -- Object Count: #{@data.size}"
+      puts HR
       @data.each do |line|
         puts line
       end
-      puts '###############################'
       puts
     end

@@ -111,34 +73,90 @@ module Retriever
           csv << entry
         end
       end
-      puts
+      puts HR
       puts "File Created: #{@output}.csv"
       puts "Object Count: #{@data.size}"
-      puts
+      puts HR
       puts
     end

+    private
+
+    def setup_options(options)
+      @progress = options['progress']
+      @max_pages = options['maxpages'] ? options['maxpages'].to_i : 100
+      @verbose = options['verbose']
+      @output = options['filename']
+      @fileharvest = options['fileharvest']
+      @sitemap = options['sitemap']
+      @seo = options['seo']
+      @autodown = options['autodown']
+      @file_re = Regexp.new(".#{@fileharvest}\z").freeze if @fileharvest
+    end
+
+    def setup_bloom_filter
+      already_crawled = BloomFilter::Native.new(
+        size: 1_000_000,
+        hashes: 5,
+        seed: 1,
+        bucket: 8,
+        raise: false
+      )
+      already_crawled.insert(@t.target)
+      already_crawled
+    end
+
+    def setup_progress_bar
+      # verbose & progressbar conflict
+      errlog('CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME') if @verbose
+      prgress_vars = {
+        title: 'Pages',
+        starting_at: 1,
+        total: @max_pages,
+        format: '%a |%b>%i| %c/%C %t'
+      }
+      @progressbar = ProgressBar.create(prgress_vars)
+    end
+
+    def crawl_page_one
+      page_one = Retriever::Page.new(@t.source, @t)
+      lg("URL Crawled: #{@t.target}")
+      page_one
+    end
+
+    def create_link_stack
+      link_stack = @page_one.parse_internal_visitable
+      errlog("Bad URL -- #{@t.target}") unless link_stack
+      lg("#{link_stack.size - 1} links found")
+      link_stack.delete(@t.target)
+      link_stack.take(@max_pages) if (link_stack.size + 1) > @max_pages
+      link_stack
+    end
+
+    def end_crawl_notice
+      notice = "#{HR}\nENDING CRAWL\nCan't find any more links."
+      @progressbar.log(notice) if @progress
+      lg(notice)
+    end
+
     # iterates over the existing @link_stack
     # running until we reach the @max_pages value.
     def async_crawl_and_collect
       while @already_crawled.size < @max_pages
         if @link_stack.empty?
-          @progressbar.log("Can't find any more links.")
-          else
-          lg("Can't find any more links.")
-          end
+          end_crawl_notice
           break
         end
         new_links_arr = process_link_stack
+        @temp_link_stack = []
         next if new_links_arr.nil? || new_links_arr.empty?
         # set operations to see are these in our previous visited pages arr
-        @link_stack.concat(new_links_arr)
+        next if new_links_arr.empty?
+        @link_stack.concat(new_links_arr)
+        next unless @sitemap
+        @data.concat(new_links_arr)
       end
-      @progressbar.finish if @prgrss
+      @data.uniq!
     end

     # returns true is resp is ok to continue
@@ -149,8 +167,8 @@ module Retriever
       loc = hdr.location
       lg("#{url} Redirected to #{loc}")
       if t.host_re =~ loc
-        lg('--Added to
+        @temp_link_stack.push(loc) unless @already_crawled.include?(loc)
+        lg('--Added to stack for later')
         return false
       end
       lg("Redirection outside of target host. No - go. #{loc}")
@@ -159,7 +177,6 @@ module Retriever
       # lets not continue if unsuccessful connection
       unless hdr.successful?
         lg("UNSUCCESSFUL CONNECTION -- #{url}")
         @connection_tally[:error] += 1
         @connection_tally[:error_server] += 1 if hdr.server_error?
         @connection_tally[:error_client] += 1 if hdr.client_error?
@@ -168,7 +185,6 @@ module Retriever
       # let's not continue if not text/html
       unless hdr['CONTENT_TYPE'].include?('text/html')
         @already_crawled.insert(url)
-        @link_stack.delete(url)
         lg("Page Not text/html -- #{url}")
         return false
       end
@@ -176,45 +192,58 @@ module Retriever
       true
     end

+    def push_seo_to_data(url, new_page)
+      seos = [url]
+      seos.concat(new_page.parse_seo)
+      @data.push(seos)
+      lg('--page SEO scraped')
+    end
+
+    def push_files_to_data(new_page)
+      filez = new_page.parse_files(new_page.parse_internal)
+      @data.concat(filez) unless filez.empty?
+      lg("--#{filez.size} files found")
+    end
+
+    def page_from_response(url, response)
+      lg("Page Fetched: #{url}")
+      @already_crawled.insert(url)
+      if @progress && (@already_crawled.size < @max_pages)
+        @progressbar.increment
+      end
+      Retriever::Page.new(response, @t)
+    end
+
+    def new_visitable_links(current_page)
+      lg("--#{current_page.links.size} links found")
+      current_page.parse_internal_visitable
+    end
+
     # send a new wave of GET requests, using current @link_stack
+    # at end of the loop it empties link_stack
+    # puts new links into temporary stack
     def process_link_stack
-      new_stuff = []
       EM.synchrony do
         concurrency = 10
         EM::Synchrony::FiberIterator.new(@link_stack, concurrency).each do |url|
           next if @already_crawled.size >= @max_pages
           next if @already_crawled.include?(url)
           resp = EventMachine::HttpRequest.new(url).get
           next unless good_response?(resp, url)
-          seos = [url]
-          seos.concat(new_page.parse_seo)
-          @data.push(seos)
-          lg('--page SEO scraped')
-          end
-          next if new_page.links.size == 0
-          lg("--#{new_page.links.size} links found")
-          internal_links_arr = new_page.parse_internal_visitable
-          new_stuff.push(internal_links_arr)
-          if @fh
-            filez = new_page.parse_files
-            @data.concat(filez) unless filez.empty?
-            lg("--#{filez.size} files found")
-          end
+          current_page = page_from_response(url, resp.response)
+          # non-link dependent modes
+          push_seo_to_data(url, current_page) if @seo
+          next unless current_page.links.size > 0
+          @temp_link_stack.push(new_visitable_links(current_page))
+          # link dependent modes
+          next unless @fileharvest
+          push_files_to_data(current_page)
         end
-        new_stuff = new_stuff.flatten # all completed requests
         EventMachine.stop
       end
+      # empty the stack. most clean way
+      @link_stack = []
+      @temp_link_stack.flatten.uniq!
     end
   end
 end
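
Aside: the dedupe logic in setup_bloom_filter above uses bloomfilter-rb's native filter. A self-contained sketch of the same pattern (constructor arguments copied from the diff; the example URLs are hypothetical):

    require 'bloomfilter-rb'

    # O(1), memory-bounded membership checks; the trade-off is a small chance
    # of a false positive, i.e. a URL wrongly reported as already crawled.
    already_crawled = BloomFilter::Native.new(
      size: 1_000_000, hashes: 5, seed: 1, bucket: 8, raise: false
    )
    already_crawled.insert('http://example.com/')
    already_crawled.include?('http://example.com/')      # => true
    already_crawled.include?('http://example.com/other') # => false (almost always)
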
data/lib/retriever/fetchfiles.rb
CHANGED
@@ -5,29 +5,21 @@ module Retriever
   class FetchFiles < Fetch
     def initialize(url, options)
       super
-      page_one = Retriever::Page.new(@t.source, @t)
-      @link_stack = page_one.parse_internal_visitable
-      lg("URL Crawled: #{@t.target}")
-      lg("#{@link_stack.size - 1} new links found")
-      temp_file_collection = page_one.parse_files
+      temp_file_collection = @page_one.parse_files(@page_one.parse_internal)
       @data.concat(tempFileCollection) if temp_file_collection.size > 0
       lg("#{@data.size} new files found")
-      errlog("Bad URL -- #{@t.target}") unless @link_stack
-      @link_stack.delete(@t.target)

       async_crawl_and_collect
+      # done, make sure progress bar says we are done
+      @progressbar.finish if @progress
       @data.sort_by! { |x| x.length }
-      @data.uniq!
     end

     def download_file(path)
       # given valid url, downloads file to current directory in /rr-downloads/
       arr = path.split('/')
       shortname = arr.pop
-      puts "Initiating Download
+      puts "Initiating Download of: #{shortname}"
       File.open(shortname, 'wb') do |saved_file|
         open(path) do |read_file|
           saved_file.write(read_file.read)
@@ -38,33 +30,39 @@ module Retriever

     def autodownload
       # go through the fetched file URL collection and download each one.
-      puts '###################'
+      puts HR
       puts '### Initiating Autodownload...'
-      puts
-      puts "#{
-      puts
-      @data.
+      puts HR
+      puts "#{@data.count} - #{@file_ext}'s Located"
+      puts HR
+      move_to_download_dir
+      iterate_thru_collection_and_download
+      Dir.chdir('..')
+    end
+
+    private
+
+    def iterate_thru_collection_and_download
+      lenn = @data.count
+      @data.each_with_index do |entry, i|
         begin
           download_file(entry)
-          puts
-        rescue StandardError => e
-          puts 'ERROR: failed to download - #{entry}'
-          puts e.message
-          puts
+        rescue StandardError
+          puts "ERROR: failed to download - #{entry}"
         end
+        lg(" File [#{i + 1} of #{lenn}]\n")
       end
+    end
+
+    def move_to_download_dir(dir_name = 'rr-downloads')
+      if File.directory?(dir_name)
+        Dir.chdir(dir_name)
+      else
+        puts "creating #{dir_name} Directory"
+        Dir.mkdir(dir_name)
+        Dir.chdir(dir_name)
+      end
+      puts "Downloading files to local directory: '/#{dir_name}/'"
     end
   end
 end
data/lib/retriever/fetchseo.rb
CHANGED
@@ -6,19 +6,11 @@ module Retriever
     # on all unique pages found on the site
     def initialize(url, options)
       super
-      @data
-      page_one = Retriever::Page.new(@t.source, @t)
-      lg("URL Crawled: #{@t.target}")
-      @link_stack = page_one.parse_internal_visitable
-      errlog("Bad URL -- #{@t.target}") unless @link_stack
-      lg("#{@link_stack.size - 1} links found")
-      @link_stack.delete(@t.target)
-      @data.push(page_one.parse_seo)
+      @data.push(@page_one.parse_seo)

       async_crawl_and_collect
+      # done, make sure progress bar says we are done
+      @progressbar.finish if @progress
       @data.sort_by! { |x| x[0].length }
     end
   end
data/lib/retriever/fetchsitemap.rb
CHANGED
@@ -5,37 +5,38 @@ module Retriever
     # returns an array of all unique pages found on the site
     def initialize(url, options)
       super
-      @data
-      page_one = Retriever::Page.new(@t.source, @t)
-      lg("URL Crawled: #{@t.target}")
-      @link_stack = page_one.parse_internal_visitable
-      errlog("Bad URL -- #{@t.target}") unless @link_stack
-      lg("#{@link_stack.size - 1} links found")
-      @link_stack.delete(@t.target)
+      @data.push(@t.target)
       @data.concat(@link_stack)

       async_crawl_and_collect
+      # done, make sure progress bar says we are done
+      @progressbar.finish if @progress
       @data.sort_by! { |x| x.length } if @data.size > 1
       @data.uniq!
     end

+    private
+
     # produces valid XML sitemap based on page collection fetched.
     # Writes to current directory.
     def gen_xml
-      f
+      filename = @t.host.split('.')[1]
+      f = File.open("sitemap-#{filename}.xml", 'w+')
+      f << "<?xml version='1.0' encoding='UTF-8'?>"
+      f << "<urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
+      @data.each do |url|
+        f << "<url><loc>#{url}</loc></url>"
+      end
       f << '</urlset>'
       f.close
+      print_file_info(filename)
+    end
+
+    def print_file_info(filename)
+      puts HR
+      puts "File Created: sitemap-#{filename}.xml"
       puts "Object Count: #{@data.size}"
-      puts
-      puts
+      puts HR + "\n"
     end
   end
 end
data/lib/retriever/link.rb
CHANGED
@@ -1,33 +1,35 @@
+require 'addressable/uri'
 module Retriever
   #
   class Link
-    HTTP_RE = Regexp.new(/^http/i).freeze
-    DOUBLE_SLASH_RE = Regexp.new(
+    # HTTP_RE = Regexp.new(/^http/i).freeze
+    SLASH_RE = Regexp.new(%r(^/{1}[^/])).freeze
+    DOUBLE_SLASH_RE = Regexp.new(%r(^/{2}[^/])).freeze
+    WWW_DOT_RE = Regexp.new(/^www\./i).freeze
+
+    def initialize(target_scheme, target_host, this_link)
+      @link_uri = Addressable::URI.parse(this_link)
+      @scheme = target_scheme
+      @host = target_host
+      @this_link = @link_uri.to_s
     end

     def path
-      return
+      return this_link if link_uri.absolute?

-      return "
+      return "#{@scheme}://#{this_link}" if WWW_DOT_RE =~ this_link

-      return "
+      return "#{@scheme}://#{host}#{this_link}" if SLASH_RE =~ this_link

       # link begins with '//'
-      return "
+      return "#{@scheme}:#{this_link}" if DOUBLE_SLASH_RE =~ this_link

       # link uses relative path with no slashes at all
-      return "
+      return "#{@scheme}://#{host}/#{this_link}" if link_uri.relative?
     end

     private

-    attr_reader :host, :
+    attr_reader :this_link, :host, :link_uri
   end
 end
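
To make the resolution rules above concrete, a quick sketch of the new scheme-aware Link (inputs and expected outputs mirror link_spec.rb further down):

    require 'retriever'

    Retriever::Link.new('http', 'www.cnet.com', '/test.html').path
    # => "http://www.cnet.com/test.html"     (leading-slash path, SLASH_RE)
    Retriever::Link.new('http', 'www.cnet.com', 'www.cnet.com/download.exe').path
    # => "http://www.cnet.com/download.exe"  (protocol-less www. link, WWW_DOT_RE)
    Retriever::Link.new('http', 'www.cnet.com', 'cpage_18').path
    # => "http://www.cnet.com/cpage_18"      (bare relative path)
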
data/lib/retriever/{openuri-redirect-patch.rb → openuri_redirect_patch.rb}
CHANGED
@@ -1,6 +1,7 @@
+#
 module OpenURI
   # nesc patch otherwise OPENURI blocks redirects to and from https
-  def
+  def self.redirectable?(uri1, uri2)
     uri1.scheme.downcase == uri2.scheme.downcase ||
       (/\A(?:http|https)\z/i =~ uri1.scheme && /\A(?:http|https)\z/i =~ uri2.scheme)
   end
data/lib/retriever/page.rb
CHANGED
@@ -1,21 +1,40 @@
+require 'addressable/uri'
+
 module Retriever
   #
   class Page
+    HTTP_RE = Regexp.new(/^http/i).freeze
+    H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
+    H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze
+    TITLE_RE = Regexp.new(/<title>(.*)<\/title>/i).freeze
+    DESC_RE = Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\']
+                          [^>]*content=[\"]
+                          (
+                          [^\"]*
+                          )
+                          [\"]
+                          [^>]
+                          *>
+                          /ix).freeze
+    HREF_CONTENTS_RE = Regexp.new(/\shref=
+                          ['|"]
+                          (
+                          [^\s]
+                          [a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+
+                          )
+                          ['|"]
+                          [\s|\W]
+                          /ix).freeze
+    NONPAGE_EXT_RE = Regexp.new(/\.
+                          (?:css|js|png|gif|jpg|mp4|
+                          wmv|flv|mp3|wav|doc|txt|ico|xml)
+                          /ix).freeze

     attr_reader :links, :source, :t

     def initialize(source, t)
       @t = t
-      @source = source.encode('UTF-8', :
+      @source = source.encode('UTF-8', invalid: :replace, undef: :replace)
       @links = nil
     end

@@ -28,20 +47,20 @@ module Retriever
       # filter some malformed URLS that come in
       # meant to be a loose filter to catch all reasonable HREF attributes.
         link = match[0]
-        Link.new(@t.host, link).path
-      end.uniq
+        Link.new(@t.scheme, @t.host, link).path
+      end.compact.uniq
     end

     def parse_internal
-      links.select { |
+      links.select { |x| @t.host == Addressable::URI.parse(x).host }
     end

     def parse_internal_visitable
-      parse_internal.select { |
+      parse_internal.select { |x| !(NONPAGE_EXT_RE =~ x) }
     end

-    def parse_files
+    def parse_files(arr)
+      arr.select { |x| @t.file_re =~ x }
     end

     def title
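
For orientation, a short usage sketch of the Page API as exercised by page_spec.rb below (the source string and target are illustrative):

    require 'retriever'

    t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)
    page = Retriever::Page.new("<a href='/test.html'>test</a> " \
                               "<a href='www.cnet.com/download.exe'>dl</a>", t)
    page.links                            # hrefs matched by HREF_CONTENTS_RE, resolved via Link#path
    page.parse_internal                   # links whose host equals the target host
    page.parse_internal_visitable         # same, minus NONPAGE_EXT_RE filetypes (css, js, images, ...)
    page.parse_files(page.parse_internal) # links matching the target's file_re (here .exe)
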
data/lib/retriever/target.rb
CHANGED
@@ -1,21 +1,22 @@
 require 'open-uri'
+require 'addressable/uri'

 module Retriever
   #
   class Target
-    HTTP_RE
-    DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+    HTTP_RE = Regexp.new(/^http/i).freeze

-    attr_reader :host, :target, :host_re, :source, :file_re
+    attr_reader :host, :target, :host_re, :source, :file_re, :scheme

     def initialize(url, file_re = nil)
-      @target
-      @host
-      @host_re
-      @file_re
+      fail 'Bad URL' unless url.include?('.')
+      url = "http://#{url}" unless HTTP_RE =~ url
+      target_uri = Addressable::URI.parse(url)
+      @target = target_uri.to_s
+      @host = target_uri.host
+      @host_re = Regexp.new(@host.sub('www.', ''))
+      @file_re ||= file_re
+      @scheme = target_uri.scheme
     end

     def source
@@ -31,13 +32,14 @@ module Retriever
       fail 'Domain is not working. Try the non-WWW version.' if resp == ''
       fail 'Domain not working. Try HTTPS???' unless resp
       # consider using scrub from ruby 2.1? this misses some things
-      resp.encode('UTF-8', 'binary', :
+      resp.encode('UTF-8', 'binary', invalid: :replace, undef: :replace)
     end

     def resync_target_and_return_source(url)
-      new_t
+      new_t = Retriever::Target.new(url)
       @target = new_t.target
-      @host
+      @host = new_t.host
+      @scheme = new_t.scheme
       new_t.source
     end
   end
data/lib/retriever/version.rb
CHANGED
data/lib/retriever.rb
CHANGED
data/readme.md
CHANGED
@@ -4,15 +4,29 @@

 By Joe Norton

-RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader.
+RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader.

-RubyRetriever (RR) uses asynchronous HTTP requests, thanks to [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony), to crawl webpages *very quickly*. Another neat thing about RR, is it uses a ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of page's it has already crawled.
-**Use at Own Risk**
-RR does NOT respect robots.txt, and RR currently - by default - launches up to 10 parallel GET requests at once. This is a feature, do not abuse it.
+RubyRetriever (RR) uses asynchronous HTTP requests, thanks to [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony), to crawl webpages *very quickly*. Another neat thing about RR, is it uses a ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of page's it has already crawled.

 **v1.0 Update (6/07/2014)** - Includes major code changes, a lot of bug fixes. Much better in dealing with redirects, and issues with the host changing, etc. Also, added the SEO mode -- which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility -- and thus, this was update 1.0!
+mission
+-------
+RubyRetriever aims to be the best command-line crawling, and scraping package written in Ruby.
+
+features
+--------
+* Asynchronous HTTP Requests thru EM & Synchrony
+* Bloom filter for tracking pages visited.
+* 3 CLI modes: 1) Sitemap, 2) File Harvest, 3) SEO
+
+use-cases
+---------
+RubyRetriever can do multiple things for you, with a single command at the terminal RR can:
+1. Crawl your website and output a *valid XML sitemap* based on what it found.
+2. Crawl a target website and *download all files of a given filetype*.
+3. Crawl a target website and *collect important SEO information* such as page titles, meta descriptions, h1 tags, etc. and write it to CSV.

+Help & Forks Welcome!

 getting started
 -----------
data/spec/link_spec.rb
CHANGED
@@ -1,66 +1,68 @@
 require 'retriever'

-describe
+describe 'Link' do

+  t = Retriever::Target.new('http://www.cnet.com/reviews/')
+  let(:links) { Retriever::Page.new(@source, t).links }

-    <a href='http://www.cnet.com/download.exe'>download</a>
+  it 'collects links in anchor tags' do
+    @source = (<<SOURCE).strip
+    <a href='http://www.cnet.com/download.exe'>download</a>
 SOURCE

+    expect(links).to include('http://www.cnet.com/download.exe')
+  end

+  it 'collects links in link tags' do
+    @source = (<<SOURCE).strip
+    <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
 SOURCE

+    expect(links[0]).to include('formreset.css?ver=1.7.12')
+  end

+  it 'does not collect bare links (ones not in an href)' do
+    @source = (<<SOURCE).strip
     http://www.google.com
 SOURCE

+    expect(links).to_not include('http://www.google.com')
+  end

+  it 'collects only unique href links on the page' do
+    @source = (<<SOURCE).strip
     <a href='http://www.cnet.com/products/gadgets'>gadgets</a>
     <a href='http://www.cnet.com/products/gadgets'>gadgets2</a>
 SOURCE

+    expect(links.size).to eq(1)
+  end

+  it 'adds a protocol to urls missing them (www.)' do
+    @source = (<<SOURCE).strip
     <a href='www.cnet.com/download.exe'>download</a>
 SOURCE

+    expect(links).to include('http://www.cnet.com/download.exe')
+  end

+  it "doesn't care about any extra attributes on the anchor tag" do
+    @source = (<<SOURCE).strip
     <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
-    <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'
+    <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
+    </a>
 SOURCE

+    expect(links.size).to eq(1)
+  end

+  it 'returns relative urls with full path based on hostname' do
+    @source = (<<SOURCE).strip
     <a href='/test.html'>test</a>
     <a href='cpage_18'>about</a>
 SOURCE

-end
+    expect(links).to include('http://www.cnet.com/test.html',
+                             'http://www.cnet.com/cpage_18')
+  end
+end
data/spec/page_spec.rb
CHANGED
@@ -1,93 +1,97 @@
 require 'retriever/page'
 require 'retriever/fetch'

-t = Retriever::Target.new(
+t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)

-describe
+describe 'Page' do

-  describe
-    let
-    it
+  describe '#links' do
+    let(:links) { Retriever::Page.new(@source, t).links }
+    it 'collects all unique href links on the page' do
+      @source = (<<SOURCE).strip
       <a href='www.cnet.com/download.exe'>download</a>
       <a href='/test.html'>test</a>
-      <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'
+      <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
+      </a>
       <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
-      <a href='http://www.yahoo.com/test/'>yahoo</a>
+      <a href='http://www.yahoo.com/test/'>yahoo</a>
 SOURCE

       expect(links.size).to eq(4)
     end
   end

-  describe
-    let
+  describe '#parse_internal' do
+    let(:page) { Retriever::Page.new(@source, t) }
+    let(:links) { page.parse_internal }
+    it 'filters links by host' do
+      @source = (<<SOURCE).strip
       <a href='http://www.cnet.com/'>download</a>
+      <a href='http://www.yahoo.com/test/'>yahoo</a>
 SOURCE

+      expect(links.size).to eq(1)
     end
   end

-  describe
-    let
+  describe '#parse_internal_visitable' do
+    let(:page) { Retriever::Page.new(@source, t) }
+    let(:links) { page.parse_internal_visitable }
     it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
+      @source = (<<SOURCE).strip
       <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
 SOURCE
+      expect(links.size).to eq(0)
     end
   end

-  describe
-    let
+  describe '#parse_files' do
+    let(:page) { Retriever::Page.new(@source, t) }
+    let(:files) { page.parse_files(page.parse_internal) }
+    it 'filters links by filetype' do
+      @source = (<<SOURCE).strip
       <a href='www.cnet.com/download.exe'>download</a>
-      http://www.google.com
+      http://www.google.com
       <a href='/test.html'>test</a>
 SOURCE
+      expect(files.size).to eq(1)
     end
   end

-    it
+  describe '#title' do
+    let(:page) { Retriever::Page.new(@source, t) }
+    it 'returns page title' do
+      @source = (<<SOURCE).strip
       <title>test</title>
 SOURCE
+      expect(page.title).to eq('test')
     end
   end
-    it
+  describe '#desc' do
+    let(:page) { Retriever::Page.new(@source, t) }
+    it 'returns meta description' do
+      @source = (<<SOURCE).strip
       <meta name='description' content="test2 ">
 SOURCE
+      expect(page.desc).to eq('test2 ')
     end
   end
-    it
+  describe '#h1' do
+    let(:page) { Retriever::Page.new(@source, t) }
+    it 'returns h1 text' do
+      @source = (<<SOURCE).strip
       <h1>test 3</h1>
 SOURCE
+      expect(page.h1).to eq('test 3')
     end
   end
-    it
+  describe '#h2' do
+    let(:page) { Retriever::Page.new(@source, t) }
+    it 'returns h2 text' do
+      @source = (<<SOURCE).strip
       <h2> test 4 </h2>
 SOURCE
+      expect(page.h2).to eq(' test 4 ')
     end
   end
 end
data/spec/retriever_spec.rb
CHANGED
data/spec/target_spec.rb
CHANGED
@@ -1,44 +1,44 @@
 require 'retriever'
 require 'open-uri'

-t = Retriever::Target.new(
+t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)

-describe
+describe 'Target' do

+  it 'creates target var' do
+    expect(t.target).to eq('http://www.cnet.com/reviews/')
+  end

+  it 'creates host var' do
+    expect(t.host).to eq('www.cnet.com')
+  end

+  it 'creates host_re var' do
+    expect(t.host_re).to eq(/cnet.com/)
+  end

+  it 'creates file_re var (when provided)' do
+    expect(t.file_re).to eq(/\.exe\z/)
+  end

+  it 'adds protocol to Target URL if none given' do
+    expect(Retriever::Target.new('cnet.com').target).to eq('http://cnet.com')
+  end

+  it 'fails if given URL has no dot in it' do
+    expect { Retriever::Target.new('cnetcom') }.to raise_error
+  end

-  describe
+  describe '#source' do

-    it
-      expect(Retriever::Target.new(
+    it 'opens URL and returns source as String' do
+      expect(Retriever::Target.new('http://techcrunch.com/').source.class)
+        .to eq(String)
     end

-    it
-      expect{Retriever::Target.new(
+    it 'fails if target redirects to new host' do
+      expect { Retriever::Target.new('http://tinyurl.com/nkfkypa').source }
+        .to raise_error
     end
   end
-end
+end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 1.1.0
+  version: 1.2.0
 platform: ruby
 authors:
 - Joe Norton
@@ -66,6 +66,20 @@ dependencies:
     - - '>='
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: addressable
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -125,7 +139,7 @@ files:
 - lib/retriever/fetchseo.rb
 - lib/retriever/fetchsitemap.rb
 - lib/retriever/link.rb
-- lib/retriever/openuri-redirect-patch.rb
+- lib/retriever/openuri_redirect_patch.rb
 - lib/retriever/page.rb
 - lib/retriever/target.rb
 - lib/retriever/version.rb