rubyretriever 1.2.3 → 1.2.4
- checksums.yaml +4 -4
- data/bin/rr +1 -1
- data/lib/retriever/fetch.rb +10 -10
- data/lib/retriever/fetchfiles.rb +6 -6
- data/lib/retriever/fetchseo.rb +2 -2
- data/lib/retriever/fetchsitemap.rb +6 -6
- data/lib/retriever/link.rb +6 -1
- data/lib/retriever/page.rb +1 -1
- data/lib/retriever/target.rb +1 -1
- data/lib/retriever/version.rb +1 -1
- data/readme.md +19 -15
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a5749cf55198f97bab6c77297bf6409a2518bca0
+  data.tar.gz: df792b6d3b1d03a8b70faadf651e20779f4fd1e8
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 76e12598873e8779e196f84ec040a5c9fef01e410c884fa88b18f41335848a69dea778459bbb9629f941dbe12c8ab2c57032d24007a1f4fd57c5caec3a76abea
+  data.tar.gz: 13b91c96ce17eb8ce802250505943d13efbd292cd95b120336ddd0ddd98278c513310c68eb9606a92b4018ef6ec9369369ee5869eff0abb589457c7b3bde41c3
data/bin/rr
CHANGED
data/lib/retriever/fetch.rb
CHANGED
@@ -10,12 +10,12 @@ module Retriever
   #
   class Fetch
     HR = '###############################'
-    attr_reader :max_pages, :t
+    attr_reader :max_pages, :t, :result
     # given target URL and RR options, creates a fetch object.
     # There is no direct output
     # this is a parent class that the other fetch classes build off of.
     def initialize(url, options)
-      @
+      @result = []
       @connection_tally = {
         success: 0,
         error: 0,
@@ -52,9 +52,9 @@ module Retriever
       elsif @seo
         puts 'SEO Metrics'
       end
-      puts "Data Dump -- Object Count: #{@
+      puts "Data Dump -- Object Count: #{@result.size}"
       puts HR
-      @
+      @result.each do |line|
         puts line
       end
       puts
@@ -69,13 +69,13 @@ module Retriever
           csv << ['URL', 'Page Title', 'Meta Description', 'H1', 'H2']
           i += 1
         end
-        @
+        @result.each do |entry|
           csv << entry
         end
       end
       puts HR
       puts "File Created: #{@output}.csv"
-      puts "Object Count: #{@
+      puts "Object Count: #{@result.size}"
       puts HR
       puts
     end
@@ -152,9 +152,9 @@ module Retriever
         next if new_links_arr.nil? || new_links_arr.empty?
         @link_stack.concat(new_links_arr)
         next unless @sitemap
-        @
+        @result.concat(new_links_arr)
       end
-      @
+      @result.uniq!
     end

     # returns true is resp is ok to continue
@@ -193,13 +193,13 @@ module Retriever
     def push_seo_to_data(url, new_page)
       seos = [url]
       seos.concat(new_page.parse_seo)
-      @
+      @result.push(seos)
       lg('--page SEO scraped')
     end

     def push_files_to_data(new_page)
       filez = new_page.parse_files(new_page.parse_internal)
-      @
+      @result.concat(filez) unless filez.empty?
       lg("--#{filez.size} files found")
     end

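The `attr_reader :result` added to `Retriever::Fetch` means the collected data is now readable from outside the fetcher rather than only being printed or written to disk. A minimal sketch of how that might be used from library code — the `require` path, the `Retriever::FetchSitemap` class name, and the option keys are assumptions not shown in this diff:

```ruby
require 'retriever'  # assumed entry point of the rubyretriever gem

# Hypothetical options hash; the exact keys accepted by the Fetch
# subclasses are not part of this diff.
opts = { 'maxpages' => 10 }

# The crawl runs inside #initialize (async_crawl_and_collect), so the
# object is fully populated once .new returns.
crawler = Retriever::FetchSitemap.new('http://www.cnet.com', opts)

# New in 1.2.4: the internal collection is exposed via attr_reader :result,
# already sorted and deduplicated by the subclass initializers above.
puts crawler.result.size
crawler.result.each { |url| puts url }
```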
data/lib/retriever/fetchfiles.rb
CHANGED
@@ -6,13 +6,13 @@ module Retriever
     def initialize(url, options)
       super
       temp_file_collection = @page_one.parse_files(@page_one.parse_internal)
-      @
-      lg("#{@
+      @result.concat(temp_file_collection) if temp_file_collection.size > 0
+      lg("#{@result.size} new files found")

       async_crawl_and_collect
       # done, make sure progress bar says we are done
       @progressbar.finish if @progress
-      @
+      @result.sort_by! { |x| x.length }
     end

     def download_file(path)
@@ -33,7 +33,7 @@ module Retriever
       puts HR
       puts '### Initiating Autodownload...'
       puts HR
-      puts "#{@
+      puts "#{@result.count} - #{@file_ext}'s Located"
       puts HR
       move_to_download_dir
       iterate_thru_collection_and_download
@@ -43,8 +43,8 @@ module Retriever
     private

     def iterate_thru_collection_and_download
-      lenn = @
-      @
+      lenn = @result.count
+      @result.each_with_index do |entry, i|
         begin
           download_file(entry)
         rescue StandardError
data/lib/retriever/fetchseo.rb
CHANGED
@@ -6,12 +6,12 @@ module Retriever
     # on all unique pages found on the site
     def initialize(url, options)
       super
-      @
+      @result.push(@page_one.parse_seo)

       async_crawl_and_collect
       # done, make sure progress bar says we are done
       @progressbar.finish if @progress
-      @
+      @result.sort_by! { |x| x[0].length }
     end
   end
 end
data/lib/retriever/fetchsitemap.rb
CHANGED
@@ -5,14 +5,14 @@ module Retriever
     # returns an array of all unique pages found on the site
     def initialize(url, options)
       super
-      @
-      @
+      @result.push(@t.target)
+      @result.concat(@link_stack)

       async_crawl_and_collect
       # done, make sure progress bar says we are done
       @progressbar.finish if @progress
-      @
-      @
+      @result.sort_by! { |x| x.length } if @result.size > 1
+      @result.uniq!
     end

     private
@@ -24,7 +24,7 @@ module Retriever
       f = File.open("sitemap-#{filename}.xml", 'w+')
       f << "<?xml version='1.0' encoding='UTF-8'?>"
       f << "<urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
-      @
+      @result.each do |url|
         f << "<url><loc>#{url}</loc></url>"
       end
       f << '</urlset>'
@@ -35,7 +35,7 @@ module Retriever
     def print_file_info(filename)
       puts HR
       puts "File Created: sitemap-#{filename}.xml"
-      puts "Object Count: #{@
+      puts "Object Count: #{@result.size}"
       puts HR + "\n"
     end
   end
data/lib/retriever/link.rb
CHANGED
@@ -8,7 +8,12 @@ module Retriever
     WWW_DOT_RE = Regexp.new(/^www\./i).freeze

     def initialize(target_scheme, target_host, this_link)
-
+      begin
+        @link_uri = Addressable::URI.parse(Addressable::URI.encode(this_link)).normalize
+      rescue Addressable::URI::InvalidURIError => e
+        dummy_link = Retriever::Link.new(target_scheme, target_host, target_host)
+        @link_uri = Addressable::URI.parse(dummy_link.path)
+      end
       @scheme = target_scheme
       @host = target_host
       @this_link = @link_uri.to_s
data/lib/retriever/page.rb
CHANGED
data/lib/retriever/target.rb
CHANGED
@@ -11,7 +11,7 @@ module Retriever
     def initialize(url, file_re = nil)
       fail 'Bad URL' unless url.include?('.')
       url = "http://#{url}" unless HTTP_RE =~ url
-      target_uri = Addressable::URI.parse(url)
+      target_uri = Addressable::URI.parse(Addressable::URI.encode(url))
       @target = target_uri.to_s
       @host = target_uri.host
       @host_re = Regexp.new(@host.sub('www.', ''))
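Both link.rb and target.rb now run URLs through `Addressable::URI.encode` before parsing, and Link#initialize additionally rescues `Addressable::URI::InvalidURIError`. A small illustrative sketch of what the encode-then-parse step does; the example URL is made up:

```ruby
require 'addressable/uri'

raw_link = 'http://example.com/some page?q=ruby retriever'

# 1.2.4 percent-encodes the raw string before parsing, so hrefs containing
# spaces or other unescaped characters no longer raise during a crawl.
uri = Addressable::URI.parse(Addressable::URI.encode(raw_link)).normalize
puts uri.to_s
# => "http://example.com/some%20page?q=ruby%20retriever"

# If parsing still fails, Link#initialize (above) rescues
# Addressable::URI::InvalidURIError and falls back to a link built from the
# crawl target's own host, so one bad href cannot abort the crawl.
```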
data/lib/retriever/version.rb
CHANGED
data/readme.md
CHANGED
@@ -6,29 +6,33 @@ By Joe Norton
 
 RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader.
 
-RubyRetriever (RR) uses asynchronous HTTP requests
+RubyRetriever (RR) uses asynchronous HTTP requests via [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony) to crawl webpages *very quickly*. RR also uses a Ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of pages it has already crawled.
 
-**v1.0 Update (6/07/2014)** - Includes major code changes
-
+**v1.0 Update (6/07/2014)** - Includes major code changes and a lot of bug fixes. It's now much better in dealing with redirects, issues with the host changing, etc. Also added the SEO mode, which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility; thus, this was update 1.0!
+
+Mission
 -------
-RubyRetriever aims to be the best command-line crawling
+RubyRetriever aims to be the best command-line crawling and scraping package written in Ruby.
 
-
+Features
 --------
 * Asynchronous HTTP Requests thru EM & Synchrony
-* Bloom filter for tracking pages
-* 3 CLI modes
+* Bloom filter for tracking visited pages
+* 3 CLI modes
+    * Sitemap
+    * File Harvest
+    * SEO
 
-
+Use cases
 ---------
-RubyRetriever can do multiple things for you
+RubyRetriever can do multiple things for you. With a single command at the terminal, RR can:
 1. Crawl your website and output a *valid XML sitemap* based on what it found.
 2. Crawl a target website and *download all files of a given filetype*.
-3. Crawl a target website
+3. Crawl a target website, *collect important SEO information* such as page titles, meta descriptions and h1 tags, and write it to CSV.
 
 Help & Forks Welcome!
 
-
+Getting started
 -----------
 Install the gem
 ```sh
@@ -44,7 +48,7 @@ OR -- SAME COMMAND
 rr -s csv -p -l 100 http://www.cnet.com
 ```
 
-This would
+This would map http://www.cnet.com until it crawled a max of 100 pages, then write the results to a CSV named cnet. Optionally, you could also use the format XML and RR would output the same URL list into a valid XML sitemap that could be submitted to Google.
 
 **Example: File Harvesting mode**
 ```sh
@@ -55,7 +59,7 @@ OR -- SAME COMMAND
 rr -f pdf -p -l 100 http://www.hubspot.com
 ```
 
-This would
+This would crawl http://www.hubspot.com looking for filetype:PDF until it hit a max of 100 pages, then write out a list of filepaths to a CSV named hubspot (based on the website host name). Optionally, you could have the script autodownload all the files by adding the -a/--auto flag.
 
 **Example: SEO mode**
 ```sh
@@ -66,7 +70,7 @@ OR -- SAME COMMAND
 rr -e -p -l 10 -o cnet-seo http://www.cnet.com
 ```
 
-This would go to http://www.cnet.com and crawl a max of 100 pages, during which it would
+This would go to http://www.cnet.com and crawl a max of 100 pages, during which it would collect the SEO fields on those pages - this currently means [url, page title, meta description, h1 text, h2 text]. It would then write the fields to a csv named cnet-seo.
 
 
 command-line arguments
@@ -93,4 +97,4 @@ bloomfilter-rb
 
 License
 -------
-See included 'LICENSE' file. It's the MIT license.
+See included 'LICENSE' file. It's the MIT license.
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 1.2.3
+  version: 1.2.4
 platform: ruby
 authors:
 - Joe Norton
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-06-
+date: 2014-06-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: em-synchrony