rubyretriever 1.2.3 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/rr +1 -1
- data/lib/retriever/fetch.rb +10 -10
- data/lib/retriever/fetchfiles.rb +6 -6
- data/lib/retriever/fetchseo.rb +2 -2
- data/lib/retriever/fetchsitemap.rb +6 -6
- data/lib/retriever/link.rb +6 -1
- data/lib/retriever/page.rb +1 -1
- data/lib/retriever/target.rb +1 -1
- data/lib/retriever/version.rb +1 -1
- data/readme.md +19 -15
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a5749cf55198f97bab6c77297bf6409a2518bca0
+  data.tar.gz: df792b6d3b1d03a8b70faadf651e20779f4fd1e8
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 76e12598873e8779e196f84ec040a5c9fef01e410c884fa88b18f41335848a69dea778459bbb9629f941dbe12c8ab2c57032d24007a1f4fd57c5caec3a76abea
+  data.tar.gz: 13b91c96ce17eb8ce802250505943d13efbd292cd95b120336ddd0ddd98278c513310c68eb9606a92b4018ef6ec9369369ee5869eff0abb589457c7b3bde41c3
data/bin/rr
CHANGED
data/lib/retriever/fetch.rb
CHANGED
@@ -10,12 +10,12 @@ module Retriever
   #
   class Fetch
     HR = '###############################'
-    attr_reader :max_pages, :t
+    attr_reader :max_pages, :t, :result
     # given target URL and RR options, creates a fetch object.
     # There is no direct output
    # this is a parent class that the other fetch classes build off of.
     def initialize(url, options)
-      @
+      @result = []
       @connection_tally = {
         success: 0,
         error: 0,
@@ -52,9 +52,9 @@ module Retriever
       elsif @seo
         puts 'SEO Metrics'
       end
-      puts "Data Dump -- Object Count: #{@
+      puts "Data Dump -- Object Count: #{@result.size}"
       puts HR
-      @
+      @result.each do |line|
         puts line
       end
       puts
@@ -69,13 +69,13 @@ module Retriever
           csv << ['URL', 'Page Title', 'Meta Description', 'H1', 'H2']
           i += 1
         end
-        @
+        @result.each do |entry|
           csv << entry
         end
       end
       puts HR
       puts "File Created: #{@output}.csv"
-      puts "Object Count: #{@
+      puts "Object Count: #{@result.size}"
       puts HR
       puts
     end
@@ -152,9 +152,9 @@ module Retriever
         next if new_links_arr.nil? || new_links_arr.empty?
         @link_stack.concat(new_links_arr)
         next unless @sitemap
-        @
+        @result.concat(new_links_arr)
       end
-      @
+      @result.uniq!
     end

     # returns true is resp is ok to continue
@@ -193,13 +193,13 @@ module Retriever
     def push_seo_to_data(url, new_page)
       seos = [url]
       seos.concat(new_page.parse_seo)
-      @
+      @result.push(seos)
       lg('--page SEO scraped')
     end

     def push_files_to_data(new_page)
       filez = new_page.parse_files(new_page.parse_internal)
-      @
+      @result.concat(filez) unless filez.empty?
       lg("--#{filez.size} files found")
     end
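The core of this file's change is a rename of the crawler's internal collection to `@result`, now exposed read-only through `attr_reader`, so callers can read crawl output off the fetch object instead of reaching into instance state. A toy stand-in for that pattern (a hypothetical class for illustration, not the gem's actual `Fetch`, which also crawls asynchronously):

```ruby
# Minimal sketch of the attr_reader pattern adopted above; TinyFetch is a
# hypothetical stand-in, not part of rubyretriever.
class TinyFetch
  attr_reader :result

  def initialize(urls)
    @result = []          # collected results live in one internal array
    @result.concat(urls)
    @result.uniq!         # de-duplicate, as the crawler does after collecting
  end
end

fetch = TinyFetch.new(['http://example.com/', 'http://example.com/', 'http://example.com/about'])
puts fetch.result         # readable from outside via the exposed reader
```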
data/lib/retriever/fetchfiles.rb
CHANGED
@@ -6,13 +6,13 @@ module Retriever
     def initialize(url, options)
       super
       temp_file_collection = @page_one.parse_files(@page_one.parse_internal)
-      @
-      lg("#{@
+      @result.concat(temp_file_collection) if temp_file_collection.size > 0
+      lg("#{@result.size} new files found")

       async_crawl_and_collect
       # done, make sure progress bar says we are done
       @progressbar.finish if @progress
-      @
+      @result.sort_by! { |x| x.length }
     end

     def download_file(path)
@@ -33,7 +33,7 @@ module Retriever
       puts HR
       puts '### Initiating Autodownload...'
       puts HR
-      puts "#{@
+      puts "#{@result.count} - #{@file_ext}'s Located"
       puts HR
       move_to_download_dir
       iterate_thru_collection_and_download
@@ -43,8 +43,8 @@ module Retriever
     private

     def iterate_thru_collection_and_download
-      lenn = @
-      @
+      lenn = @result.count
+      @result.each_with_index do |entry, i|
         begin
           download_file(entry)
         rescue StandardError
data/lib/retriever/fetchseo.rb
CHANGED
@@ -6,12 +6,12 @@ module Retriever
     # on all unique pages found on the site
     def initialize(url, options)
       super
-      @
+      @result.push(@page_one.parse_seo)

       async_crawl_and_collect
       # done, make sure progress bar says we are done
       @progressbar.finish if @progress
-      @
+      @result.sort_by! { |x| x[0].length }
     end
   end
 end
data/lib/retriever/fetchsitemap.rb
CHANGED
@@ -5,14 +5,14 @@ module Retriever
     # returns an array of all unique pages found on the site
     def initialize(url, options)
       super
-      @
-      @
+      @result.push(@t.target)
+      @result.concat(@link_stack)

       async_crawl_and_collect
       # done, make sure progress bar says we are done
       @progressbar.finish if @progress
-      @
-      @
+      @result.sort_by! { |x| x.length } if @result.size > 1
+      @result.uniq!
     end

     private
@@ -24,7 +24,7 @@ module Retriever
       f = File.open("sitemap-#{filename}.xml", 'w+')
       f << "<?xml version='1.0' encoding='UTF-8'?>"
       f << "<urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
-      @
+      @result.each do |url|
         f << "<url><loc>#{url}</loc></url>"
       end
       f << '</urlset>'
@@ -35,7 +35,7 @@ module Retriever
     def print_file_info(filename)
       puts HR
       puts "File Created: sitemap-#{filename}.xml"
-      puts "Object Count: #{@
+      puts "Object Count: #{@result.size}"
       puts HR + "\n"
     end
   end
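For reference, the loop above emits one `<url>` element per collected page. A standalone sketch of the same output format, with a fixed list standing in for the crawler's `@result`:

```ruby
# Mirrors the sitemap-writing loop in the diff above, using a hard-coded
# URL list in place of the crawler's @result array.
result = ['http://www.example.com/', 'http://www.example.com/about']

xml = "<?xml version='1.0' encoding='UTF-8'?>"
xml << "<urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
result.each { |url| xml << "<url><loc>#{url}</loc></url>" }
xml << '</urlset>'

puts xml
```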
data/lib/retriever/link.rb
CHANGED
@@ -8,7 +8,12 @@ module Retriever
     WWW_DOT_RE = Regexp.new(/^www\./i).freeze

     def initialize(target_scheme, target_host, this_link)
-
+      begin
+        @link_uri = Addressable::URI.parse(Addressable::URI.encode(this_link)).normalize
+      rescue Addressable::URI::InvalidURIError => e
+        dummy_link = Retriever::Link.new(target_scheme, target_host, target_host)
+        @link_uri = Addressable::URI.parse(dummy_link.path)
+      end
       @scheme = target_scheme
       @host = target_host
       @this_link = @link_uri.to_s
data/lib/retriever/page.rb
CHANGED
data/lib/retriever/target.rb
CHANGED
@@ -11,7 +11,7 @@ module Retriever
     def initialize(url, file_re = nil)
       fail 'Bad URL' unless url.include?('.')
       url = "http://#{url}" unless HTTP_RE =~ url
-      target_uri = Addressable::URI.parse(url)
+      target_uri = Addressable::URI.parse(Addressable::URI.encode(url))
       @target = target_uri.to_s
       @host = target_uri.host
       @host_re = Regexp.new(@host.sub('www.', ''))
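Both link.rb and target.rb now percent-encode the raw string before handing it to `Addressable::URI.parse`, and link.rb additionally rescues `InvalidURIError` by falling back to the target host. A simplified sketch of the idea, assuming the addressable gem is installed (the fallback here parses the host directly rather than building a dummy `Retriever::Link` as the gem does):

```ruby
# Encode-then-parse, with a condensed fallback for unparseable links.
require 'addressable/uri'

def parse_link(this_link, target_host)
  Addressable::URI.parse(Addressable::URI.encode(this_link)).normalize
rescue Addressable::URI::InvalidURIError
  Addressable::URI.parse(target_host)
end

puts parse_link('http://www.example.com/a page with spaces', 'www.example.com')
# => http://www.example.com/a%20page%20with%20spaces
```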
data/lib/retriever/version.rb
CHANGED
data/readme.md
CHANGED
@@ -6,29 +6,33 @@ By Joe Norton

 RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader.

-RubyRetriever (RR) uses asynchronous HTTP requests
+RubyRetriever (RR) uses asynchronous HTTP requests via [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony) to crawl webpages *very quickly*. RR also uses a Ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of pages it has already crawled.

-**v1.0 Update (6/07/2014)** - Includes major code changes
-
+**v1.0 Update (6/07/2014)** - Includes major code changes and a lot of bug fixes. It's now much better in dealing with redirects, issues with the host changing, etc. Also added the SEO mode, which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility; thus, this was update 1.0!
+
+Mission
 -------
-RubyRetriever aims to be the best command-line crawling
+RubyRetriever aims to be the best command-line crawling and scraping package written in Ruby.

-
+Features
 --------
 * Asynchronous HTTP Requests thru EM & Synchrony
-* Bloom filter for tracking pages
-* 3 CLI modes
+* Bloom filter for tracking visited pages
+* 3 CLI modes
+  * Sitemap
+  * File Harvest
+  * SEO

-
+Use cases
 ---------
-RubyRetriever can do multiple things for you
+RubyRetriever can do multiple things for you. With a single command at the terminal, RR can:
 1. Crawl your website and output a *valid XML sitemap* based on what it found.
 2. Crawl a target website and *download all files of a given filetype*.
-3. Crawl a target website
+3. Crawl a target website, *collect important SEO information* such as page titles, meta descriptions and h1 tags, and write it to CSV.

 Help & Forks Welcome!

-
+Getting started
 -----------
 Install the gem
 ```sh
@@ -44,7 +48,7 @@ OR -- SAME COMMAND
 rr -s csv -p -l 100 http://www.cnet.com
 ```

-This would
+This would map http://www.cnet.com until it crawled a max of 100 pages, then write the results to a CSV named cnet. Optionally, you could also use the format XML and RR would output the same URL list into a valid XML sitemap that could be submitted to Google.

 **Example: File Harvesting mode**
 ```sh
@@ -55,7 +59,7 @@ OR -- SAME COMMAND
 rr -f pdf -p -l 100 http://www.hubspot.com
 ```

-This would
+This would crawl http://www.hubspot.com looking for filetype:PDF until it hit a max of 100 pages, then write out a list of filepaths to a CSV named hubspot (based on the website host name). Optionally, you could have the script autodownload all the files by adding the -a/--auto flag.

 **Example: SEO mode**
 ```sh
@@ -66,7 +70,7 @@ OR -- SAME COMMAND
 rr -e -p -l 10 -o cnet-seo http://www.cnet.com
 ```

-This would go to http://www.cnet.com and crawl a max of 100 pages, during which it would
+This would go to http://www.cnet.com and crawl a max of 100 pages, during which it would collect the SEO fields on those pages - this currently means [url, page title, meta description, h1 text, h2 text]. It would then write the fields to a csv named cnet-seo.


 command-line arguments
@@ -93,4 +97,4 @@ bloomfilter-rb

 License
 -------
-See included 'LICENSE' file. It's the MIT license.
+See included 'LICENSE' file. It's the MIT license.
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 1.2.3
+  version: 1.2.4
 platform: ruby
 authors:
 - Joe Norton
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-06-
+date: 2014-06-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: em-synchrony