rubyretriever 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/lib/retriever/fetch.rb +2 -2
- data/lib/retriever/fetchfiles.rb +1 -1
- data/lib/retriever/fetchseo.rb +1 -1
- data/lib/retriever/fetchsitemap.rb +1 -1
- data/lib/retriever/link.rb +14 -7
- data/lib/retriever/page.rb +11 -3
- data/lib/retriever/page_iterator.rb +1 -1
- data/lib/retriever/version.rb +1 -1
- data/readme.md +30 -10
- data/spec/link_spec.rb +12 -3
- data/spec/page_spec.rb +34 -47
- data/spec/retriever_spec.rb +5 -5
- metadata +29 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9a282bd399fdff64f26493e2b76a44df2ece00e4
+  data.tar.gz: 57b52b0b8b56116ae924fe7a03c912526c3342b2
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 92b91cc247b847a845b48d33e7fdbbc828e2e3372663d06abcdbd0dc5a524357319dfddbc28f879419670aea0f0a5ab88934ae515b4d31087fe67d534bf95e26
+  data.tar.gz: 3bdb6d28f487209b36426e6b00a7c929d049e539d3cda583cbd13e73902d3230f0a3a2d081af4ab5939110b1e2685b4202e67a1b11bb256d4c924e46cdeefb57
data/LICENSE
CHANGED
data/lib/retriever/fetch.rb
CHANGED
@@ -84,7 +84,7 @@ module Retriever
       puts
     end

-
+    # returns true is resp is ok to continue
     def good_response?(resp, url)
       return false unless resp
       hdr = resp.response_header
@@ -128,7 +128,7 @@ module Retriever
       @sitemap = options['sitemap']
       @seo = options['seo']
       @autodown = options['autodown']
-      @file_re = Regexp.new(/.#{@fileharvest}
+      @file_re = Regexp.new(/.#{@fileharvest}/).freeze if @fileharvest
     end

     def setup_bloom_filter
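The rebuilt `@file_re` line interpolates the requested file extension into a regexp that the crawler later uses to pick file links out of each page. A minimal standalone sketch of that idea (the `fileharvest` value and the link list below are invented for illustration, not taken from the gem):

```ruby
# Sketch of the @file_re construction from fetch.rb, run outside the gem.
fileharvest = 'pdf'                              # hypothetical --fileharvest value
file_re = Regexp.new(/.#{fileharvest}/).freeze   # same construction as in the diff

links = [
  'http://example.com/report.pdf',
  'http://example.com/about.html',
  'http://example.com/manual.pdf?v=2'
]

# Only links matching the interpolated extension survive the filter.
puts links.select { |link| file_re =~ link }
```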
data/lib/retriever/fetchfiles.rb
CHANGED
data/lib/retriever/fetchseo.rb
CHANGED
data/lib/retriever/fetchsitemap.rb
CHANGED
data/lib/retriever/link.rb
CHANGED
@@ -7,16 +7,18 @@ module Retriever
     DOUBLE_SLASH_RE = Regexp.new(%r(^/{2}[^/])).freeze
     WWW_DOT_RE = Regexp.new(/^www\./i).freeze

-    def initialize(target_scheme, target_host, this_link)
+    def initialize(target_scheme, target_host, this_link, current_url)
       begin
-
-
-
-
+        #this_link = Addressable::URI.encode(this_link) //not necessary; and breaking links
+        @link_uri = Addressable::URI.parse(this_link)
+      rescue Addressable::URI::InvalidURIError
+        dummy = Retriever::Link.new(target_scheme, target_host, target_host, target_host)
+        @link_uri = Addressable::URI.parse(dummy.path)
       end
       @scheme = target_scheme
       @host = target_host
       @this_link = @link_uri.to_s
+      @current_page_url = current_url
     end

     def path
@@ -30,11 +32,16 @@ module Retriever
       return "#{@scheme}:#{this_link}" if DOUBLE_SLASH_RE =~ this_link

       # link uses relative path with no slashes at all
-
+      if link_uri.relative?
+        if @current_page_url[-1, 1] == "/"
+          return "#{@current_page_url}#{this_link}"
+        end
+        return "#{@current_page_url}/#{this_link}"
+      end
     end

     private

-    attr_reader :this_link, :host, :link_uri
+    attr_reader :this_link, :host, :link_uri, :current_page_url
   end
 end
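The practical effect of threading `current_url` through `Link` is that bare relative hrefs (like `cpage_18`) now resolve against the page they were found on rather than the site root. A hedged sketch of just that resolution rule, written as standalone code rather than the gem's class:

```ruby
# Standalone sketch of the relative-href rule added in link.rb.
# Assumption: current_page_url is the URL of the page being parsed.
def resolve_relative(current_page_url, this_link)
  # a page URL ending in '/' can take the relative path directly;
  # otherwise a separating slash is inserted between page and href
  return "#{current_page_url}#{this_link}" if current_page_url.end_with?('/')
  "#{current_page_url}/#{this_link}"
end

puts resolve_relative('http://www.cnet.com/reviews/', 'cpage_18')
# http://www.cnet.com/reviews/cpage_18
puts resolve_relative('http://www.cnet.com/reviews', 'cpage_18')
# http://www.cnet.com/reviews/cpage_18
```

This matches the updated expectation in spec/link_spec.rb, where `cpage_18` found on `http://www.cnet.com/reviews/` now resolves to `http://www.cnet.com/reviews/cpage_18`.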
data/lib/retriever/page.rb
CHANGED
@@ -1,3 +1,4 @@
+require 'nokogiri'
 require 'addressable/uri'
 #
 using SourceString
@@ -40,7 +41,7 @@ module Retriever
       @links = nil
     end

-    #
+    # receives page source as string
     # returns array of unique href links
     def links
       return @links if @links
@@ -49,12 +50,14 @@ module Retriever
         # filter some malformed URLS that come in
         # meant to be a loose filter to catch all reasonable HREF attributes.
         link = match[0]
-        Link.new(@t.scheme, @t.host, link).path
+        Link.new(@t.scheme, @t.host, link, @url).path
       end.compact.uniq
     end

     def parse_internal
-      links.select
+      links.select do |x|
+        @t.host == Addressable::URI.parse(Addressable::URI.encode(x)).host
+      end
     end

     def parse_internal_visitable
@@ -65,6 +68,11 @@ module Retriever
       arr.select { |x| @t.file_re =~ x }
     end

+    def parse_by_css(selector)
+      nokogiri_doc = Nokogiri::HTML(@source)
+      nokogiri_doc.css(selector).text
+    end
+
     def title
       TITLE_RE =~ @source ? @source.match(TITLE_RE)[1].decode_html : ''
     end
data/lib/retriever/page_iterator.rb
CHANGED
@@ -1,7 +1,7 @@
 module Retriever
   #
   class PageIterator < Fetch
-    #
+    # receives target url and RR options, and a block
     # runs the block on all pages during crawl, pushing
     # the returned value of the block onto a result stack
     # the complete data returned from the crawl is accessible thru self.result
data/lib/retriever/version.rb
CHANGED
data/readme.md
CHANGED
@@ -4,33 +4,53 @@

 By Joe Norton

-RubyRetriever is a Web Crawler,
+RubyRetriever is a Web Crawler, Scraper & File Harvester. Available as a command-line executable and as a crawling framework.

-RubyRetriever (RR) uses asynchronous HTTP requests via [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony) to crawl webpages *very quickly*. RR also uses a Ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of pages it has already crawled.
+RubyRetriever (RR) uses asynchronous HTTP requests via [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony) to crawl webpages *very quickly*. RR also uses a Ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of pages it has already crawled in a memory efficient manner.
+
+**v1.3.1 Update (3/24/2016)** - Several bug fixes.
+
+**v1.3.0 Update (6/22/2014)** - The major change in this update is the new PageIterator class which adds functionality for library/script usage. Now you can run custom blocks against each page during the crawl. This update also includes more tests, and other code improvements to improve modularity and testability.

 **v1.0 Update (6/07/2014)** - Includes major code changes and a lot of bug fixes. It's now much better in dealing with redirects, issues with the host changing, etc. Also added the SEO mode, which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility; thus, this was update 1.0!

 Mission
 -------
-RubyRetriever aims to be the best command-line crawling and scraping package written in Ruby.
+RubyRetriever aims to be the best command-line crawling and scraping package written in Ruby and a replacement for paid software such as Screaming Frog SEO Spider.
+
+
+Roadmap?
+Not sure. Feel free to offer your thoughts.
+
+Some Potential Ideas:
+* 'freeroam mode' - to go on cruising the net endlessly in fileharvest mode
+* 'dead-link finder' mode - collects links returning 404, or other error msgs
+* 'validate robots.txt' mode - outputs the bot-exposed sitemap of your site
+* more sophisticated SEO analysis? replace screaming frog? this would include checks for canonical URL, maybe some keyword density checks, content length checks, etc.

 Features
 --------
 * Asynchronous HTTP Requests thru EM & Synchrony
-* Bloom filter for tracking visited pages
-*
-
-
-*
+* Bloom filter for tracking visited pages
+* Supports HTTPS
+* Follows 301 redirects (if to same host)
+* 3 CLI modes
+* Sitemap - Find all links on a website, output a valid XML sitemap, or just a CSV
+* File Harvest - find all files linked to on a website, option to autodownload
+* SEO - collect important SEO info from every page, output to a CSV (or STDOUT)
+* Run a Custom Block on a Per-Page basis (PageIterator)

 Use cases
 ---------
-
+**As an Executable**
 With a single command at the terminal, RR can:
 1. Crawl your website and output a *valid XML sitemap* based on what it found.
 2. Crawl a target website and *download all files of a given filetype*.
 3. Crawl a target website, *collect important SEO information* such as page titles, meta descriptions and h1 tags, and write it to CSV.

+**Used in Custom scripts**
+As of version 1.3.0, with the PageIterator class you can pass a custom block that will get run against each page during a crawl, and collect the results in an array. This means you can define for yourself whatever it is you want to collect from each page during the crawl.
+
 Help & Forks Welcome!

 Getting started
@@ -94,7 +114,7 @@ and OPTIONS is the applicable:
 -h, --help *Display this screen*


-Using as a Library (starting as of version 1.3.0
+Using as a Library (starting as of version 1.3.0)
 ------------------

 If you want to collect something, other than that which the executable allows, on a 'per page' basis then you want to use the PageIterator class. Then you can run whatever block you want against each individual page's source code located during the crawl.
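Grounded in the PageIterator comment shown earlier in this diff ("receives target url and RR options, and a block", with the collected data "accessible thru self.result"), library usage looks roughly like the sketch below; the option key `'maxpages'` and the target URL are placeholder assumptions rather than quotes from the readme:

```ruby
require 'retriever'

# Hedged sketch of PageIterator usage: the block runs against every page
# fetched during the crawl, and its return values are collected in #result.
rr = Retriever::PageIterator.new('http://www.cnet.com', 'maxpages' => 10) do |page|
  [page.url, page.title]   # Page#url and Page#title both exist per this diff
end

rr.result.each { |url, title| puts "#{title} -- #{url}" }
```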
data/spec/link_spec.rb
CHANGED
@@ -3,7 +3,9 @@ require 'retriever'
 describe 'Link' do

   t = Retriever::Target.new('http://www.cnet.com/reviews/')
-  let(:links)
+  let(:links) do
+    Retriever::Page.new('http://www.cnet.com/reviews/', @source, t).links
+  end

   it 'collects links in anchor tags' do
     @source = (<<SOURCE).strip
@@ -46,7 +48,7 @@ SOURCE
     expect(links).to include('http://www.cnet.com/download.exe')
   end

-  it "doesn't care about any extra attributes on the anchor tag" do
+  it "doesn\'t care about any extra attributes on the anchor tag" do
     @source = (<<SOURCE).strip
 <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
 <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
@@ -63,6 +65,13 @@ SOURCE
 SOURCE

     expect(links).to include('http://www.cnet.com/test.html',
-                             'http://www.cnet.com/cpage_18')
+                             'http://www.cnet.com/reviews/cpage_18')
+  end
+  it 'collects files even when query strings exist' do
+    @source = (<<SOURCE).strip
+<a href='http://mises.org/system/tdf/Robert%20Nozick%20and%20Murray%20Rothbard%20David%20Gordon.mp3?file=1&type=audio' type='audio/mpeg; length=22217599' title='Robert Nozick and Murray Rothbard David Gordon.mp3'>Download audio file</a></span></div>
+SOURCE
+
+    expect(links).to include('http://mises.org/system/tdf/Robert%20Nozick%20and%20Murray%20Rothbard%20David%20Gordon.mp3?file=1&type=audio')
   end
 end
data/spec/page_spec.rb
CHANGED
@@ -4,102 +4,89 @@ require 'retriever/fetch'
 t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)

 describe 'Page' do
+  let(:common_source) do
+    <<-SOURCE
+    <title>test</title>
+    <a href='www.cnet.com/download.exe'>download</a>
+    <a href='/test.html'>test</a>
+    <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
+    </a>
+    <a href='http://www.cnet.com/products/gadgets/' id='gadgets-link'>gadgets </a>
+    <a href='http://www.yahoo.com/test/'>yahoo</a>"
+    <meta name='description' content="test2 ">
+    <h1>test 3</h1>
+    <h2> test 4 </h2>
+    SOURCE
+  end
+
   describe '#url' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     it 'returns current page URL' do
-      @source = (<<SOURCE).strip
-      <a href='http://www.cnet.com/'>download</a>
-      SOURCE
       expect(page.url).to eq('http://www.cnet.com/')
     end
   end

   describe '#links' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     it 'collects all unique href links on the page' do
-      @source = (<<SOURCE).strip
-      <a href='www.cnet.com/download.exe'>download</a>
-      <a href='/test.html'>test</a>
-      <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
-      </a>
-      <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
-      <a href='http://www.yahoo.com/test/'>yahoo</a>
-      SOURCE
-
       expect(page.links.size).to eq(4)
     end
   end

   describe '#parse_internal' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     let(:links) { page.parse_internal }
     it 'filters links by host' do
-
-      <a href='http://www.cnet.com/'>download</a>
-      <a href='http://www.yahoo.com/test/'>yahoo</a>
-      SOURCE
-
-      expect(links.size).to eq(1)
+      expect(links.size).to eq(3)
     end
   end

   describe '#parse_internal_visitable' do
-    let(:
+    let(:source) { "<link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />" }
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', source, t) }
     let(:links) { page.parse_internal_visitable }
     it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
-      @source = (<<SOURCE).strip
-      <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
-      SOURCE
       expect(links.size).to eq(0)
     end
   end

   describe '#parse_files' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     let(:files) { page.parse_files(page.parse_internal) }
     it 'filters links by filetype' do
-      @source = (<<SOURCE).strip
-      <a href='www.cnet.com/download.exe'>download</a>
-      http://www.google.com
-      <a href='/test.html'>test</a>
-      SOURCE
       expect(files.size).to eq(1)
     end
   end

+  describe '#parse_by_css' do
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
+
+    it 'returns the text from the received css selector' do
+      expect(page.parse_by_css('#gadgets-link')).to eq('gadgets ')
+    end
+  end
+
   describe '#title' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     it 'returns page title' do
-      @source = (<<SOURCE).strip
-      <title>test</title>
-      SOURCE
       expect(page.title).to eq('test')
     end
   end
   describe '#desc' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     it 'returns meta description' do
-      @source = (<<SOURCE).strip
-      <meta name='description' content="test2 ">
-      SOURCE
       expect(page.desc).to eq('test2 ')
     end
   end
   describe '#h1' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     it 'returns h1 text' do
-      @source = (<<SOURCE).strip
-      <h1>test 3</h1>
-      SOURCE
       expect(page.h1).to eq('test 3')
     end
   end
   describe '#h2' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     it 'returns h2 text' do
-      @source = (<<SOURCE).strip
-      <h2> test 4 </h2>
-      SOURCE
       expect(page.h2).to eq(' test 4 ')
     end
   end
data/spec/retriever_spec.rb
CHANGED
@@ -11,7 +11,7 @@ describe 'Fetch' do
   end

   let(:nil_response) do
-    r.good_response?(nil,'http://www.yahoo.com')
+    r.good_response?(nil, 'http://www.yahoo.com')
   end

   let(:unsuccessful_resp) do
@@ -20,14 +20,14 @@ describe 'Fetch' do
     resp.stub(:successful?).and_return(false)
     resp.stub(:server_error?).and_return(false)
     resp.stub(:client_error?).and_return(false)
-    r.good_response?(resp,'http://www.yahoo.com')
+    r.good_response?(resp, 'http://www.yahoo.com')
   end

   let(:redir_resp) do
     resp.stub(:response_header).and_return(resp)
     resp.stub(:redirection?).and_return(true)
     resp.stub(:location).and_return('http://www.google.com')
-    r.good_response?(resp,'http://www.yahoo.com')
+    r.good_response?(resp, 'http://www.yahoo.com')
   end

   let(:bad_content_type_resp) do
@@ -35,7 +35,7 @@ describe 'Fetch' do
     resp.stub(:redirection?).and_return(false)
     resp.stub(:successful?).and_return(true)
     resp['CONTENT_TYPE'] = 'image/jpeg'
-    r.good_response?(resp,'http://www.yahoo.com')
+    r.good_response?(resp, 'http://www.yahoo.com')
   end

   let(:success_resp) do
@@ -43,7 +43,7 @@ describe 'Fetch' do
     resp.stub(:redirection?).and_return(false)
     resp.stub(:successful?).and_return(true)
     resp['CONTENT_TYPE'] = 'text/html'
-    r.good_response?(resp,'http://www.yahoo.com')
+    r.good_response?(resp, 'http://www.yahoo.com')
   end

   it 'returns false if the response is empty' do
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 1.3.0
+  version: 1.4.0
 platform: ruby
 authors:
 - Joe Norton
@@ -94,6 +94,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -136,6 +150,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '2.14'
+- !ruby/object:Gem::Dependency
+  name: pry
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: Asynchronous web crawler, scraper and file harvester
 email:
 - joe@softwarebyjoe.com