rubyretriever 1.3.0 → 1.4.0
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/lib/retriever/fetch.rb +2 -2
- data/lib/retriever/fetchfiles.rb +1 -1
- data/lib/retriever/fetchseo.rb +1 -1
- data/lib/retriever/fetchsitemap.rb +1 -1
- data/lib/retriever/link.rb +14 -7
- data/lib/retriever/page.rb +11 -3
- data/lib/retriever/page_iterator.rb +1 -1
- data/lib/retriever/version.rb +1 -1
- data/readme.md +30 -10
- data/spec/link_spec.rb +12 -3
- data/spec/page_spec.rb +34 -47
- data/spec/retriever_spec.rb +5 -5
- metadata +29 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9a282bd399fdff64f26493e2b76a44df2ece00e4
+  data.tar.gz: 57b52b0b8b56116ae924fe7a03c912526c3342b2
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 92b91cc247b847a845b48d33e7fdbbc828e2e3372663d06abcdbd0dc5a524357319dfddbc28f879419670aea0f0a5ab88934ae515b4d31087fe67d534bf95e26
+  data.tar.gz: 3bdb6d28f487209b36426e6b00a7c929d049e539d3cda583cbd13e73902d3230f0a3a2d081af4ab5939110b1e2685b4202e67a1b11bb256d4c924e46cdeefb57
data/LICENSE
CHANGED
data/lib/retriever/fetch.rb
CHANGED
@@ -84,7 +84,7 @@ module Retriever
       puts
     end
 
-
+    # returns true is resp is ok to continue
     def good_response?(resp, url)
      return false unless resp
      hdr = resp.response_header
@@ -128,7 +128,7 @@ module Retriever
      @sitemap = options['sitemap']
      @seo = options['seo']
      @autodown = options['autodown']
-      @file_re = Regexp.new(/.#{@fileharvest}
+      @file_re = Regexp.new(/.#{@fileharvest}/).freeze if @fileharvest
    end
 
    def setup_bloom_filter
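The rebuilt @file_re above interpolates the requested file extension into a regexp. A rough standalone sketch of how that pattern behaves, with the instance variable replaced by a local and the sample URLs invented for illustration:

    # Assuming the file-harvest extension is 'exe', the interpolated pattern
    # matches any string containing some character followed by that extension.
    fileharvest = 'exe'
    file_re = Regexp.new(/.#{fileharvest}/).freeze
    file_re =~ 'http://www.cnet.com/download.exe' # => 28 (match found)
    file_re =~ 'http://www.cnet.com/index.html'   # => nil (no match)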
data/lib/retriever/fetchfiles.rb
CHANGED
data/lib/retriever/fetchseo.rb
CHANGED
data/lib/retriever/link.rb
CHANGED
@@ -7,16 +7,18 @@ module Retriever
     DOUBLE_SLASH_RE = Regexp.new(%r(^/{2}[^/])).freeze
     WWW_DOT_RE = Regexp.new(/^www\./i).freeze
 
-    def initialize(target_scheme, target_host, this_link)
+    def initialize(target_scheme, target_host, this_link, current_url)
       begin
-
-
-
-
+        #this_link = Addressable::URI.encode(this_link) //not necessary; and breaking links
+        @link_uri = Addressable::URI.parse(this_link)
+      rescue Addressable::URI::InvalidURIError
+        dummy = Retriever::Link.new(target_scheme, target_host, target_host, target_host)
+        @link_uri = Addressable::URI.parse(dummy.path)
       end
       @scheme = target_scheme
       @host = target_host
       @this_link = @link_uri.to_s
+      @current_page_url = current_url
     end
 
     def path
@@ -30,11 +32,16 @@ module Retriever
       return "#{@scheme}:#{this_link}" if DOUBLE_SLASH_RE =~ this_link
 
       # link uses relative path with no slashes at all
-
+      if link_uri.relative?
+        if @current_page_url[-1, 1] == "/"
+          return "#{@current_page_url}#{this_link}"
+        end
+        return "#{@current_page_url}/#{this_link}"
+      end
     end
 
     private
 
-    attr_reader :this_link, :host, :link_uri
+    attr_reader :this_link, :host, :link_uri, :current_page_url
   end
 end
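The new current_url argument above lets a relative href be resolved against the page it was found on. A minimal sketch of that behavior, assuming the four-argument constructor shown in this hunk (scheme, host, raw href, current page URL); the sample values are illustrative, and the expected output mirrors the updated expectation in link_spec.rb further down:

    require 'retriever'

    # Hypothetical call against the constructor shown above.
    link = Retriever::Link.new('http', 'www.cnet.com', 'cpage_18',
                               'http://www.cnet.com/reviews/')
    # With a relative href and a current URL ending in '/', the relative-path
    # branch should return 'http://www.cnet.com/reviews/cpage_18'.
    puts link.path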
data/lib/retriever/page.rb
CHANGED
@@ -1,3 +1,4 @@
+require 'nokogiri'
 require 'addressable/uri'
 #
 using SourceString
@@ -40,7 +41,7 @@ module Retriever
       @links = nil
     end
 
-    #
+    # receives page source as string
     # returns array of unique href links
     def links
       return @links if @links
@@ -49,12 +50,14 @@ module Retriever
         # filter some malformed URLS that come in
         # meant to be a loose filter to catch all reasonable HREF attributes.
         link = match[0]
-        Link.new(@t.scheme, @t.host, link).path
+        Link.new(@t.scheme, @t.host, link, @url).path
       end.compact.uniq
     end
 
     def parse_internal
-      links.select
+      links.select do |x|
+        @t.host == Addressable::URI.parse(Addressable::URI.encode(x)).host
+      end
     end
 
     def parse_internal_visitable
@@ -65,6 +68,11 @@ module Retriever
       arr.select { |x| @t.file_re =~ x }
     end
 
+    def parse_by_css(selector)
+      nokogiri_doc = Nokogiri::HTML(@source)
+      nokogiri_doc.css(selector).text
+    end
+
     def title
       TITLE_RE =~ @source ? @source.match(TITLE_RE)[1].decode_html : ''
     end
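The new parse_by_css helper simply delegates to Nokogiri. A standalone illustration of the underlying calls (the markup here is made up, not taken from the gem):

    require 'nokogiri'

    # Nokogiri parses the HTML string; css(selector).text returns the
    # concatenated text of every node matching the selector.
    doc = Nokogiri::HTML("<div><h1 id='headline'>Hello</h1></div>")
    puts doc.css('#headline').text # => "Hello"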
data/lib/retriever/page_iterator.rb
CHANGED
@@ -1,7 +1,7 @@
 module Retriever
   #
   class PageIterator < Fetch
-    #
+    # receives target url and RR options, and a block
     # runs the block on all pages during crawl, pushing
     # the returned value of the block onto a result stack
     # the complete data returned from the crawl is accessible thru self.result
data/lib/retriever/version.rb
CHANGED
data/readme.md
CHANGED
@@ -4,33 +4,53 @@
 
 By Joe Norton
 
-RubyRetriever is a Web Crawler,
+RubyRetriever is a Web Crawler, Scraper & File Harvester. Available as a command-line executable and as a crawling framework.
 
-RubyRetriever (RR) uses asynchronous HTTP requests via [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony) to crawl webpages *very quickly*. RR also uses a Ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of pages it has already crawled.
+RubyRetriever (RR) uses asynchronous HTTP requests via [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony) to crawl webpages *very quickly*. RR also uses a Ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of pages it has already crawled in a memory efficient manner.
+
+**v1.3.1 Update (3/24/2016)** - Several bug fixes.
+
+**v1.3.0 Update (6/22/2014)** - The major change in this update is the new PageIterator class which adds functionality for library/script usage. Now you can run custom blocks against each page during the crawl. This update also includes more tests, and other code improvements to improve modularity and testability.
 
 **v1.0 Update (6/07/2014)** - Includes major code changes and a lot of bug fixes. It's now much better in dealing with redirects, issues with the host changing, etc. Also added the SEO mode, which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility; thus, this was update 1.0!
 
 Mission
 -------
-RubyRetriever aims to be the best command-line crawling and scraping package written in Ruby.
+RubyRetriever aims to be the best command-line crawling and scraping package written in Ruby and a replacement for paid software such as Screaming Frog SEO Spider.
+
+
+Roadmap?
+Not sure. Feel free to offer your thoughts.
+
+Some Potential Ideas:
+* 'freeroam mode' - to go on cruising the net endlessly in fileharvest mode
+* 'dead-link finder' mode - collects links returning 404, or other error msgs
+* 'validate robots.txt' mode - outputs the bot-exposed sitemap of your site
+* more sophisticated SEO analysis? replace screaming frog? this would include checks for canonical URL, maybe some keyword density checks, content length checks, etc.
 
 Features
 --------
 * Asynchronous HTTP Requests thru EM & Synchrony
-* Bloom filter for tracking visited pages
-*
-
-
-*
+* Bloom filter for tracking visited pages
+* Supports HTTPS
+* Follows 301 redirects (if to same host)
+* 3 CLI modes
+  * Sitemap - Find all links on a website, output a valid XML sitemap, or just a CSV
+  * File Harvest - find all files linked to on a website, option to autodownload
+  * SEO - collect important SEO info from every page, output to a CSV (or STDOUT)
+* Run a Custom Block on a Per-Page basis (PageIterator)
 
 Use cases
 ---------
-
+**As an Executable**
 With a single command at the terminal, RR can:
 1. Crawl your website and output a *valid XML sitemap* based on what it found.
 2. Crawl a target website and *download all files of a given filetype*.
 3. Crawl a target website, *collect important SEO information* such as page titles, meta descriptions and h1 tags, and write it to CSV.
 
+**Used in Custom scripts**
+As of version 1.3.0, with the PageIterator class you can pass a custom block that will get run against each page during a crawl, and collect the results in an array. This means you can define for yourself whatever it is you want to collect from each page during the crawl.
+
 Help & Forks Welcome!
 
 Getting started
@@ -94,7 +114,7 @@ and OPTIONS is the applicable:
   -h, --help            *Display this screen*
 
 
-Using as a Library (starting as of version 1.3.0
+Using as a Library (starting as of version 1.3.0)
 ------------------
 
 If you want to collect something, other than that which the executable allows, on a 'per page' basis then you want to use the PageIterator class. Then you can run whatever block you want against each individual page's source code located during the crawl.
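Per the library-usage section above, a crawl with a per-page block could look roughly like this; the option hash and block body are illustrative assumptions rather than text copied from the gem's docs:

    require 'retriever'

    # The block runs against every page found during the crawl; whatever it
    # returns is pushed onto the result stack, readable via #result afterwards.
    rr = Retriever::PageIterator.new('http://www.cnet.com', 'maxpages' => 100) do |page|
      [page.url, page.title]
    end
    rr.result.each { |url, title| puts "#{title} -- #{url}" }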
data/spec/link_spec.rb
CHANGED
@@ -3,7 +3,9 @@ require 'retriever'
 describe 'Link' do
 
   t = Retriever::Target.new('http://www.cnet.com/reviews/')
-  let(:links)
+  let(:links) do
+    Retriever::Page.new('http://www.cnet.com/reviews/', @source, t).links
+  end
 
   it 'collects links in anchor tags' do
     @source = (<<SOURCE).strip
@@ -46,7 +48,7 @@ SOURCE
     expect(links).to include('http://www.cnet.com/download.exe')
   end
 
-  it "doesn't care about any extra attributes on the anchor tag" do
+  it "doesn\'t care about any extra attributes on the anchor tag" do
     @source = (<<SOURCE).strip
 <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
 <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
@@ -63,6 +65,13 @@ SOURCE
 SOURCE
 
     expect(links).to include('http://www.cnet.com/test.html',
-                             'http://www.cnet.com/cpage_18')
+                             'http://www.cnet.com/reviews/cpage_18')
+  end
+  it 'collects files even when query strings exist' do
+    @source = (<<SOURCE).strip
+<a href='http://mises.org/system/tdf/Robert%20Nozick%20and%20Murray%20Rothbard%20David%20Gordon.mp3?file=1&type=audio' type='audio/mpeg; length=22217599' title='Robert Nozick and Murray Rothbard David Gordon.mp3'>Download audio file</a></span></div>
+SOURCE
+
+    expect(links).to include('http://mises.org/system/tdf/Robert%20Nozick%20and%20Murray%20Rothbard%20David%20Gordon.mp3?file=1&type=audio')
   end
 end
data/spec/page_spec.rb
CHANGED
@@ -4,102 +4,89 @@ require 'retriever/fetch'
 t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)
 
 describe 'Page' do
+  let(:common_source) do
+    <<-SOURCE
+    <title>test</title>
+    <a href='www.cnet.com/download.exe'>download</a>
+    <a href='/test.html'>test</a>
+    <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
+    </a>
+    <a href='http://www.cnet.com/products/gadgets/' id='gadgets-link'>gadgets </a>
+    <a href='http://www.yahoo.com/test/'>yahoo</a>"
+    <meta name='description' content="test2 ">
+    <h1>test 3</h1>
+    <h2> test 4 </h2>
+    SOURCE
+  end
+
   describe '#url' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     it 'returns current page URL' do
-      @source = (<<SOURCE).strip
-<a href='http://www.cnet.com/'>download</a>
-SOURCE
       expect(page.url).to eq('http://www.cnet.com/')
     end
   end
 
   describe '#links' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     it 'collects all unique href links on the page' do
-      @source = (<<SOURCE).strip
-<a href='www.cnet.com/download.exe'>download</a>
-<a href='/test.html'>test</a>
-<a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
-</a>
-<a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
-<a href='http://www.yahoo.com/test/'>yahoo</a>
-SOURCE
-
       expect(page.links.size).to eq(4)
     end
   end
 
   describe '#parse_internal' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     let(:links) { page.parse_internal }
     it 'filters links by host' do
-
-<a href='http://www.cnet.com/'>download</a>
-<a href='http://www.yahoo.com/test/'>yahoo</a>
-SOURCE
-
-      expect(links.size).to eq(1)
+      expect(links.size).to eq(3)
     end
   end
 
   describe '#parse_internal_visitable' do
-    let(:
+    let(:source) { "<link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />" }
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', source, t) }
     let(:links) { page.parse_internal_visitable }
     it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
-      @source = (<<SOURCE).strip
-<link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
-SOURCE
       expect(links.size).to eq(0)
     end
   end
 
   describe '#parse_files' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     let(:files) { page.parse_files(page.parse_internal) }
     it 'filters links by filetype' do
-      @source = (<<SOURCE).strip
-<a href='www.cnet.com/download.exe'>download</a>
-http://www.google.com
-<a href='/test.html'>test</a>
-SOURCE
       expect(files.size).to eq(1)
     end
   end
 
+  describe '#parse_by_css' do
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
+
+    it 'returns the text from the received css selector' do
+      expect(page.parse_by_css('#gadgets-link')).to eq('gadgets ')
+    end
+  end
+
   describe '#title' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     it 'returns page title' do
-      @source = (<<SOURCE).strip
-<title>test</title>
-SOURCE
       expect(page.title).to eq('test')
     end
   end
   describe '#desc' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     it 'returns meta description' do
-      @source = (<<SOURCE).strip
-<meta name='description' content="test2 ">
-SOURCE
       expect(page.desc).to eq('test2 ')
     end
   end
   describe '#h1' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
    it 'returns h1 text' do
-      @source = (<<SOURCE).strip
-<h1>test 3</h1>
-SOURCE
       expect(page.h1).to eq('test 3')
     end
   end
   describe '#h2' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     it 'returns h2 text' do
-      @source = (<<SOURCE).strip
-<h2> test 4 </h2>
-SOURCE
       expect(page.h2).to eq(' test 4 ')
     end
   end
data/spec/retriever_spec.rb
CHANGED
@@ -11,7 +11,7 @@ describe 'Fetch' do
   end
 
   let(:nil_response) do
-    r.good_response?(nil,'http://www.yahoo.com')
+    r.good_response?(nil, 'http://www.yahoo.com')
   end
 
   let(:unsuccessful_resp) do
@@ -20,14 +20,14 @@ describe 'Fetch' do
     resp.stub(:successful?).and_return(false)
     resp.stub(:server_error?).and_return(false)
     resp.stub(:client_error?).and_return(false)
-    r.good_response?(resp,'http://www.yahoo.com')
+    r.good_response?(resp, 'http://www.yahoo.com')
   end
 
   let(:redir_resp) do
     resp.stub(:response_header).and_return(resp)
     resp.stub(:redirection?).and_return(true)
     resp.stub(:location).and_return('http://www.google.com')
-    r.good_response?(resp,'http://www.yahoo.com')
+    r.good_response?(resp, 'http://www.yahoo.com')
   end
 
   let(:bad_content_type_resp) do
@@ -35,7 +35,7 @@ describe 'Fetch' do
     resp.stub(:redirection?).and_return(false)
     resp.stub(:successful?).and_return(true)
     resp['CONTENT_TYPE'] = 'image/jpeg'
-    r.good_response?(resp,'http://www.yahoo.com')
+    r.good_response?(resp, 'http://www.yahoo.com')
   end
 
   let(:success_resp) do
@@ -43,7 +43,7 @@ describe 'Fetch' do
     resp.stub(:redirection?).and_return(false)
     resp.stub(:successful?).and_return(true)
     resp['CONTENT_TYPE'] = 'text/html'
-    r.good_response?(resp,'http://www.yahoo.com')
+    r.good_response?(resp, 'http://www.yahoo.com')
   end
 
   it 'returns false if the response is empty' do
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 1.3.0
+  version: 1.4.0
 platform: ruby
 authors:
 - Joe Norton
@@ -94,6 +94,20 @@ dependencies:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -136,6 +150,20 @@ dependencies:
   - - "~>"
     - !ruby/object:Gem::Version
       version: '2.14'
+- !ruby/object:Gem::Dependency
+  name: pry
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: Asynchronous web crawler, scraper and file harvester
 email:
 - joe@softwarebyjoe.com