RubyGems - rubyretriever - Versions diffs - 1.4.2 → 1.4.3 - Mend

rubyretriever 1.4.2 → 1.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: dca26f55e68ab2c5095d9e633592735dc67323e9
-  data.tar.gz: 4e96d543d7ea4001ef4531db3beb5bf3e8accf4d
+  metadata.gz: d87c30761e7ffa8aef615e168e4fe6bb1f671586
+  data.tar.gz: cdcff318868b38610b6233dca9408e2b6ad457a4
 SHA512:
-  metadata.gz: f08c1d9c7c2395cfc297f1c0abb80760d08d35d24394d381b703d982709757ffc9072e55a60e15b123e45ae6731a388cb2afa914f2a136ab3fce454af11b7b8c
-  data.tar.gz: 0f75a30cb8a5f969d7bc196c6f83e0f25ddd0f76ca2a931aefb6206180172dbb6c4dbb5d4b926d69e75555dd0eec3a8b1f8a528cf18799cca31af9a753e89fc5
+  metadata.gz: e76d0a819e803d6c1a286141ea40519f14f734259a4f8a2b09c5d994a45d9ce795b09ea2c8f3c0138b47347dd986663acae718ce37309c9c2ee564b04f8e483e
+  data.tar.gz: e445e3112eb1f0c90a4ccd06028de7b8a8f412f5afe0f5b74cb63613af5febd3634c9e3c79b62c2ba0d55fdb21a1760d9509929f5a710c90dbe38a0bec41b229

data/lib/retriever/fetch.rb CHANGED Viewed

@@ -117,6 +117,15 @@ module Retriever
       true
     end
+    def filter_out_querystrings(path)
+      if path.include?('?')
+        uri = Addressable::URI.parse(path)
+        uri.query_values = {}
+        return uri.to_s.chomp('?')
+      end
+      path
+    end
     private
     def setup_options(options)

data/lib/retriever/fetchfiles.rb CHANGED Viewed

@@ -2,6 +2,7 @@ module Retriever
   # receives target url and RR options
   # returns an array of all unique files (based on given filetype)
   #   found on the target site
   class FetchFiles < Fetch
     def initialize(url, options)
       super
@@ -17,6 +18,7 @@ module Retriever
     end
     def download_file(path)
+      path = filter_out_querystrings(path)
       # given valid url, downloads file to current directory in /rr-downloads/
       arr = path.split('/')
       shortname = arr.pop
@@ -66,4 +68,4 @@ module Retriever
       puts "Downloading files to local directory: '/#{dir_name}/'"
     end
   end
-end
+end

data/lib/retriever/version.rb CHANGED Viewed

@@ -1,4 +1,4 @@
 #
 module Retriever
-  VERSION = '1.4.2'
+  VERSION = '1.4.3'
 end

data/readme.md CHANGED Viewed

@@ -8,15 +8,14 @@ RubyRetriever is a Web Crawler, Scraper & File Harvester. Available as a command
 RubyRetriever (RR) uses asynchronous HTTP requests via [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony) to crawl webpages *very quickly*. RR also uses a Ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of pages it has already crawled in a memory efficient manner.
+**v1.4.3 Update (3/24/2016)** - Fixes a problem with file downloads whose URLs had query strings: the filename was being saved with the query string still attached. No more.
 **v1.4.2 Update (3/24/2016)** - Fixes problem with named anchors (divs) being counted as links.
 **v1.4.1 Update (3/24/2016)** - Update gemfile & external dependency versioning
 **v1.4.0 Update (3/24/2016)** - Several bug fixes.
-**v1.3.0 Update (6/22/2014)** - The major change in this update is the new PageIterator class which adds functionality for library/script usage. Now you can run custom blocks against each page during the crawl. This update also includes more tests, and other code improvements to improve modularity and testability.
-**v1.0 Update (6/07/2014)** - Includes major code changes and a lot of bug fixes. It's now much better in dealing with redirects, issues with the host changing, etc. Also added the SEO mode, which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility; thus, this was update 1.0!
 Mission
 -------

data/spec/retriever_spec.rb CHANGED Viewed

@@ -1,10 +1,11 @@
 require 'retriever'
+require 'retriever/fetchfiles'
 describe 'Fetch' do
+  let(:r) do
+    Retriever::Fetch.new('http://www.yahoo.com', {})
+  end
   describe '#good_response?' do
-    let(:r) do
-      Retriever::Fetch.new('http://www.yahoo.com', {})
-    end
     let(:resp) do
       {}
@@ -66,4 +67,18 @@ describe 'Fetch' do
       expect(success_resp).to eq(true)
     end
   end
+  describe '#filter_out_querystrings' do
+    let(:normal_url) do
+      r.filter_out_querystrings('http://mises.org/test.mp3')
+    end
+    let(:query_string_url) do
+      r.filter_out_querystrings('http://mises.org/system/tdf/Robert%20Nozick%20and%20Murray%20Rothbard%20David%20Gordon.mp3?file=1&amp;type=audio')
+    end
+    it 'accepts standard urls' do
+      expect(normal_url).to eq('http://mises.org/test.mp3')
+    end
+    it 'strips query params' do
+      expect(query_string_url).to eq('http://mises.org/system/tdf/Robert%20Nozick%20and%20Murray%20Rothbard%20David%20Gordon.mp3')
+    end
+  end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 1.4.2
+  version: 1.4.3
 platform: ruby
 authors:
 - Joe Norton