rubyretriever 1.4.2 → 1.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: dca26f55e68ab2c5095d9e633592735dc67323e9
4
- data.tar.gz: 4e96d543d7ea4001ef4531db3beb5bf3e8accf4d
3
+ metadata.gz: d87c30761e7ffa8aef615e168e4fe6bb1f671586
4
+ data.tar.gz: cdcff318868b38610b6233dca9408e2b6ad457a4
5
5
  SHA512:
6
- metadata.gz: f08c1d9c7c2395cfc297f1c0abb80760d08d35d24394d381b703d982709757ffc9072e55a60e15b123e45ae6731a388cb2afa914f2a136ab3fce454af11b7b8c
7
- data.tar.gz: 0f75a30cb8a5f969d7bc196c6f83e0f25ddd0f76ca2a931aefb6206180172dbb6c4dbb5d4b926d69e75555dd0eec3a8b1f8a528cf18799cca31af9a753e89fc5
6
+ metadata.gz: e76d0a819e803d6c1a286141ea40519f14f734259a4f8a2b09c5d994a45d9ce795b09ea2c8f3c0138b47347dd986663acae718ce37309c9c2ee564b04f8e483e
7
+ data.tar.gz: e445e3112eb1f0c90a4ccd06028de7b8a8f412f5afe0f5b74cb63613af5febd3634c9e3c79b62c2ba0d55fdb21a1760d9509929f5a710c90dbe38a0bec41b229
@@ -117,6 +117,15 @@ module Retriever
117
117
  true
118
118
  end
119
119
 
120
+ def filter_out_querystrings(path)
121
+ if path.include?('?')
122
+ uri = Addressable::URI.parse(path)
123
+ uri.query_values = {}
124
+ return uri.to_s.chomp('?')
125
+ end
126
+ path
127
+ end
128
+
120
129
  private
121
130
 
122
131
  def setup_options(options)
@@ -2,6 +2,7 @@ module Retriever
2
2
  # receives target url and RR options
3
3
  # returns an array of all unique files (based on given filetype)
4
4
  # found on the target site
5
+
5
6
  class FetchFiles < Fetch
6
7
  def initialize(url, options)
7
8
  super
@@ -17,6 +18,7 @@ module Retriever
17
18
  end
18
19
 
19
20
  def download_file(path)
21
+ path = filter_out_querystrings(path)
20
22
  # given valid url, downloads file to current directory in /rr-downloads/
21
23
  arr = path.split('/')
22
24
  shortname = arr.pop
@@ -66,4 +68,4 @@ module Retriever
66
68
  puts "Downloading files to local directory: '/#{dir_name}/'"
67
69
  end
68
70
  end
69
- end
71
+ end
@@ -1,4 +1,4 @@
1
1
  #
2
2
  module Retriever
3
- VERSION = '1.4.2'
3
+ VERSION = '1.4.3'
4
4
  end
data/readme.md CHANGED
@@ -8,15 +8,14 @@ RubyRetriever is a Web Crawler, Scraper & File Harvester. Available as a command
8
8
 
9
9
  RubyRetriever (RR) uses asynchronous HTTP requests via [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony) to crawl webpages *very quickly*. RR also uses a Ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of pages it has already crawled in a memory efficient manner.
10
10
 
11
+ **v1.4.3 Update (3/24/2016)** - Fixes a problem with file downloads whose URLs had query strings: the filename was being saved with the query string still attached. No more.
12
+
11
13
  **v1.4.2 Update (3/24/2016)** - Fixes problem with named anchors (divs) being counted as links.
12
14
 
13
15
  **v1.4.1 Update (3/24/2016)** - Update gemfile & external dependency versioning
14
16
 
15
17
  **v1.4.0 Update (3/24/2016)** - Several bug fixes.
16
18
 
17
- **v1.3.0 Update (6/22/2014)** - The major change in this update is the new PageIterator class which adds functionality for library/script usage. Now you can run custom blocks against each page during the crawl. This update also includes more tests, and other code improvements to improve modularity and testability.
18
-
19
- **v1.0 Update (6/07/2014)** - Includes major code changes and a lot of bug fixes. It's now much better in dealing with redirects, issues with the host changing, etc. Also added the SEO mode, which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility; thus, this was update 1.0!
20
19
 
21
20
  Mission
22
21
  -------
@@ -1,10 +1,11 @@
1
1
  require 'retriever'
2
+ require 'retriever/fetchfiles'
2
3
 
3
4
  describe 'Fetch' do
5
+ let(:r) do
6
+ Retriever::Fetch.new('http://www.yahoo.com', {})
7
+ end
4
8
  describe '#good_response?' do
5
- let(:r) do
6
- Retriever::Fetch.new('http://www.yahoo.com', {})
7
- end
8
9
 
9
10
  let(:resp) do
10
11
  {}
@@ -66,4 +67,18 @@ describe 'Fetch' do
66
67
  expect(success_resp).to eq(true)
67
68
  end
68
69
  end
70
+ describe '#filter_out_querystrings' do
71
+ let(:normal_url) do
72
+ r.filter_out_querystrings('http://mises.org/test.mp3')
73
+ end
74
+ let(:query_string_url) do
75
+ r.filter_out_querystrings('http://mises.org/system/tdf/Robert%20Nozick%20and%20Murray%20Rothbard%20David%20Gordon.mp3?file=1&amp;type=audio')
76
+ end
77
+ it 'accepts standard urls' do
78
+ expect(normal_url).to eq('http://mises.org/test.mp3')
79
+ end
80
+ it 'strips query params' do
81
+ expect(query_string_url).to eq('http://mises.org/system/tdf/Robert%20Nozick%20and%20Murray%20Rothbard%20David%20Gordon.mp3')
82
+ end
83
+ end
69
84
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rubyretriever
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.2
4
+ version: 1.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Joe Norton