rubyretriever 1.4.2 → 1.4.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/retriever/fetch.rb +9 -0
- data/lib/retriever/fetchfiles.rb +3 -1
- data/lib/retriever/version.rb +1 -1
- data/readme.md +2 -3
- data/spec/retriever_spec.rb +18 -3
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d87c30761e7ffa8aef615e168e4fe6bb1f671586
|
4
|
+
data.tar.gz: cdcff318868b38610b6233dca9408e2b6ad457a4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e76d0a819e803d6c1a286141ea40519f14f734259a4f8a2b09c5d994a45d9ce795b09ea2c8f3c0138b47347dd986663acae718ce37309c9c2ee564b04f8e483e
|
7
|
+
data.tar.gz: e445e3112eb1f0c90a4ccd06028de7b8a8f412f5afe0f5b74cb63613af5febd3634c9e3c79b62c2ba0d55fdb21a1760d9509929f5a710c90dbe38a0bec41b229
|
data/lib/retriever/fetch.rb
CHANGED
@@ -117,6 +117,15 @@ module Retriever
|
|
117
117
|
true
|
118
118
|
end
|
119
119
|
|
120
|
+
def filter_out_querystrings(path)
|
121
|
+
if path.include?('?')
|
122
|
+
uri = Addressable::URI.parse(path)
|
123
|
+
uri.query_values = {}
|
124
|
+
return uri.to_s.chomp('?')
|
125
|
+
end
|
126
|
+
path
|
127
|
+
end
|
128
|
+
|
120
129
|
private
|
121
130
|
|
122
131
|
def setup_options(options)
|
data/lib/retriever/fetchfiles.rb
CHANGED
@@ -2,6 +2,7 @@ module Retriever
|
|
2
2
|
# receives target url and RR options
|
3
3
|
# returns an array of all unique files (based on given filetype)
|
4
4
|
# found on the target site
|
5
|
+
|
5
6
|
class FetchFiles < Fetch
|
6
7
|
def initialize(url, options)
|
7
8
|
super
|
@@ -17,6 +18,7 @@ module Retriever
|
|
17
18
|
end
|
18
19
|
|
19
20
|
def download_file(path)
|
21
|
+
path = filter_out_querystrings(path)
|
20
22
|
# given valid url, downloads file to current directory in /rr-downloads/
|
21
23
|
arr = path.split('/')
|
22
24
|
shortname = arr.pop
|
@@ -66,4 +68,4 @@ module Retriever
|
|
66
68
|
puts "Downloading files to local directory: '/#{dir_name}/'"
|
67
69
|
end
|
68
70
|
end
|
69
|
-
end
|
71
|
+
end
|
data/lib/retriever/version.rb
CHANGED
data/readme.md
CHANGED
@@ -8,15 +8,14 @@ RubyRetriever is a Web Crawler, Scraper & File Harvester. Available as a command
|
|
8
8
|
|
9
9
|
RubyRetriever (RR) uses asynchronous HTTP requests via [EventMachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony) to crawl webpages *very quickly*. RR also uses a Ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of pages it has already crawled in a memory-efficient manner.
|
10
10
|
|
11
|
+
**v1.4.3 Update (3/24/2016)** - Fixes a problem with file downloads whose URLs contained query strings: the file was being saved with the query string still attached to its name. No more.
|
12
|
+
|
11
13
|
**v1.4.2 Update (3/24/2016)** - Fixes problem with named anchors (divs) being counted as links.
|
12
14
|
|
13
15
|
**v1.4.1 Update (3/24/2016)** - Update gemfile & external dependency versioning
|
14
16
|
|
15
17
|
**v1.4.0 Update (3/24/2016)** - Several bug fixes.
|
16
18
|
|
17
|
-
**v1.3.0 Update (6/22/2014)** - The major change in this update is the new PageIterator class which adds functionality for library/script usage. Now you can run custom blocks against each page during the crawl. This update also includes more tests, and other code improvements to improve modularity and testability.
|
18
|
-
|
19
|
-
**v1.0 Update (6/07/2014)** - Includes major code changes and a lot of bug fixes. It's now much better in dealing with redirects, issues with the host changing, etc. Also added the SEO mode, which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility; thus, this was update 1.0!
|
20
19
|
|
21
20
|
Mission
|
22
21
|
-------
|
data/spec/retriever_spec.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
require 'retriever'
|
2
|
+
require 'retriever/fetchfiles'
|
2
3
|
|
3
4
|
describe 'Fetch' do
|
5
|
+
let(:r) do
|
6
|
+
Retriever::Fetch.new('http://www.yahoo.com', {})
|
7
|
+
end
|
4
8
|
describe '#good_response?' do
|
5
|
-
let(:r) do
|
6
|
-
Retriever::Fetch.new('http://www.yahoo.com', {})
|
7
|
-
end
|
8
9
|
|
9
10
|
let(:resp) do
|
10
11
|
{}
|
@@ -66,4 +67,18 @@ describe 'Fetch' do
|
|
66
67
|
expect(success_resp).to eq(true)
|
67
68
|
end
|
68
69
|
end
|
70
|
+
describe '#filter_out_querystrings' do
|
71
|
+
let(:normal_url) do
|
72
|
+
r.filter_out_querystrings('http://mises.org/test.mp3')
|
73
|
+
end
|
74
|
+
let(:query_string_url) do
|
75
|
+
r.filter_out_querystrings('http://mises.org/system/tdf/Robert%20Nozick%20and%20Murray%20Rothbard%20David%20Gordon.mp3?file=1&type=audio')
|
76
|
+
end
|
77
|
+
it 'accepts standard urls' do
|
78
|
+
expect(normal_url).to eq('http://mises.org/test.mp3')
|
79
|
+
end
|
80
|
+
it 'strips query params' do
|
81
|
+
expect(query_string_url).to eq('http://mises.org/system/tdf/Robert%20Nozick%20and%20Murray%20Rothbard%20David%20Gordon.mp3')
|
82
|
+
end
|
83
|
+
end
|
69
84
|
end
|