rubyretriever 1.3.0 → 1.4.0
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/lib/retriever/fetch.rb +2 -2
- data/lib/retriever/fetchfiles.rb +1 -1
- data/lib/retriever/fetchseo.rb +1 -1
- data/lib/retriever/fetchsitemap.rb +1 -1
- data/lib/retriever/link.rb +14 -7
- data/lib/retriever/page.rb +11 -3
- data/lib/retriever/page_iterator.rb +1 -1
- data/lib/retriever/version.rb +1 -1
- data/readme.md +30 -10
- data/spec/link_spec.rb +12 -3
- data/spec/page_spec.rb +34 -47
- data/spec/retriever_spec.rb +5 -5
- metadata +29 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9a282bd399fdff64f26493e2b76a44df2ece00e4
+  data.tar.gz: 57b52b0b8b56116ae924fe7a03c912526c3342b2
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 92b91cc247b847a845b48d33e7fdbbc828e2e3372663d06abcdbd0dc5a524357319dfddbc28f879419670aea0f0a5ab88934ae515b4d31087fe67d534bf95e26
+  data.tar.gz: 3bdb6d28f487209b36426e6b00a7c929d049e539d3cda583cbd13e73902d3230f0a3a2d081af4ab5939110b1e2685b4202e67a1b11bb256d4c924e46cdeefb57
data/LICENSE
CHANGED
data/lib/retriever/fetch.rb
CHANGED
@@ -84,7 +84,7 @@ module Retriever
       puts
     end
 
-
+    # returns true is resp is ok to continue
     def good_response?(resp, url)
      return false unless resp
      hdr = resp.response_header
@@ -128,7 +128,7 @@ module Retriever
      @sitemap = options['sitemap']
      @seo = options['seo']
      @autodown = options['autodown']
-      @file_re = Regexp.new(/.#{@fileharvest}
+      @file_re = Regexp.new(/.#{@fileharvest}/).freeze if @fileharvest
    end
 
    def setup_bloom_filter
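The rebuilt @file_re above interpolates the requested file extension into a regexp. A rough standalone sketch of how that pattern behaves, with the instance variable replaced by a local and the sample URLs invented for illustration:

    # Assuming the file-harvest extension is 'exe', the interpolated pattern
    # matches any string containing some character followed by that extension.
    fileharvest = 'exe'
    file_re = Regexp.new(/.#{fileharvest}/).freeze
    file_re =~ 'http://www.cnet.com/download.exe' # => 28 (match found)
    file_re =~ 'http://www.cnet.com/index.html'   # => nil (no match)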
data/lib/retriever/fetchfiles.rb
CHANGED
data/lib/retriever/fetchseo.rb
CHANGED
data/lib/retriever/link.rb
CHANGED
@@ -7,16 +7,18 @@ module Retriever
     DOUBLE_SLASH_RE = Regexp.new(%r(^/{2}[^/])).freeze
     WWW_DOT_RE = Regexp.new(/^www\./i).freeze
 
-    def initialize(target_scheme, target_host, this_link)
+    def initialize(target_scheme, target_host, this_link, current_url)
       begin
-
-
-
-
+        #this_link = Addressable::URI.encode(this_link) //not necessary; and breaking links
+        @link_uri = Addressable::URI.parse(this_link)
+      rescue Addressable::URI::InvalidURIError
+        dummy = Retriever::Link.new(target_scheme, target_host, target_host, target_host)
+        @link_uri = Addressable::URI.parse(dummy.path)
       end
       @scheme = target_scheme
       @host = target_host
       @this_link = @link_uri.to_s
+      @current_page_url = current_url
     end
 
     def path
@@ -30,11 +32,16 @@ module Retriever
       return "#{@scheme}:#{this_link}" if DOUBLE_SLASH_RE =~ this_link
 
       # link uses relative path with no slashes at all
-
+      if link_uri.relative?
+        if @current_page_url[-1, 1] == "/"
+          return "#{@current_page_url}#{this_link}"
+        end
+        return "#{@current_page_url}/#{this_link}"
+      end
     end
 
     private
 
-    attr_reader :this_link, :host, :link_uri
+    attr_reader :this_link, :host, :link_uri, :current_page_url
   end
 end
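The new current_url argument above lets a relative href be resolved against the page it was found on. A minimal sketch of that behavior, assuming the four-argument constructor shown in this hunk (scheme, host, raw href, current page URL); the sample values are illustrative, and the expected output mirrors the updated expectation in link_spec.rb further down:

    require 'retriever'

    # Hypothetical call against the constructor shown above.
    link = Retriever::Link.new('http', 'www.cnet.com', 'cpage_18',
                               'http://www.cnet.com/reviews/')
    # With a relative href and a current URL ending in '/', the relative-path
    # branch should return 'http://www.cnet.com/reviews/cpage_18'.
    puts link.path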
data/lib/retriever/page.rb
CHANGED
@@ -1,3 +1,4 @@
+require 'nokogiri'
 require 'addressable/uri'
 #
 using SourceString
@@ -40,7 +41,7 @@ module Retriever
       @links = nil
     end
 
-    #
+    # receives page source as string
     # returns array of unique href links
     def links
       return @links if @links
@@ -49,12 +50,14 @@ module Retriever
         # filter some malformed URLS that come in
         # meant to be a loose filter to catch all reasonable HREF attributes.
         link = match[0]
-        Link.new(@t.scheme, @t.host, link).path
+        Link.new(@t.scheme, @t.host, link, @url).path
       end.compact.uniq
     end
 
     def parse_internal
-      links.select
+      links.select do |x|
+        @t.host == Addressable::URI.parse(Addressable::URI.encode(x)).host
+      end
     end
 
     def parse_internal_visitable
@@ -65,6 +68,11 @@ module Retriever
       arr.select { |x| @t.file_re =~ x }
     end
 
+    def parse_by_css(selector)
+      nokogiri_doc = Nokogiri::HTML(@source)
+      nokogiri_doc.css(selector).text
+    end
+
     def title
       TITLE_RE =~ @source ? @source.match(TITLE_RE)[1].decode_html : ''
     end
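The new parse_by_css helper simply delegates to Nokogiri. A standalone illustration of the underlying calls (the markup here is made up, not taken from the gem):

    require 'nokogiri'

    # Nokogiri parses the HTML string; css(selector).text returns the
    # concatenated text of every node matching the selector.
    doc = Nokogiri::HTML("<div><h1 id='headline'>Hello</h1></div>")
    puts doc.css('#headline').text # => "Hello"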
data/lib/retriever/page_iterator.rb
CHANGED
@@ -1,7 +1,7 @@
 module Retriever
   #
   class PageIterator < Fetch
-    #
+    # receives target url and RR options, and a block
     # runs the block on all pages during crawl, pushing
     # the returned value of the block onto a result stack
     # the complete data returned from the crawl is accessible thru self.result
data/lib/retriever/version.rb
CHANGED
data/readme.md
CHANGED
@@ -4,33 +4,53 @@
 
 By Joe Norton
 
-RubyRetriever is a Web Crawler,
+RubyRetriever is a Web Crawler, Scraper & File Harvester. Available as a command-line executable and as a crawling framework.
 
-RubyRetriever (RR) uses asynchronous HTTP requests via [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony) to crawl webpages *very quickly*. RR also uses a Ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of pages it has already crawled.
+RubyRetriever (RR) uses asynchronous HTTP requests via [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony) to crawl webpages *very quickly*. RR also uses a Ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of pages it has already crawled in a memory efficient manner.
+
+**v1.3.1 Update (3/24/2016)** - Several bug fixes.
+
+**v1.3.0 Update (6/22/2014)** - The major change in this update is the new PageIterator class which adds functionality for library/script usage. Now you can run custom blocks against each page during the crawl. This update also includes more tests, and other code improvements to improve modularity and testability.
 
 **v1.0 Update (6/07/2014)** - Includes major code changes and a lot of bug fixes. It's now much better in dealing with redirects, issues with the host changing, etc. Also added the SEO mode, which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility; thus, this was update 1.0!
 
 Mission
 -------
-RubyRetriever aims to be the best command-line crawling and scraping package written in Ruby.
+RubyRetriever aims to be the best command-line crawling and scraping package written in Ruby and a replacement for paid software such as Screaming Frog SEO Spider.
+
+
+Roadmap?
+Not sure. Feel free to offer your thoughts.
+
+Some Potential Ideas:
+* 'freeroam mode' - to go on cruising the net endlessly in fileharvest mode
+* 'dead-link finder' mode - collects links returning 404, or other error msgs
+* 'validate robots.txt' mode - outputs the bot-exposed sitemap of your site
+* more sophisticated SEO analysis? replace screaming frog? this would include checks for canonical URL, maybe some keyword density checks, content length checks, etc.
 
 Features
 --------
 * Asynchronous HTTP Requests thru EM & Synchrony
-* Bloom filter for tracking visited pages
-*
-
-
-*
+* Bloom filter for tracking visited pages
+* Supports HTTPS
+* Follows 301 redirects (if to same host)
+* 3 CLI modes
+  * Sitemap - Find all links on a website, output a valid XML sitemap, or just a CSV
+  * File Harvest - find all files linked to on a website, option to autodownload
+  * SEO - collect important SEO info from every page, output to a CSV (or STDOUT)
+* Run a Custom Block on a Per-Page basis (PageIterator)
 
 Use cases
 ---------
-
+**As an Executable**
 With a single command at the terminal, RR can:
 1. Crawl your website and output a *valid XML sitemap* based on what it found.
 2. Crawl a target website and *download all files of a given filetype*.
 3. Crawl a target website, *collect important SEO information* such as page titles, meta descriptions and h1 tags, and write it to CSV.
 
+**Used in Custom scripts**
+As of version 1.3.0, with the PageIterator class you can pass a custom block that will get run against each page during a crawl, and collect the results in an array. This means you can define for yourself whatever it is you want to collect from each page during the crawl.
+
 Help & Forks Welcome!
 
 Getting started
@@ -94,7 +114,7 @@ and OPTIONS is the applicable:
   -h, --help            *Display this screen*
 
 
-Using as a Library (starting as of version 1.3.0
+Using as a Library (starting as of version 1.3.0)
 ------------------
 
 If you want to collect something, other than that which the executable allows, on a 'per page' basis then you want to use the PageIterator class. Then you can run whatever block you want against each individual page's source code located during the crawl.
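Per the library-usage section above, a crawl with a per-page block could look roughly like this; the option hash and block body are illustrative assumptions rather than text copied from the gem's docs:

    require 'retriever'

    # The block runs against every page found during the crawl; whatever it
    # returns is pushed onto the result stack, readable via #result afterwards.
    rr = Retriever::PageIterator.new('http://www.cnet.com', 'maxpages' => 100) do |page|
      [page.url, page.title]
    end
    rr.result.each { |url, title| puts "#{title} -- #{url}" }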
data/spec/link_spec.rb
CHANGED
@@ -3,7 +3,9 @@ require 'retriever'
 describe 'Link' do
 
   t = Retriever::Target.new('http://www.cnet.com/reviews/')
-  let(:links)
+  let(:links) do
+    Retriever::Page.new('http://www.cnet.com/reviews/', @source, t).links
+  end
 
   it 'collects links in anchor tags' do
     @source = (<<SOURCE).strip
@@ -46,7 +48,7 @@ SOURCE
     expect(links).to include('http://www.cnet.com/download.exe')
   end
 
-  it "doesn't care about any extra attributes on the anchor tag" do
+  it "doesn\'t care about any extra attributes on the anchor tag" do
     @source = (<<SOURCE).strip
 <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
 <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
@@ -63,6 +65,13 @@ SOURCE
 SOURCE
 
     expect(links).to include('http://www.cnet.com/test.html',
-                             'http://www.cnet.com/cpage_18')
+                             'http://www.cnet.com/reviews/cpage_18')
+  end
+  it 'collects files even when query strings exist' do
+    @source = (<<SOURCE).strip
+<a href='http://mises.org/system/tdf/Robert%20Nozick%20and%20Murray%20Rothbard%20David%20Gordon.mp3?file=1&type=audio' type='audio/mpeg; length=22217599' title='Robert Nozick and Murray Rothbard David Gordon.mp3'>Download audio file</a></span></div>
+SOURCE
+
+    expect(links).to include('http://mises.org/system/tdf/Robert%20Nozick%20and%20Murray%20Rothbard%20David%20Gordon.mp3?file=1&type=audio')
   end
 end
data/spec/page_spec.rb
CHANGED
@@ -4,102 +4,89 @@ require 'retriever/fetch'
 t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)
 
 describe 'Page' do
+  let(:common_source) do
+    <<-SOURCE
+    <title>test</title>
+    <a href='www.cnet.com/download.exe'>download</a>
+    <a href='/test.html'>test</a>
+    <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
+    </a>
+    <a href='http://www.cnet.com/products/gadgets/' id='gadgets-link'>gadgets </a>
+    <a href='http://www.yahoo.com/test/'>yahoo</a>"
+    <meta name='description' content="test2 ">
+    <h1>test 3</h1>
+    <h2> test 4 </h2>
+    SOURCE
+  end
+
   describe '#url' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     it 'returns current page URL' do
-      @source = (<<SOURCE).strip
-<a href='http://www.cnet.com/'>download</a>
-SOURCE
       expect(page.url).to eq('http://www.cnet.com/')
     end
   end
 
   describe '#links' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     it 'collects all unique href links on the page' do
-      @source = (<<SOURCE).strip
-<a href='www.cnet.com/download.exe'>download</a>
-<a href='/test.html'>test</a>
-<a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
-</a>
-<a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
-<a href='http://www.yahoo.com/test/'>yahoo</a>
-SOURCE
-
       expect(page.links.size).to eq(4)
     end
   end
 
   describe '#parse_internal' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     let(:links) { page.parse_internal }
     it 'filters links by host' do
-
-<a href='http://www.cnet.com/'>download</a>
-<a href='http://www.yahoo.com/test/'>yahoo</a>
-SOURCE
-
-      expect(links.size).to eq(1)
+      expect(links.size).to eq(3)
     end
   end
 
   describe '#parse_internal_visitable' do
-    let(:
+    let(:source) { "<link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />" }
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', source, t) }
     let(:links) { page.parse_internal_visitable }
     it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
-      @source = (<<SOURCE).strip
-<link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
-SOURCE
       expect(links.size).to eq(0)
     end
   end
 
   describe '#parse_files' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     let(:files) { page.parse_files(page.parse_internal) }
     it 'filters links by filetype' do
-      @source = (<<SOURCE).strip
-<a href='www.cnet.com/download.exe'>download</a>
-http://www.google.com
-<a href='/test.html'>test</a>
-SOURCE
       expect(files.size).to eq(1)
     end
   end
 
+  describe '#parse_by_css' do
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
+
+    it 'returns the text from the received css selector' do
+      expect(page.parse_by_css('#gadgets-link')).to eq('gadgets ')
+    end
+  end
+
   describe '#title' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     it 'returns page title' do
-      @source = (<<SOURCE).strip
-<title>test</title>
-SOURCE
       expect(page.title).to eq('test')
     end
   end
   describe '#desc' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     it 'returns meta description' do
-      @source = (<<SOURCE).strip
-<meta name='description' content="test2 ">
-SOURCE
       expect(page.desc).to eq('test2 ')
     end
   end
   describe '#h1' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
    it 'returns h1 text' do
-      @source = (<<SOURCE).strip
-<h1>test 3</h1>
-SOURCE
       expect(page.h1).to eq('test 3')
     end
   end
   describe '#h2' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     it 'returns h2 text' do
-      @source = (<<SOURCE).strip
-<h2> test 4 </h2>
-SOURCE
       expect(page.h2).to eq(' test 4 ')
     end
   end
data/spec/retriever_spec.rb
CHANGED
@@ -11,7 +11,7 @@ describe 'Fetch' do
   end
 
   let(:nil_response) do
-    r.good_response?(nil,'http://www.yahoo.com')
+    r.good_response?(nil, 'http://www.yahoo.com')
   end
 
   let(:unsuccessful_resp) do
@@ -20,14 +20,14 @@ describe 'Fetch' do
     resp.stub(:successful?).and_return(false)
     resp.stub(:server_error?).and_return(false)
     resp.stub(:client_error?).and_return(false)
-    r.good_response?(resp,'http://www.yahoo.com')
+    r.good_response?(resp, 'http://www.yahoo.com')
   end
 
   let(:redir_resp) do
     resp.stub(:response_header).and_return(resp)
     resp.stub(:redirection?).and_return(true)
     resp.stub(:location).and_return('http://www.google.com')
-    r.good_response?(resp,'http://www.yahoo.com')
+    r.good_response?(resp, 'http://www.yahoo.com')
   end
 
   let(:bad_content_type_resp) do
@@ -35,7 +35,7 @@ describe 'Fetch' do
     resp.stub(:redirection?).and_return(false)
     resp.stub(:successful?).and_return(true)
     resp['CONTENT_TYPE'] = 'image/jpeg'
-    r.good_response?(resp,'http://www.yahoo.com')
+    r.good_response?(resp, 'http://www.yahoo.com')
   end
 
   let(:success_resp) do
@@ -43,7 +43,7 @@ describe 'Fetch' do
     resp.stub(:redirection?).and_return(false)
     resp.stub(:successful?).and_return(true)
     resp['CONTENT_TYPE'] = 'text/html'
-    r.good_response?(resp,'http://www.yahoo.com')
+    r.good_response?(resp, 'http://www.yahoo.com')
   end
 
   it 'returns false if the response is empty' do
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 1.3.0
+  version: 1.4.0
 platform: ruby
 authors:
 - Joe Norton
@@ -94,6 +94,20 @@ dependencies:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -136,6 +150,20 @@ dependencies:
   - - "~>"
     - !ruby/object:Gem::Version
       version: '2.14'
+- !ruby/object:Gem::Dependency
+  name: pry
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: Asynchronous web crawler, scraper and file harvester
 email:
 - joe@softwarebyjoe.com