rubyretriever 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/lib/retriever/fetch.rb +2 -2
- data/lib/retriever/fetchfiles.rb +1 -1
- data/lib/retriever/fetchseo.rb +1 -1
- data/lib/retriever/fetchsitemap.rb +1 -1
- data/lib/retriever/link.rb +14 -7
- data/lib/retriever/page.rb +11 -3
- data/lib/retriever/page_iterator.rb +1 -1
- data/lib/retriever/version.rb +1 -1
- data/readme.md +30 -10
- data/spec/link_spec.rb +12 -3
- data/spec/page_spec.rb +34 -47
- data/spec/retriever_spec.rb +5 -5
- metadata +29 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9a282bd399fdff64f26493e2b76a44df2ece00e4
+  data.tar.gz: 57b52b0b8b56116ae924fe7a03c912526c3342b2
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 92b91cc247b847a845b48d33e7fdbbc828e2e3372663d06abcdbd0dc5a524357319dfddbc28f879419670aea0f0a5ab88934ae515b4d31087fe67d534bf95e26
+  data.tar.gz: 3bdb6d28f487209b36426e6b00a7c929d049e539d3cda583cbd13e73902d3230f0a3a2d081af4ab5939110b1e2685b4202e67a1b11bb256d4c924e46cdeefb57
data/LICENSE
CHANGED
data/lib/retriever/fetch.rb
CHANGED
@@ -84,7 +84,7 @@ module Retriever
       puts
     end

-
+    # returns true is resp is ok to continue
     def good_response?(resp, url)
       return false unless resp
       hdr = resp.response_header
@@ -128,7 +128,7 @@ module Retriever
       @sitemap = options['sitemap']
       @seo = options['seo']
       @autodown = options['autodown']
-      @file_re = Regexp.new(/.#{@fileharvest}
+      @file_re = Regexp.new(/.#{@fileharvest}/).freeze if @fileharvest
     end

     def setup_bloom_filter
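The rebuilt `@file_re` line interpolates the requested file extension into a regexp that the crawler later uses to pick file links out of each page. A minimal standalone sketch of that idea (the `fileharvest` value and the link list below are invented for illustration, not taken from the gem):

```ruby
# Sketch of the @file_re construction from fetch.rb, run outside the gem.
fileharvest = 'pdf'                              # hypothetical --fileharvest value
file_re = Regexp.new(/.#{fileharvest}/).freeze   # same construction as in the diff

links = [
  'http://example.com/report.pdf',
  'http://example.com/about.html',
  'http://example.com/manual.pdf?v=2'
]

# Only links matching the interpolated extension survive the filter.
puts links.select { |link| file_re =~ link }
```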
data/lib/retriever/fetchfiles.rb
CHANGED
data/lib/retriever/fetchseo.rb
CHANGED
data/lib/retriever/fetchsitemap.rb
CHANGED
data/lib/retriever/link.rb
CHANGED
@@ -7,16 +7,18 @@ module Retriever
     DOUBLE_SLASH_RE = Regexp.new(%r(^/{2}[^/])).freeze
     WWW_DOT_RE = Regexp.new(/^www\./i).freeze

-    def initialize(target_scheme, target_host, this_link)
+    def initialize(target_scheme, target_host, this_link, current_url)
       begin
-
-
-
-
+        #this_link = Addressable::URI.encode(this_link) //not necessary; and breaking links
+        @link_uri = Addressable::URI.parse(this_link)
+      rescue Addressable::URI::InvalidURIError
+        dummy = Retriever::Link.new(target_scheme, target_host, target_host, target_host)
+        @link_uri = Addressable::URI.parse(dummy.path)
       end
       @scheme = target_scheme
       @host = target_host
       @this_link = @link_uri.to_s
+      @current_page_url = current_url
     end

     def path
@@ -30,11 +32,16 @@ module Retriever
       return "#{@scheme}:#{this_link}" if DOUBLE_SLASH_RE =~ this_link

       # link uses relative path with no slashes at all
-
+      if link_uri.relative?
+        if @current_page_url[-1, 1] == "/"
+          return "#{@current_page_url}#{this_link}"
+        end
+        return "#{@current_page_url}/#{this_link}"
+      end
     end

     private

-    attr_reader :this_link, :host, :link_uri
+    attr_reader :this_link, :host, :link_uri, :current_page_url
   end
 end
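The practical effect of threading `current_url` through `Link` is that bare relative hrefs (like `cpage_18`) now resolve against the page they were found on rather than the site root. A hedged sketch of just that resolution rule, written as standalone code rather than the gem's class:

```ruby
# Standalone sketch of the relative-href rule added in link.rb.
# Assumption: current_page_url is the URL of the page being parsed.
def resolve_relative(current_page_url, this_link)
  # a page URL ending in '/' can take the relative path directly;
  # otherwise a separating slash is inserted between page and href
  return "#{current_page_url}#{this_link}" if current_page_url.end_with?('/')
  "#{current_page_url}/#{this_link}"
end

puts resolve_relative('http://www.cnet.com/reviews/', 'cpage_18')
# http://www.cnet.com/reviews/cpage_18
puts resolve_relative('http://www.cnet.com/reviews', 'cpage_18')
# http://www.cnet.com/reviews/cpage_18
```

This matches the updated expectation in spec/link_spec.rb, where `cpage_18` found on `http://www.cnet.com/reviews/` now resolves to `http://www.cnet.com/reviews/cpage_18`.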
data/lib/retriever/page.rb
CHANGED
@@ -1,3 +1,4 @@
+require 'nokogiri'
 require 'addressable/uri'
 #
 using SourceString
@@ -40,7 +41,7 @@ module Retriever
       @links = nil
     end

-    #
+    # receives page source as string
     # returns array of unique href links
     def links
       return @links if @links
@@ -49,12 +50,14 @@ module Retriever
         # filter some malformed URLS that come in
         # meant to be a loose filter to catch all reasonable HREF attributes.
         link = match[0]
-        Link.new(@t.scheme, @t.host, link).path
+        Link.new(@t.scheme, @t.host, link, @url).path
       end.compact.uniq
     end

     def parse_internal
-      links.select
+      links.select do |x|
+        @t.host == Addressable::URI.parse(Addressable::URI.encode(x)).host
+      end
     end

     def parse_internal_visitable
@@ -65,6 +68,11 @@ module Retriever
       arr.select { |x| @t.file_re =~ x }
     end

+    def parse_by_css(selector)
+      nokogiri_doc = Nokogiri::HTML(@source)
+      nokogiri_doc.css(selector).text
+    end
+
     def title
       TITLE_RE =~ @source ? @source.match(TITLE_RE)[1].decode_html : ''
     end
data/lib/retriever/page_iterator.rb
CHANGED
@@ -1,7 +1,7 @@
 module Retriever
   #
   class PageIterator < Fetch
-    #
+    # receives target url and RR options, and a block
     # runs the block on all pages during crawl, pushing
     # the returned value of the block onto a result stack
     # the complete data returned from the crawl is accessible thru self.result
data/lib/retriever/version.rb
CHANGED
data/readme.md
CHANGED
@@ -4,33 +4,53 @@

 By Joe Norton

-RubyRetriever is a Web Crawler,
+RubyRetriever is a Web Crawler, Scraper & File Harvester. Available as a command-line executable and as a crawling framework.

-RubyRetriever (RR) uses asynchronous HTTP requests via [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony) to crawl webpages *very quickly*. RR also uses a Ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of pages it has already crawled.
+RubyRetriever (RR) uses asynchronous HTTP requests via [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony) to crawl webpages *very quickly*. RR also uses a Ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of pages it has already crawled in a memory efficient manner.
+
+**v1.3.1 Update (3/24/2016)** - Several bug fixes.
+
+**v1.3.0 Update (6/22/2014)** - The major change in this update is the new PageIterator class which adds functionality for library/script usage. Now you can run custom blocks against each page during the crawl. This update also includes more tests, and other code improvements to improve modularity and testability.

 **v1.0 Update (6/07/2014)** - Includes major code changes and a lot of bug fixes. It's now much better in dealing with redirects, issues with the host changing, etc. Also added the SEO mode, which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility; thus, this was update 1.0!

 Mission
 -------
-RubyRetriever aims to be the best command-line crawling and scraping package written in Ruby.
+RubyRetriever aims to be the best command-line crawling and scraping package written in Ruby and a replacement for paid software such as Screaming Frog SEO Spider.
+
+
+Roadmap?
+Not sure. Feel free to offer your thoughts.
+
+Some Potential Ideas:
+* 'freeroam mode' - to go on cruising the net endlessly in fileharvest mode
+* 'dead-link finder' mode - collects links returning 404, or other error msgs
+* 'validate robots.txt' mode - outputs the bot-exposed sitemap of your site
+* more sophisticated SEO analysis? replace screaming frog? this would include checks for canonical URL, maybe some keyword density checks, content length checks, etc.

 Features
 --------
 * Asynchronous HTTP Requests thru EM & Synchrony
-* Bloom filter for tracking visited pages
-*
-
-
-*
+* Bloom filter for tracking visited pages
+* Supports HTTPS
+* Follows 301 redirects (if to same host)
+* 3 CLI modes
+* Sitemap - Find all links on a website, output a valid XML sitemap, or just a CSV
+* File Harvest - find all files linked to on a website, option to autodownload
+* SEO - collect important SEO info from every page, output to a CSV (or STDOUT)
+* Run a Custom Block on a Per-Page basis (PageIterator)

 Use cases
 ---------
-
+**As an Executable**
 With a single command at the terminal, RR can:
 1. Crawl your website and output a *valid XML sitemap* based on what it found.
 2. Crawl a target website and *download all files of a given filetype*.
 3. Crawl a target website, *collect important SEO information* such as page titles, meta descriptions and h1 tags, and write it to CSV.

+**Used in Custom scripts**
+As of version 1.3.0, with the PageIterator class you can pass a custom block that will get run against each page during a crawl, and collect the results in an array. This means you can define for yourself whatever it is you want to collect from each page during the crawl.
+
 Help & Forks Welcome!

 Getting started
@@ -94,7 +114,7 @@ and OPTIONS is the applicable:
 -h, --help *Display this screen*


-Using as a Library (starting as of version 1.3.0
+Using as a Library (starting as of version 1.3.0)
 ------------------

 If you want to collect something, other than that which the executable allows, on a 'per page' basis then you want to use the PageIterator class. Then you can run whatever block you want against each individual page's source code located during the crawl.
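Grounded in the PageIterator comment shown earlier in this diff ("receives target url and RR options, and a block", with the collected data "accessible thru self.result"), library usage looks roughly like the sketch below; the option key `'maxpages'` and the target URL are placeholder assumptions rather than quotes from the readme:

```ruby
require 'retriever'

# Hedged sketch of PageIterator usage: the block runs against every page
# fetched during the crawl, and its return values are collected in #result.
rr = Retriever::PageIterator.new('http://www.cnet.com', 'maxpages' => 10) do |page|
  [page.url, page.title]   # Page#url and Page#title both exist per this diff
end

rr.result.each { |url, title| puts "#{title} -- #{url}" }
```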
data/spec/link_spec.rb
CHANGED
@@ -3,7 +3,9 @@ require 'retriever'
 describe 'Link' do

   t = Retriever::Target.new('http://www.cnet.com/reviews/')
-  let(:links)
+  let(:links) do
+    Retriever::Page.new('http://www.cnet.com/reviews/', @source, t).links
+  end

   it 'collects links in anchor tags' do
     @source = (<<SOURCE).strip
@@ -46,7 +48,7 @@ SOURCE
     expect(links).to include('http://www.cnet.com/download.exe')
   end

-  it "doesn't care about any extra attributes on the anchor tag" do
+  it "doesn\'t care about any extra attributes on the anchor tag" do
     @source = (<<SOURCE).strip
 <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
 <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
@@ -63,6 +65,13 @@ SOURCE
 SOURCE

     expect(links).to include('http://www.cnet.com/test.html',
-                             'http://www.cnet.com/cpage_18')
+                             'http://www.cnet.com/reviews/cpage_18')
+  end
+  it 'collects files even when query strings exist' do
+    @source = (<<SOURCE).strip
+<a href='http://mises.org/system/tdf/Robert%20Nozick%20and%20Murray%20Rothbard%20David%20Gordon.mp3?file=1&type=audio' type='audio/mpeg; length=22217599' title='Robert Nozick and Murray Rothbard David Gordon.mp3'>Download audio file</a></span></div>
+SOURCE
+
+    expect(links).to include('http://mises.org/system/tdf/Robert%20Nozick%20and%20Murray%20Rothbard%20David%20Gordon.mp3?file=1&type=audio')
   end
 end
data/spec/page_spec.rb
CHANGED
@@ -4,102 +4,89 @@ require 'retriever/fetch'
 t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)

 describe 'Page' do
+  let(:common_source) do
+    <<-SOURCE
+    <title>test</title>
+    <a href='www.cnet.com/download.exe'>download</a>
+    <a href='/test.html'>test</a>
+    <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
+    </a>
+    <a href='http://www.cnet.com/products/gadgets/' id='gadgets-link'>gadgets </a>
+    <a href='http://www.yahoo.com/test/'>yahoo</a>"
+    <meta name='description' content="test2 ">
+    <h1>test 3</h1>
+    <h2> test 4 </h2>
+    SOURCE
+  end
+
   describe '#url' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     it 'returns current page URL' do
-      @source = (<<SOURCE).strip
-      <a href='http://www.cnet.com/'>download</a>
-      SOURCE
       expect(page.url).to eq('http://www.cnet.com/')
     end
   end

   describe '#links' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     it 'collects all unique href links on the page' do
-      @source = (<<SOURCE).strip
-      <a href='www.cnet.com/download.exe'>download</a>
-      <a href='/test.html'>test</a>
-      <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
-      </a>
-      <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
-      <a href='http://www.yahoo.com/test/'>yahoo</a>
-      SOURCE
-
       expect(page.links.size).to eq(4)
     end
   end

   describe '#parse_internal' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     let(:links) { page.parse_internal }
     it 'filters links by host' do
-
-      <a href='http://www.cnet.com/'>download</a>
-      <a href='http://www.yahoo.com/test/'>yahoo</a>
-      SOURCE
-
-      expect(links.size).to eq(1)
+      expect(links.size).to eq(3)
     end
   end

   describe '#parse_internal_visitable' do
-    let(:
+    let(:source) { "<link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />" }
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', source, t) }
     let(:links) { page.parse_internal_visitable }
     it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
-      @source = (<<SOURCE).strip
-      <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
-      SOURCE
       expect(links.size).to eq(0)
     end
   end

   describe '#parse_files' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     let(:files) { page.parse_files(page.parse_internal) }
     it 'filters links by filetype' do
-      @source = (<<SOURCE).strip
-      <a href='www.cnet.com/download.exe'>download</a>
-      http://www.google.com
-      <a href='/test.html'>test</a>
-      SOURCE
       expect(files.size).to eq(1)
     end
   end

+  describe '#parse_by_css' do
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
+
+    it 'returns the text from the received css selector' do
+      expect(page.parse_by_css('#gadgets-link')).to eq('gadgets ')
+    end
+  end
+
   describe '#title' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     it 'returns page title' do
-      @source = (<<SOURCE).strip
-      <title>test</title>
-      SOURCE
       expect(page.title).to eq('test')
     end
   end
   describe '#desc' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     it 'returns meta description' do
-      @source = (<<SOURCE).strip
-      <meta name='description' content="test2 ">
-      SOURCE
       expect(page.desc).to eq('test2 ')
     end
   end
   describe '#h1' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     it 'returns h1 text' do
-      @source = (<<SOURCE).strip
-      <h1>test 3</h1>
-      SOURCE
       expect(page.h1).to eq('test 3')
     end
   end
   describe '#h2' do
-    let(:page) { Retriever::Page.new('http://www.cnet.com/',
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
     it 'returns h2 text' do
-      @source = (<<SOURCE).strip
-      <h2> test 4 </h2>
-      SOURCE
       expect(page.h2).to eq(' test 4 ')
     end
   end
data/spec/retriever_spec.rb
CHANGED
@@ -11,7 +11,7 @@ describe 'Fetch' do
   end

   let(:nil_response) do
-    r.good_response?(nil,'http://www.yahoo.com')
+    r.good_response?(nil, 'http://www.yahoo.com')
   end

   let(:unsuccessful_resp) do
@@ -20,14 +20,14 @@ describe 'Fetch' do
     resp.stub(:successful?).and_return(false)
     resp.stub(:server_error?).and_return(false)
     resp.stub(:client_error?).and_return(false)
-    r.good_response?(resp,'http://www.yahoo.com')
+    r.good_response?(resp, 'http://www.yahoo.com')
   end

   let(:redir_resp) do
     resp.stub(:response_header).and_return(resp)
     resp.stub(:redirection?).and_return(true)
     resp.stub(:location).and_return('http://www.google.com')
-    r.good_response?(resp,'http://www.yahoo.com')
+    r.good_response?(resp, 'http://www.yahoo.com')
   end

   let(:bad_content_type_resp) do
@@ -35,7 +35,7 @@ describe 'Fetch' do
     resp.stub(:redirection?).and_return(false)
     resp.stub(:successful?).and_return(true)
     resp['CONTENT_TYPE'] = 'image/jpeg'
-    r.good_response?(resp,'http://www.yahoo.com')
+    r.good_response?(resp, 'http://www.yahoo.com')
   end

   let(:success_resp) do
@@ -43,7 +43,7 @@ describe 'Fetch' do
     resp.stub(:redirection?).and_return(false)
     resp.stub(:successful?).and_return(true)
     resp['CONTENT_TYPE'] = 'text/html'
-    r.good_response?(resp,'http://www.yahoo.com')
+    r.good_response?(resp, 'http://www.yahoo.com')
   end

   it 'returns false if the response is empty' do
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 1.3.0
+  version: 1.4.0
 platform: ruby
 authors:
 - Joe Norton
@@ -94,6 +94,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -136,6 +150,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '2.14'
+- !ruby/object:Gem::Dependency
+  name: pry
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: Asynchronous web crawler, scraper and file harvester
 email:
 - joe@softwarebyjoe.com