rubyretriever 1.2.4 → 1.3.0
This diff shows the changes between publicly released versions of the package as they appear in the supported public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/bin/rr +1 -1
- data/lib/retriever/core_ext.rb +13 -0
- data/lib/retriever/fetch.rb +53 -44
- data/lib/retriever/fetchfiles.rb +1 -0
- data/lib/retriever/fetchseo.rb +1 -0
- data/lib/retriever/fetchsitemap.rb +1 -0
- data/lib/retriever/page.rb +11 -9
- data/lib/retriever/page_iterator.rb +21 -0
- data/lib/retriever/version.rb +1 -1
- data/lib/retriever.rb +2 -0
- data/readme.md +52 -11
- data/spec/link_spec.rb +1 -1
- data/spec/page_spec.rb +18 -9
- data/spec/retriever_spec.rb +65 -0
- data/spec/target_spec.rb +7 -4
- metadata +38 -22
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 05f8e6c0169af87c8284c8b6e98d5f25488b0980
+  data.tar.gz: a45a361b215b5ae7832e762b08bbdb989d0847a1
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8cee32f96e0ea0fe003a109016c6b17f9ddde9a73d72dbfd0a95c63e413b87b41c2ccf5bbc86b9886f78f59f053fbcd27aad0cbbbededbe8b002f0f7d986c528
+  data.tar.gz: fd0762069a69f7383a59b4058b46bde46793defec437eb1525e69f98f60e4429c1dc676d2d8ae3fa828c8d82d2018a9e9685bb15537424cb3e15fe0d5c472ade
data/bin/rr
CHANGED
@@ -32,7 +32,7 @@ optparse = OptionParser.new do |opts|
   opts.on('-p', '--progress', 'Output progress bar') do
     options['progress'] = true
   end
-  options['maxpages'] =
+  options['maxpages'] = 100
   opts.on('-l',
           '--limit PAGE_LIMIT_#',
           'set a max on the total number of crawled pages') do |maxp|
data/lib/retriever/fetch.rb
CHANGED
@@ -15,6 +15,7 @@ module Retriever
     # There is no direct output
     # this is a parent class that the other fetch classes build off of.
     def initialize(url, options)
+      @iterator = false
       @result = []
       @connection_tally = {
         success: 0,
@@ -27,6 +28,9 @@ module Retriever
       @t = Retriever::Target.new(url, @file_re)
       @output = "rr-#{@t.host.split('.')[1]}" if @fileharvest && !@output
       @already_crawled = setup_bloom_filter
+    end
+
+    def start
       @page_one = crawl_page_one
       @link_stack = create_link_stack
       @temp_link_stack = []
@@ -80,6 +84,39 @@ module Retriever
       puts
     end

+    # returns true is resp is ok to continue
+    def good_response?(resp, url)
+      return false unless resp
+      hdr = resp.response_header
+      if hdr.redirection?
+        loc = hdr.location
+        lg("#{url} Redirected to #{loc}")
+        if t.host_re =~ loc
+          @temp_link_stack.push(loc) unless @already_crawled.include?(loc)
+          lg('--Added to stack for later')
+          return false
+        end
+        lg("Redirection outside of target host. No - go. #{loc}")
+        return false
+      end
+      # lets not continue if unsuccessful connection
+      unless hdr.successful?
+        lg("UNSUCCESSFUL CONNECTION -- #{url}")
+        @connection_tally[:error] += 1
+        @connection_tally[:error_server] += 1 if hdr.server_error?
+        @connection_tally[:error_client] += 1 if hdr.client_error?
+        return false
+      end
+      # let's not continue if not text/html
+      unless hdr['CONTENT_TYPE'] =~ %r{(text/html|application/xhtml+xml)}
+        @already_crawled.insert(url)
+        lg("Page Not text/html -- #{url}")
+        return false
+      end
+      @connection_tally[:success] += 1
+      true
+    end
+
     private

     def setup_options(options)
@@ -119,7 +156,7 @@ module Retriever
     end

     def crawl_page_one
-      page_one = Retriever::Page.new(@t.source, @t)
+      page_one = Retriever::Page.new(@t.target, @t.source, @t)
       lg("URL Crawled: #{@t.target}")
       page_one
     end
@@ -141,13 +178,13 @@ module Retriever

     # iterates over the existing @link_stack
     # running until we reach the @max_pages value.
-    def async_crawl_and_collect
+    def async_crawl_and_collect(&block)
       while @already_crawled.size < @max_pages
         if @link_stack.empty?
           end_crawl_notice
           break
         end
-        new_links_arr = process_link_stack
+        new_links_arr = process_link_stack(&block)
         @temp_link_stack = []
         next if new_links_arr.nil? || new_links_arr.empty?
         @link_stack.concat(new_links_arr)
@@ -157,47 +194,14 @@ module Retriever
       @result.uniq!
     end

-
-    def good_response?(resp, url)
-      return false unless resp
-      hdr = resp.response_header
-      if hdr.redirection?
-        loc = hdr.location
-        lg("#{url} Redirected to #{loc}")
-        if t.host_re =~ loc
-          @temp_link_stack.push(loc) unless @already_crawled.include?(loc)
-          lg('--Added to stack for later')
-          return false
-        end
-        lg("Redirection outside of target host. No - go. #{loc}")
-        return false
-      end
-      # lets not continue if unsuccessful connection
-      unless hdr.successful?
-        lg("UNSUCCESSFUL CONNECTION -- #{url}")
-        @connection_tally[:error] += 1
-        @connection_tally[:error_server] += 1 if hdr.server_error?
-        @connection_tally[:error_client] += 1 if hdr.client_error?
-        return false
-      end
-      # let's not continue if not text/html
-      unless hdr['CONTENT_TYPE'].include?('text/html')
-        @already_crawled.insert(url)
-        lg("Page Not text/html -- #{url}")
-        return false
-      end
-      @connection_tally[:success] += 1
-      true
-    end
-
-    def push_seo_to_data(url, new_page)
+    def push_seo_to_result(url, new_page)
       seos = [url]
       seos.concat(new_page.parse_seo)
       @result.push(seos)
       lg('--page SEO scraped')
     end

-    def
+    def push_files_to_result(new_page)
       filez = new_page.parse_files(new_page.parse_internal)
       @result.concat(filez) unless filez.empty?
       lg("--#{filez.size} files found")
@@ -209,7 +213,7 @@ module Retriever
       if @progress && (@already_crawled.size < @max_pages)
         @progressbar.increment
       end
-      Retriever::Page.new(response, @t)
+      Retriever::Page.new(url, response, @t)
     end

     def new_visitable_links(current_page)
@@ -217,10 +221,16 @@ module Retriever
       current_page.parse_internal_visitable
     end

+    def push_custom_to_result(url, current_page, &block)
+      data = block.call current_page
+      @result.push(data) unless data.empty?
+      lg("-- PageIterator called on: #{url}")
+    end
+
     # send a new wave of GET requests, using current @link_stack
     # at end of the loop it empties link_stack
     # puts new links into temporary stack
-    def process_link_stack
+    def process_link_stack(&block)
       EM.synchrony do
         concurrency = 10
         EM::Synchrony::FiberIterator.new(@link_stack, concurrency).each do |url|
@@ -230,20 +240,19 @@ module Retriever
           next unless good_response?(resp, url)
           current_page = page_from_response(url, resp.response)
           # non-link dependent modes
-
+          push_seo_to_result(url, current_page) if @seo
+          push_custom_to_result(url, current_page, &block) if @iterator
           next unless current_page.links.size > 0
           @temp_link_stack.push(new_visitable_links(current_page))
           # link dependent modes
           next unless @fileharvest
-
+          push_files_to_result(current_page)
         end
         EventMachine.stop
       end
       # empty the stack. most clean way
       @link_stack = []
       # temp contains redirects + new visitable links
-      # we will re-initialize it as empty right after this function
-      # in the parent method 'async crawl and collect'
       @temp_link_stack.flatten.uniq!
     end
   end
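The net effect of this refactor is that `Fetch#initialize` now stops after setup and the crawl itself moves behind the new `start` method, with `async_crawl_and_collect` optionally receiving a block. A minimal sketch of that pattern, using a hypothetical subclass (not part of the gem) and only method names that appear in the hunks above:

```ruby
require 'retriever'

module Retriever
  # Hypothetical subclass for illustration only; FetchFiles and PageIterator
  # in this release follow the same shape.
  class FetchCustom < Fetch
    def initialize(url, options)
      super                      # sets up @result, @connection_tally, @t and the bloom filter
      start                      # crawls page one and builds the initial @link_stack
      async_crawl_and_collect    # works through @link_stack until @max_pages is reached
      @progressbar.finish if @progress
    end
  end
end
```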
data/lib/retriever/fetchfiles.rb
CHANGED
@@ -5,6 +5,7 @@ module Retriever
   class FetchFiles < Fetch
     def initialize(url, options)
       super
+      start
       temp_file_collection = @page_one.parse_files(@page_one.parse_internal)
       @result.concat(temp_file_collection) if temp_file_collection.size > 0
       lg("#{@result.size} new files found")
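`FetchFiles` keeps its one-call construction by invoking `start` right after `super`, so existing callers are unaffected. A rough usage sketch; `'maxpages'` is the option key shown in the readme sample further down, while the file-type key here is only an assumption mirroring the `-f/--files FILETYPE` flag:

```ruby
require 'retriever'

opts = {
  'maxpages'    => 10,    # key confirmed by the readme's PageIterator example
  'fileharvest' => 'pdf'  # ASSUMED key name; the CLI flag is -f/--files FILETYPE
}
harvest = Retriever::FetchFiles.new('http://www.example.com', opts)
puts harvest.result       # file URLs collected during the crawl
```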
data/lib/retriever/fetchseo.rb
CHANGED
data/lib/retriever/page.rb
CHANGED
@@ -1,5 +1,6 @@
 require 'addressable/uri'
-
+#
+using SourceString
 module Retriever
   #
   class Page
@@ -30,11 +31,12 @@ module Retriever
     wmv|flv|mp3|wav|doc|txt|ico|xml)
     /ix).freeze

-    attr_reader :links, :source, :t
+    attr_reader :links, :source, :t, :url

-    def initialize(source, t)
+    def initialize(url, source, t)
+      @url = url
       @t = t
-      @source = source.
+      @source = source.encode_utf8_and_replace
       @links = nil
     end

@@ -59,24 +61,24 @@ module Retriever
       parse_internal.select { |x| !(NONPAGE_EXT_RE =~ x) }
     end

-    def parse_files(arr)
+    def parse_files(arr = parse_internal)
       arr.select { |x| @t.file_re =~ x }
     end

     def title
-      TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : ''
+      TITLE_RE =~ @source ? @source.match(TITLE_RE)[1].decode_html : ''
     end

     def desc
-      DESC_RE =~ @source ? @source.match(DESC_RE)[1]
+      DESC_RE =~ @source ? @source.match(DESC_RE)[1].decode_html : ''
     end

     def h1
-      H1_RE =~ @source ? @source.match(H1_RE)[1]
+      H1_RE =~ @source ? @source.match(H1_RE)[1].decode_html : ''
     end

     def h2
-      H2_RE =~ @source ? @source.match(H2_RE)[1]
+      H2_RE =~ @source ? @source.match(H2_RE)[1].decode_html : ''
     end

     def parse_seo
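`lib/retriever/core_ext.rb` (+13 lines) is not expanded in this diff, but the new `Page` code depends on it through `using SourceString`, `#encode_utf8_and_replace` and `#decode_html`. A minimal sketch of what such a String refinement could look like, assuming the new `htmlentities` dependency is what backs `decode_html`:

```ruby
require 'htmlentities'

# Sketch only -- the real core_ext.rb is not shown in this diff.
module SourceString
  refine String do
    # normalize scraped page source to UTF-8, replacing invalid/undefined bytes
    def encode_utf8_and_replace
      encode('UTF-8', invalid: :replace, undef: :replace, replace: ' ')
    end

    # decode HTML entities (&amp; -> &) in scraped titles, descriptions and headings
    def decode_html
      HTMLEntities.new.decode(self)
    end
  end
end
```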
data/lib/retriever/page_iterator.rb
ADDED
@@ -0,0 +1,21 @@
+module Retriever
+  #
+  class PageIterator < Fetch
+    # recieves target url and RR options, and a block
+    # runs the block on all pages during crawl, pushing
+    # the returned value of the block onto a result stack
+    # the complete data returned from the crawl is accessible thru self.result
+    def initialize(url, options, &block)
+      super
+      start
+      fail 'block required for PageIterator' unless block_given?
+      @iterator = true
+      @result.push(block.call @page_one)
+      lg("-- PageIterator crawled- #{url}")
+      async_crawl_and_collect(&block)
+      # done, make sure progress bar says we are done
+      @progressbar.finish if @progress
+      @result.sort_by! { |x| x.length } if @result.size > 1
+    end
+  end
+end
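The block handed to `PageIterator.new` runs once against `@page_one` and then, via `async_crawl_and_collect(&block)` and `push_custom_to_result`, against every page fetched during the crawl; each return value lands on the result stack. An illustrative call (URL and block body are examples; the API matches the readme sample below):

```ruby
require 'retriever'

# Collect [url, meta description] pairs for up to 5 pages.
opts = { 'maxpages' => 5 }
crawl = Retriever::PageIterator.new('http://www.example.com', opts) do |page|
  [page.url, page.desc]
end
puts crawl.result.inspect
```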
data/lib/retriever/version.rb
CHANGED
data/lib/retriever.rb
CHANGED
@@ -1,7 +1,9 @@
+require 'retriever/core_ext'
 require 'retriever/fetch'
 require 'retriever/fetchfiles'
 require 'retriever/fetchsitemap'
 require 'retriever/fetchseo'
+require 'retriever/page_iterator'
 require 'retriever/cli'
 require 'retriever/link'
 require 'retriever/target'
data/readme.md
CHANGED
@@ -25,7 +25,8 @@ Features

 Use cases
 ---------
-RubyRetriever can do multiple things for you.
+RubyRetriever can do multiple things for you. As an Executable
+With a single command at the terminal, RR can:
 1. Crawl your website and output a *valid XML sitemap* based on what it found.
 2. Crawl a target website and *download all files of a given filetype*.
 3. Crawl a target website, *collect important SEO information* such as page titles, meta descriptions and h1 tags, and write it to CSV.
@@ -36,41 +37,44 @@ Getting started
 -----------
 Install the gem
 ```sh
-gem install rubyretriever
+$ gem install rubyretriever
 ```
+

+Using the Executable
+--------------------
 **Example: Sitemap mode**
 ```sh
-rr --sitemap CSV --progress --limit
+$ rr --sitemap CSV --progress --limit 10 http://www.cnet.com
 ```
 OR -- SAME COMMAND
 ```sh
-rr -s csv -p -l
+$ rr -s csv -p -l 10 http://www.cnet.com
 ```

-This would map http://www.cnet.com until it crawled a max of
+This would map http://www.cnet.com until it crawled a max of 10 pages, then write the results to a CSV named cnet. Optionally, you could also use the format XML and RR would output the same URL list into a valid XML sitemap that could be submitted to Google.

 **Example: File Harvesting mode**
 ```sh
-rr --files
+$ rr --files txt --verbose --limit 1 http://textfiles.com/programming/
 ```
 OR -- SAME COMMAND
 ```sh
-rr -f
+$ rr -f txt -v -l 1 http://textfiles.com/programming/
 ```

-This would crawl http://
+This would crawl http://textfiles.com/programming/ looking for txt files for only a single page, then write out a list of filepaths to txt files to the terminal. Optionally, you could have the script autodownload all the files by adding the -a/--auto flag.

 **Example: SEO mode**
 ```sh
-rr --seo --progress --limit
+$ rr --seo --progress --limit 10 --out cnet-seo http://www.cnet.com
 ```
 OR -- SAME COMMAND
 ```sh
-rr -e -p -l 10 -o cnet-seo http://www.cnet.com
+$ rr -e -p -l 10 -o cnet-seo http://www.cnet.com
 ```

-This would go to http://www.cnet.com and crawl a max of
+This would go to http://www.cnet.com and crawl a max of 10 pages, during which it would collect the SEO fields on those pages - this currently means [url, page title, meta description, h1 text, h2 text]. It would then write the fields to a csv named cnet-seo.


 command-line arguments
@@ -89,11 +93,48 @@ and OPTIONS is the applicable:
 -l, --limit PAGE_LIMIT_# *set a max on the total number of crawled pages*
 -h, --help *Display this screen*

+
+Using as a Library (starting as of version 1.3.0 -- yet to be released)
+------------------
+
+If you want to collect something, other than that which the executable allows, on a 'per page' basis then you want to use the PageIterator class. Then you can run whatever block you want against each individual page's source code located during the crawl.
+
+Sample Script using **PageIterator**
+```ruby
+require 'retriever'
+opts = {
+  'maxpages' => 1
+}
+t = Retriever::PageIterator.new('http://www.basecamp.com', opts) do |page|
+  [page.url, page.title]
+end
+puts t.result.to_s
+```
+
+```sh
+>> [["http://www.basecamp.com", "Basecamp is everyone’s favorite project management app."]]
+```
+Available methods on the page iterator:
+* **#url** - returns full URL of current page
+* **#source** - returns raw page source code
+* **#title** - returns html decoded verson of curent page title
+* **#desc** - returns html decoded verson of curent page meta description
+* **#h1** - returns html decoded verson of current page's h1 tag
+* **#h2** - returns html decoded verson of current page's h2 tag
+* **#links** - returns array of all links on the page
+* **#parse_internal** - returns array of current page's internal (same host) links
+* **#parse_internal_visitable** - returns #parse_internal plus added filtering of only links that are visitable
+* **#parse_seo** - returns array of current page's html decoded title, desc, h1 and h2
+* **#parse_files** - returns array of downloaded files of type supplied as RR options (fileharvest options)
+
+
 Current Requirements
 ------------
 em-synchrony
 ruby-progressbar
 bloomfilter-rb
+addressable
+htmlentities

 License
 -------
data/spec/link_spec.rb
CHANGED
@@ -3,7 +3,7 @@ require 'retriever'
 describe 'Link' do

   t = Retriever::Target.new('http://www.cnet.com/reviews/')
-  let(:links) { Retriever::Page.new(@source, t).links }
+  let(:links) { Retriever::Page.new('http://www.cnet.com/reviews/', @source, t).links }

   it 'collects links in anchor tags' do
     @source = (<<SOURCE).strip
data/spec/page_spec.rb
CHANGED
@@ -4,9 +4,18 @@ require 'retriever/fetch'
 t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)

 describe 'Page' do
+  describe '#url' do
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
+    it 'returns current page URL' do
+      @source = (<<SOURCE).strip
+      <a href='http://www.cnet.com/'>download</a>
+SOURCE
+      expect(page.url).to eq('http://www.cnet.com/')
+    end
+  end

   describe '#links' do
-    let(:
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
     it 'collects all unique href links on the page' do
       @source = (<<SOURCE).strip
       <a href='www.cnet.com/download.exe'>download</a>
@@ -17,12 +26,12 @@ describe 'Page' do
       <a href='http://www.yahoo.com/test/'>yahoo</a>
 SOURCE

-      expect(links.size).to eq(4)
+      expect(page.links.size).to eq(4)
     end
   end

   describe '#parse_internal' do
-    let(:page) { Retriever::Page.new(@source, t) }
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
     let(:links) { page.parse_internal }
     it 'filters links by host' do
       @source = (<<SOURCE).strip
@@ -35,7 +44,7 @@ SOURCE
     end

   describe '#parse_internal_visitable' do
-    let(:page) { Retriever::Page.new(@source, t) }
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
     let(:links) { page.parse_internal_visitable }
     it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
       @source = (<<SOURCE).strip
@@ -46,7 +55,7 @@ SOURCE
     end

   describe '#parse_files' do
-    let(:page) { Retriever::Page.new(@source, t) }
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
     let(:files) { page.parse_files(page.parse_internal) }
     it 'filters links by filetype' do
       @source = (<<SOURCE).strip
@@ -59,7 +68,7 @@ SOURCE
     end

   describe '#title' do
-    let(:page) { Retriever::Page.new(@source, t) }
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
     it 'returns page title' do
       @source = (<<SOURCE).strip
       <title>test</title>
@@ -68,7 +77,7 @@ SOURCE
     end
   end
   describe '#desc' do
-    let(:page) { Retriever::Page.new(@source, t) }
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
     it 'returns meta description' do
       @source = (<<SOURCE).strip
       <meta name='description' content="test2 ">
@@ -77,7 +86,7 @@ SOURCE
     end
   end
   describe '#h1' do
-    let(:page) { Retriever::Page.new(@source, t) }
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
     it 'returns h1 text' do
       @source = (<<SOURCE).strip
       <h1>test 3</h1>
@@ -86,7 +95,7 @@ SOURCE
     end
   end
   describe '#h2' do
-    let(:page) { Retriever::Page.new(@source, t) }
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
     it 'returns h2 text' do
       @source = (<<SOURCE).strip
       <h2> test 4 </h2>
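Every updated spec now passes the page URL as the new first argument to `Page.new`. Outside the specs the constructor can be used the same way (illustrative values only):

```ruby
require 'retriever'

t      = Retriever::Target.new('http://www.example.com/')
source = "<title>Example &amp; Co.</title><h1>Hello</h1>"
page   = Retriever::Page.new('http://www.example.com/', source, t)

page.url    # => "http://www.example.com/"
page.title  # => "Example & Co."  (entities decoded by #decode_html)
page.h1     # => "Hello"
```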
data/spec/retriever_spec.rb
CHANGED
@@ -1,4 +1,69 @@
 require 'retriever'

 describe 'Fetch' do
+  describe '#good_response?' do
+    let(:r) do
+      Retriever::Fetch.new('http://www.yahoo.com', {})
+    end
+
+    let(:resp) do
+      {}
+    end
+
+    let(:nil_response) do
+      r.good_response?(nil,'http://www.yahoo.com')
+    end
+
+    let(:unsuccessful_resp) do
+      resp.stub(:response_header).and_return(resp)
+      resp.stub(:redirection?).and_return(false)
+      resp.stub(:successful?).and_return(false)
+      resp.stub(:server_error?).and_return(false)
+      resp.stub(:client_error?).and_return(false)
+      r.good_response?(resp,'http://www.yahoo.com')
+    end
+
+    let(:redir_resp) do
+      resp.stub(:response_header).and_return(resp)
+      resp.stub(:redirection?).and_return(true)
+      resp.stub(:location).and_return('http://www.google.com')
+      r.good_response?(resp,'http://www.yahoo.com')
+    end
+
+    let(:bad_content_type_resp) do
+      resp.stub(:response_header).and_return(resp)
+      resp.stub(:redirection?).and_return(false)
+      resp.stub(:successful?).and_return(true)
+      resp['CONTENT_TYPE'] = 'image/jpeg'
+      r.good_response?(resp,'http://www.yahoo.com')
+    end
+
+    let(:success_resp) do
+      resp.stub(:response_header).and_return(resp)
+      resp.stub(:redirection?).and_return(false)
+      resp.stub(:successful?).and_return(true)
+      resp['CONTENT_TYPE'] = 'text/html'
+      r.good_response?(resp,'http://www.yahoo.com')
+    end
+
+    it 'returns false if the response is empty' do
+      expect(nil_response).to eq(false)
+    end
+
+    it 'returns false on unsuccessful connection' do
+      expect(unsuccessful_resp).to eq(false)
+    end
+
+    it 'returns false on redirecting host' do
+      expect(redir_resp).to eq(false)
+    end
+
+    it 'returns false on non-visitable content type' do
+      expect(bad_content_type_resp).to eq(false)
+    end
+
+    it 'returns true otherwise' do
+      expect(success_resp).to eq(true)
+    end
+  end
 end
data/spec/target_spec.rb
CHANGED
@@ -1,9 +1,10 @@
 require 'retriever'
 require 'open-uri'

-t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)
-
 describe 'Target' do
+  let(:t) do
+    Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)
+  end

   it 'creates target var' do
     expect(t.target).to eq('http://www.cnet.com/reviews/')
@@ -30,6 +31,9 @@ describe 'Target' do
   end

   describe '#source' do
+    let(:redirecting_url) do
+      Retriever::Target.new('http://software-by-joe.appspot.com').source
+    end

     it 'opens URL and returns source as String' do
       expect(Retriever::Target.new('http://techcrunch.com/').source.class)
@@ -37,8 +41,7 @@ describe 'Target' do
     end

     it 'fails if target redirects to new host' do
-      expect {
-        .to raise_error
+      expect { redirecting_url }.to raise_error
     end
   end
 end
metadata
CHANGED
@@ -1,125 +1,139 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 1.
+  version: 1.3.0
 platform: ruby
 authors:
 - Joe Norton
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-06-
+date: 2014-06-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: em-synchrony
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: em-http-request
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
 - !ruby/object:Gem::Dependency
   name: ruby-progressbar
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
 - !ruby/object:Gem::Dependency
   name: bloomfilter-rb
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
 - !ruby/object:Gem::Dependency
   name: addressable
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: htmlentities
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.6'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.6'
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
      - !ruby/object:Gem::Version
        version: '10.3'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
      - !ruby/object:Gem::Version
        version: '10.3'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
      - !ruby/object:Gem::Version
        version: '2.14'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
      - !ruby/object:Gem::Version
        version: '2.14'
 description: Asynchronous web crawler, scraper and file harvester
@@ -134,6 +148,7 @@ files:
 - bin/rr
 - lib/retriever.rb
 - lib/retriever/cli.rb
+- lib/retriever/core_ext.rb
 - lib/retriever/fetch.rb
 - lib/retriever/fetchfiles.rb
 - lib/retriever/fetchseo.rb
@@ -141,6 +156,7 @@ files:
 - lib/retriever/link.rb
 - lib/retriever/openuri_redirect_patch.rb
 - lib/retriever/page.rb
+- lib/retriever/page_iterator.rb
 - lib/retriever/target.rb
 - lib/retriever/version.rb
 - readme.md
@@ -159,17 +175,17 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - -
+  - - ">="
   - !ruby/object:Gem::Version
-    version:
+    version: 2.0.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - -
+  - - ">="
   - !ruby/object:Gem::Version
     version: 1.3.6
 requirements: []
 rubyforge_project: rubyretriever
-rubygems_version: 2.
+rubygems_version: 2.3.0
 signing_key:
 specification_version: 4
 summary: Ruby Web Crawler & File Harvester