rubyretriever 1.2.4 → 1.3.0
- checksums.yaml +4 -4
- data/bin/rr +1 -1
- data/lib/retriever/core_ext.rb +13 -0
- data/lib/retriever/fetch.rb +53 -44
- data/lib/retriever/fetchfiles.rb +1 -0
- data/lib/retriever/fetchseo.rb +1 -0
- data/lib/retriever/fetchsitemap.rb +1 -0
- data/lib/retriever/page.rb +11 -9
- data/lib/retriever/page_iterator.rb +21 -0
- data/lib/retriever/version.rb +1 -1
- data/lib/retriever.rb +2 -0
- data/readme.md +52 -11
- data/spec/link_spec.rb +1 -1
- data/spec/page_spec.rb +18 -9
- data/spec/retriever_spec.rb +65 -0
- data/spec/target_spec.rb +7 -4
- metadata +38 -22
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 05f8e6c0169af87c8284c8b6e98d5f25488b0980
+  data.tar.gz: a45a361b215b5ae7832e762b08bbdb989d0847a1
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8cee32f96e0ea0fe003a109016c6b17f9ddde9a73d72dbfd0a95c63e413b87b41c2ccf5bbc86b9886f78f59f053fbcd27aad0cbbbededbe8b002f0f7d986c528
+  data.tar.gz: fd0762069a69f7383a59b4058b46bde46793defec437eb1525e69f98f60e4429c1dc676d2d8ae3fa828c8d82d2018a9e9685bb15537424cb3e15fe0d5c472ade
data/bin/rr
CHANGED
@@ -32,7 +32,7 @@ optparse = OptionParser.new do |opts|
   opts.on('-p', '--progress', 'Output progress bar') do
     options['progress'] = true
   end
-  options['maxpages'] =
+  options['maxpages'] = 100
   opts.on('-l',
           '--limit PAGE_LIMIT_#',
           'set a max on the total number of crawled pages') do |maxp|
data/lib/retriever/fetch.rb
CHANGED
@@ -15,6 +15,7 @@ module Retriever
     # There is no direct output
     # this is a parent class that the other fetch classes build off of.
     def initialize(url, options)
+      @iterator = false
       @result = []
       @connection_tally = {
         success: 0,
@@ -27,6 +28,9 @@ module Retriever
       @t = Retriever::Target.new(url, @file_re)
       @output = "rr-#{@t.host.split('.')[1]}" if @fileharvest && !@output
       @already_crawled = setup_bloom_filter
+    end
+
+    def start
       @page_one = crawl_page_one
       @link_stack = create_link_stack
       @temp_link_stack = []
@@ -80,6 +84,39 @@ module Retriever
       puts
     end

+    # returns true is resp is ok to continue
+    def good_response?(resp, url)
+      return false unless resp
+      hdr = resp.response_header
+      if hdr.redirection?
+        loc = hdr.location
+        lg("#{url} Redirected to #{loc}")
+        if t.host_re =~ loc
+          @temp_link_stack.push(loc) unless @already_crawled.include?(loc)
+          lg('--Added to stack for later')
+          return false
+        end
+        lg("Redirection outside of target host. No - go. #{loc}")
+        return false
+      end
+      # lets not continue if unsuccessful connection
+      unless hdr.successful?
+        lg("UNSUCCESSFUL CONNECTION -- #{url}")
+        @connection_tally[:error] += 1
+        @connection_tally[:error_server] += 1 if hdr.server_error?
+        @connection_tally[:error_client] += 1 if hdr.client_error?
+        return false
+      end
+      # let's not continue if not text/html
+      unless hdr['CONTENT_TYPE'] =~ %r{(text/html|application/xhtml+xml)}
+        @already_crawled.insert(url)
+        lg("Page Not text/html -- #{url}")
+        return false
+      end
+      @connection_tally[:success] += 1
+      true
+    end
+
     private

     def setup_options(options)
@@ -119,7 +156,7 @@ module Retriever
     end

     def crawl_page_one
-      page_one = Retriever::Page.new(@t.source, @t)
+      page_one = Retriever::Page.new(@t.target, @t.source, @t)
       lg("URL Crawled: #{@t.target}")
       page_one
     end
@@ -141,13 +178,13 @@ module Retriever

     # iterates over the existing @link_stack
     # running until we reach the @max_pages value.
-    def async_crawl_and_collect
+    def async_crawl_and_collect(&block)
       while @already_crawled.size < @max_pages
         if @link_stack.empty?
           end_crawl_notice
           break
         end
-        new_links_arr = process_link_stack
+        new_links_arr = process_link_stack(&block)
         @temp_link_stack = []
         next if new_links_arr.nil? || new_links_arr.empty?
         @link_stack.concat(new_links_arr)
@@ -157,47 +194,14 @@ module Retriever
       @result.uniq!
     end

-
-    def good_response?(resp, url)
-      return false unless resp
-      hdr = resp.response_header
-      if hdr.redirection?
-        loc = hdr.location
-        lg("#{url} Redirected to #{loc}")
-        if t.host_re =~ loc
-          @temp_link_stack.push(loc) unless @already_crawled.include?(loc)
-          lg('--Added to stack for later')
-          return false
-        end
-        lg("Redirection outside of target host. No - go. #{loc}")
-        return false
-      end
-      # lets not continue if unsuccessful connection
-      unless hdr.successful?
-        lg("UNSUCCESSFUL CONNECTION -- #{url}")
-        @connection_tally[:error] += 1
-        @connection_tally[:error_server] += 1 if hdr.server_error?
-        @connection_tally[:error_client] += 1 if hdr.client_error?
-        return false
-      end
-      # let's not continue if not text/html
-      unless hdr['CONTENT_TYPE'].include?('text/html')
-        @already_crawled.insert(url)
-        lg("Page Not text/html -- #{url}")
-        return false
-      end
-      @connection_tally[:success] += 1
-      true
-    end
-
-    def push_seo_to_data(url, new_page)
+    def push_seo_to_result(url, new_page)
       seos = [url]
       seos.concat(new_page.parse_seo)
       @result.push(seos)
       lg('--page SEO scraped')
     end

-    def
+    def push_files_to_result(new_page)
       filez = new_page.parse_files(new_page.parse_internal)
       @result.concat(filez) unless filez.empty?
       lg("--#{filez.size} files found")
@@ -209,7 +213,7 @@ module Retriever
       if @progress && (@already_crawled.size < @max_pages)
         @progressbar.increment
       end
-      Retriever::Page.new(response, @t)
+      Retriever::Page.new(url, response, @t)
     end

     def new_visitable_links(current_page)
@@ -217,10 +221,16 @@ module Retriever
       current_page.parse_internal_visitable
     end

+    def push_custom_to_result(url, current_page, &block)
+      data = block.call current_page
+      @result.push(data) unless data.empty?
+      lg("-- PageIterator called on: #{url}")
+    end
+
     # send a new wave of GET requests, using current @link_stack
     # at end of the loop it empties link_stack
     # puts new links into temporary stack
-    def process_link_stack
+    def process_link_stack(&block)
       EM.synchrony do
         concurrency = 10
         EM::Synchrony::FiberIterator.new(@link_stack, concurrency).each do |url|
@@ -230,20 +240,19 @@ module Retriever
           next unless good_response?(resp, url)
           current_page = page_from_response(url, resp.response)
           # non-link dependent modes
-
+          push_seo_to_result(url, current_page) if @seo
+          push_custom_to_result(url, current_page, &block) if @iterator
           next unless current_page.links.size > 0
           @temp_link_stack.push(new_visitable_links(current_page))
           # link dependent modes
           next unless @fileharvest
-
+          push_files_to_result(current_page)
         end
         EventMachine.stop
       end
       # empty the stack. most clean way
       @link_stack = []
       # temp contains redirects + new visitable links
-      # we will re-initialize it as empty right after this function
-      # in the parent method 'async crawl and collect'
       @temp_link_stack.flatten.uniq!
     end
   end
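The upshot of this refactor: `Fetch#initialize` now only sets up state (options, the target, the bloom filter), and the crawl itself is kicked off by the new public `start` method, which each subclass calls explicitly after `super` (see fetchfiles.rb and page_iterator.rb below). A minimal sketch of that pattern, using a hypothetical subclass name that is not part of the gem:

```ruby
# Hypothetical subclass for illustration only -- not part of the 1.3.0 release.
module Retriever
  class FetchTitles < Fetch
    def initialize(url, options)
      super   # sets up options, the target and the bloom filter
      start   # new in 1.3.0: crawls page one and builds the link stack
      # once start returns, @page_one and @result are populated and usable
      @result.push(@page_one.title)
    end
  end
end
```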
data/lib/retriever/fetchfiles.rb
CHANGED
@@ -5,6 +5,7 @@ module Retriever
   class FetchFiles < Fetch
     def initialize(url, options)
       super
+      start
       temp_file_collection = @page_one.parse_files(@page_one.parse_internal)
       @result.concat(temp_file_collection) if temp_file_collection.size > 0
       lg("#{@result.size} new files found")
data/lib/retriever/fetchseo.rb
CHANGED
data/lib/retriever/page.rb
CHANGED
@@ -1,5 +1,6 @@
 require 'addressable/uri'
-
+#
+using SourceString
 module Retriever
   #
   class Page
@@ -30,11 +31,12 @@ module Retriever
       wmv|flv|mp3|wav|doc|txt|ico|xml)
     /ix).freeze

-    attr_reader :links, :source, :t
+    attr_reader :links, :source, :t, :url

-    def initialize(source, t)
+    def initialize(url, source, t)
+      @url = url
       @t = t
-      @source = source.
+      @source = source.encode_utf8_and_replace
       @links = nil
     end

@@ -59,24 +61,24 @@ module Retriever
       parse_internal.select { |x| !(NONPAGE_EXT_RE =~ x) }
     end

-    def parse_files(arr)
+    def parse_files(arr = parse_internal)
       arr.select { |x| @t.file_re =~ x }
     end

     def title
-      TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : ''
+      TITLE_RE =~ @source ? @source.match(TITLE_RE)[1].decode_html : ''
     end

     def desc
-      DESC_RE =~ @source ? @source.match(DESC_RE)[1]
+      DESC_RE =~ @source ? @source.match(DESC_RE)[1].decode_html : ''
     end

     def h1
-      H1_RE =~ @source ? @source.match(H1_RE)[1]
+      H1_RE =~ @source ? @source.match(H1_RE)[1].decode_html : ''
     end

     def h2
-      H2_RE =~ @source ? @source.match(H2_RE)[1]
+      H2_RE =~ @source ? @source.match(H2_RE)[1].decode_html : ''
     end

     def parse_seo
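page.rb now relies on a `SourceString` refinement supplied by the new lib/retriever/core_ext.rb (+13 lines in the summary above, but its body is not expanded in this diff) for `encode_utf8_and_replace` and `decode_html`, and the gemspec below adds htmlentities as a runtime dependency to back that up. Purely as a hedged sketch of what such a refinement could look like -- not the actual core_ext.rb:

```ruby
require 'htmlentities'

# Hypothetical reconstruction for illustration; the real core_ext.rb is not shown in this diff.
module SourceString
  refine String do
    # Force the page source into valid UTF-8, replacing bad byte sequences so
    # the regex-based parsing in Retriever::Page cannot raise on invalid input.
    def encode_utf8_and_replace
      encode('UTF-16', invalid: :replace, undef: :replace, replace: ' ').encode('UTF-8')
    end

    # Decode HTML entities (&amp;, &#8217;, ...) via the htmlentities gem.
    def decode_html
      HTMLEntities.new.decode(self)
    end
  end
end
```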
data/lib/retriever/page_iterator.rb
ADDED
@@ -0,0 +1,21 @@
+module Retriever
+  #
+  class PageIterator < Fetch
+    # recieves target url and RR options, and a block
+    # runs the block on all pages during crawl, pushing
+    # the returned value of the block onto a result stack
+    # the complete data returned from the crawl is accessible thru self.result
+    def initialize(url, options, &block)
+      super
+      start
+      fail 'block required for PageIterator' unless block_given?
+      @iterator = true
+      @result.push(block.call @page_one)
+      lg("-- PageIterator crawled- #{url}")
+      async_crawl_and_collect(&block)
+      # done, make sure progress bar says we are done
+      @progressbar.finish if @progress
+      @result.sort_by! { |x| x.length } if @result.size > 1
+    end
+  end
+end
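Given that constructor, using the class mirrors the other Fetch subclasses: pass a URL, an options hash and a block, then read the collected values back from `result`. A small sketch, assuming the `'maxpages'` option and the `#parse_seo` page method documented in the readme below (example.com stands in for a real target):

```ruby
require 'retriever'

# Sketch only: 'maxpages' and page#parse_seo are as documented in the readme below.
opts = { 'maxpages' => 5 }
crawl = Retriever::PageIterator.new('http://www.example.com', opts) do |page|
  # runs once per crawled page; the return value is pushed onto the result stack
  [page.url] + page.parse_seo
end

crawl.result.each { |row| puts row.join(', ') }
```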
data/lib/retriever/version.rb
CHANGED
data/lib/retriever.rb
CHANGED
@@ -1,7 +1,9 @@
+require 'retriever/core_ext'
 require 'retriever/fetch'
 require 'retriever/fetchfiles'
 require 'retriever/fetchsitemap'
 require 'retriever/fetchseo'
+require 'retriever/page_iterator'
 require 'retriever/cli'
 require 'retriever/link'
 require 'retriever/target'
data/readme.md
CHANGED
@@ -25,7 +25,8 @@ Features

 Use cases
 ---------
-RubyRetriever can do multiple things for you.
+RubyRetriever can do multiple things for you. As an Executable
+With a single command at the terminal, RR can:
 1. Crawl your website and output a *valid XML sitemap* based on what it found.
 2. Crawl a target website and *download all files of a given filetype*.
 3. Crawl a target website, *collect important SEO information* such as page titles, meta descriptions and h1 tags, and write it to CSV.
@@ -36,41 +37,44 @@ Getting started
 -----------
 Install the gem
 ```sh
-gem install rubyretriever
+$ gem install rubyretriever
 ```
+

+Using the Executable
+--------------------
 **Example: Sitemap mode**
 ```sh
-rr --sitemap CSV --progress --limit
+$ rr --sitemap CSV --progress --limit 10 http://www.cnet.com
 ```
 OR -- SAME COMMAND
 ```sh
-rr -s csv -p -l
+$ rr -s csv -p -l 10 http://www.cnet.com
 ```

-This would map http://www.cnet.com until it crawled a max of
+This would map http://www.cnet.com until it crawled a max of 10 pages, then write the results to a CSV named cnet. Optionally, you could also use the format XML and RR would output the same URL list into a valid XML sitemap that could be submitted to Google.

 **Example: File Harvesting mode**
 ```sh
-rr --files
+$ rr --files txt --verbose --limit 1 http://textfiles.com/programming/
 ```
 OR -- SAME COMMAND
 ```sh
-rr -f
+$ rr -f txt -v -l 1 http://textfiles.com/programming/
 ```

-This would crawl http://
+This would crawl http://textfiles.com/programming/ looking for txt files for only a single page, then write out a list of filepaths to txt files to the terminal. Optionally, you could have the script autodownload all the files by adding the -a/--auto flag.

 **Example: SEO mode**
 ```sh
-rr --seo --progress --limit
+$ rr --seo --progress --limit 10 --out cnet-seo http://www.cnet.com
 ```
 OR -- SAME COMMAND
 ```sh
-rr -e -p -l 10 -o cnet-seo http://www.cnet.com
+$ rr -e -p -l 10 -o cnet-seo http://www.cnet.com
 ```

-This would go to http://www.cnet.com and crawl a max of
+This would go to http://www.cnet.com and crawl a max of 10 pages, during which it would collect the SEO fields on those pages - this currently means [url, page title, meta description, h1 text, h2 text]. It would then write the fields to a csv named cnet-seo.


 command-line arguments
@@ -89,11 +93,48 @@ and OPTIONS is the applicable:
    -l, --limit PAGE_LIMIT_# *set a max on the total number of crawled pages*
    -h, --help *Display this screen*

+
+Using as a Library (starting as of version 1.3.0 -- yet to be released)
+------------------
+
+If you want to collect something, other than that which the executable allows, on a 'per page' basis then you want to use the PageIterator class. Then you can run whatever block you want against each individual page's source code located during the crawl.
+
+Sample Script using **PageIterator**
+```ruby
+require 'retriever'
+opts = {
+  'maxpages' => 1
+}
+t = Retriever::PageIterator.new('http://www.basecamp.com', opts) do |page|
+  [page.url, page.title]
+end
+puts t.result.to_s
+```
+
+```sh
+>> [["http://www.basecamp.com", "Basecamp is everyone’s favorite project management app."]]
+```
+Available methods on the page iterator:
+* **#url** - returns full URL of current page
+* **#source** - returns raw page source code
+* **#title** - returns html decoded verson of curent page title
+* **#desc** - returns html decoded verson of curent page meta description
+* **#h1** - returns html decoded verson of current page's h1 tag
+* **#h2** - returns html decoded verson of current page's h2 tag
+* **#links** - returns array of all links on the page
+* **#parse_internal** - returns array of current page's internal (same host) links
+* **#parse_internal_visitable** - returns #parse_internal plus added filtering of only links that are visitable
+* **#parse_seo** - returns array of current page's html decoded title, desc, h1 and h2
+* **#parse_files** - returns array of downloaded files of type supplied as RR options (fileharvest options)
+
+
 Current Requirements
 ------------
 em-synchrony
 ruby-progressbar
 bloomfilter-rb
+addressable
+htmlentities

 License
 -------
data/spec/link_spec.rb
CHANGED
@@ -3,7 +3,7 @@ require 'retriever'
 describe 'Link' do

   t = Retriever::Target.new('http://www.cnet.com/reviews/')
-  let(:links) { Retriever::Page.new(@source, t).links }
+  let(:links) { Retriever::Page.new('http://www.cnet.com/reviews/', @source, t).links }

   it 'collects links in anchor tags' do
     @source = (<<SOURCE).strip
data/spec/page_spec.rb
CHANGED
@@ -4,9 +4,18 @@ require 'retriever/fetch'
 t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)

 describe 'Page' do
+  describe '#url' do
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
+    it 'returns current page URL' do
+      @source = (<<SOURCE).strip
+<a href='http://www.cnet.com/'>download</a>
+SOURCE
+      expect(page.url).to eq('http://www.cnet.com/')
+    end
+  end

   describe '#links' do
-    let(:
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
     it 'collects all unique href links on the page' do
       @source = (<<SOURCE).strip
 <a href='www.cnet.com/download.exe'>download</a>
@@ -17,12 +26,12 @@ describe 'Page' do
 <a href='http://www.yahoo.com/test/'>yahoo</a>
 SOURCE

-      expect(links.size).to eq(4)
+      expect(page.links.size).to eq(4)
     end
   end

   describe '#parse_internal' do
-    let(:page) { Retriever::Page.new(@source, t) }
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
     let(:links) { page.parse_internal }
     it 'filters links by host' do
       @source = (<<SOURCE).strip
@@ -35,7 +44,7 @@ SOURCE
   end

   describe '#parse_internal_visitable' do
-    let(:page) { Retriever::Page.new(@source, t) }
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
     let(:links) { page.parse_internal_visitable }
     it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
       @source = (<<SOURCE).strip
@@ -46,7 +55,7 @@ SOURCE
   end

   describe '#parse_files' do
-    let(:page) { Retriever::Page.new(@source, t) }
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
     let(:files) { page.parse_files(page.parse_internal) }
     it 'filters links by filetype' do
       @source = (<<SOURCE).strip
@@ -59,7 +68,7 @@ SOURCE
   end

   describe '#title' do
-    let(:page) { Retriever::Page.new(@source, t) }
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
     it 'returns page title' do
       @source = (<<SOURCE).strip
 <title>test</title>
@@ -68,7 +77,7 @@ SOURCE
     end
   end
   describe '#desc' do
-    let(:page) { Retriever::Page.new(@source, t) }
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
     it 'returns meta description' do
       @source = (<<SOURCE).strip
 <meta name='description' content="test2 ">
@@ -77,7 +86,7 @@ SOURCE
     end
   end
   describe '#h1' do
-    let(:page) { Retriever::Page.new(@source, t) }
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
     it 'returns h1 text' do
       @source = (<<SOURCE).strip
 <h1>test 3</h1>
@@ -86,7 +95,7 @@ SOURCE
     end
   end
   describe '#h2' do
-    let(:page) { Retriever::Page.new(@source, t) }
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
     it 'returns h2 text' do
       @source = (<<SOURCE).strip
 <h2> test 4 </h2>
data/spec/retriever_spec.rb
CHANGED
@@ -1,4 +1,69 @@
 require 'retriever'

 describe 'Fetch' do
+  describe '#good_response?' do
+    let(:r) do
+      Retriever::Fetch.new('http://www.yahoo.com', {})
+    end
+
+    let(:resp) do
+      {}
+    end
+
+    let(:nil_response) do
+      r.good_response?(nil,'http://www.yahoo.com')
+    end
+
+    let(:unsuccessful_resp) do
+      resp.stub(:response_header).and_return(resp)
+      resp.stub(:redirection?).and_return(false)
+      resp.stub(:successful?).and_return(false)
+      resp.stub(:server_error?).and_return(false)
+      resp.stub(:client_error?).and_return(false)
+      r.good_response?(resp,'http://www.yahoo.com')
+    end
+
+    let(:redir_resp) do
+      resp.stub(:response_header).and_return(resp)
+      resp.stub(:redirection?).and_return(true)
+      resp.stub(:location).and_return('http://www.google.com')
+      r.good_response?(resp,'http://www.yahoo.com')
+    end
+
+    let(:bad_content_type_resp) do
+      resp.stub(:response_header).and_return(resp)
+      resp.stub(:redirection?).and_return(false)
+      resp.stub(:successful?).and_return(true)
+      resp['CONTENT_TYPE'] = 'image/jpeg'
+      r.good_response?(resp,'http://www.yahoo.com')
+    end
+
+    let(:success_resp) do
+      resp.stub(:response_header).and_return(resp)
+      resp.stub(:redirection?).and_return(false)
+      resp.stub(:successful?).and_return(true)
+      resp['CONTENT_TYPE'] = 'text/html'
+      r.good_response?(resp,'http://www.yahoo.com')
+    end
+
+    it 'returns false if the response is empty' do
+      expect(nil_response).to eq(false)
+    end
+
+    it 'returns false on unsuccessful connection' do
+      expect(unsuccessful_resp).to eq(false)
+    end
+
+    it 'returns false on redirecting host' do
+      expect(redir_resp).to eq(false)
+    end
+
+    it 'returns false on non-visitable content type' do
+      expect(bad_content_type_resp).to eq(false)
+    end
+
+    it 'returns true otherwise' do
+      expect(success_resp).to eq(true)
+    end
+  end
 end
data/spec/target_spec.rb
CHANGED
@@ -1,9 +1,10 @@
 require 'retriever'
 require 'open-uri'

-t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)
-
 describe 'Target' do
+  let(:t) do
+    Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)
+  end

   it 'creates target var' do
     expect(t.target).to eq('http://www.cnet.com/reviews/')
@@ -30,6 +31,9 @@ describe 'Target' do
   end

   describe '#source' do
+    let(:redirecting_url) do
+      Retriever::Target.new('http://software-by-joe.appspot.com').source
+    end

     it 'opens URL and returns source as String' do
       expect(Retriever::Target.new('http://techcrunch.com/').source.class)
@@ -37,8 +41,7 @@ describe 'Target' do
     end

     it 'fails if target redirects to new host' do
-      expect {
-        .to raise_error
+      expect { redirecting_url }.to raise_error
     end
   end
 end
metadata
CHANGED
@@ -1,125 +1,139 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 1.
+  version: 1.3.0
 platform: ruby
 authors:
 - Joe Norton
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-06-
+date: 2014-06-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: em-synchrony
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
 - !ruby/object:Gem::Dependency
   name: em-http-request
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: ruby-progressbar
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: bloomfilter-rb
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: addressable
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: htmlentities
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '1.6'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '1.6'
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '10.3'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '10.3'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '2.14'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '2.14'
 description: Asynchronous web crawler, scraper and file harvester
@@ -134,6 +148,7 @@ files:
 - bin/rr
 - lib/retriever.rb
 - lib/retriever/cli.rb
+- lib/retriever/core_ext.rb
 - lib/retriever/fetch.rb
 - lib/retriever/fetchfiles.rb
 - lib/retriever/fetchseo.rb
@@ -141,6 +156,7 @@ files:
 - lib/retriever/link.rb
 - lib/retriever/openuri_redirect_patch.rb
 - lib/retriever/page.rb
+- lib/retriever/page_iterator.rb
 - lib/retriever/target.rb
 - lib/retriever/version.rb
 - readme.md
@@ -159,17 +175,17 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - -
+  - - ">="
   - !ruby/object:Gem::Version
-    version:
+    version: 2.0.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - -
+  - - ">="
   - !ruby/object:Gem::Version
     version: 1.3.6
 requirements: []
 rubyforge_project: rubyretriever
-rubygems_version: 2.
+rubygems_version: 2.3.0
 signing_key:
 specification_version: 4
 summary: Ruby Web Crawler & File Harvester