rubyretriever 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/rr +51 -48
- data/lib/retriever/cli.rb +11 -7
- data/lib/retriever/fetch.rb +134 -105
- data/lib/retriever/fetchfiles.rb +32 -34
- data/lib/retriever/fetchseo.rb +3 -11
- data/lib/retriever/fetchsitemap.rb +19 -18
- data/lib/retriever/link.rb +17 -15
- data/lib/retriever/{openuri-redirect-patch.rb → openuri_redirect_patch.rb} +2 -1
- data/lib/retriever/page.rb +35 -16
- data/lib/retriever/target.rb +15 -13
- data/lib/retriever/version.rb +3 -2
- data/lib/retriever.rb +1 -1
- data/readme.md +19 -5
- data/spec/link_spec.rb +37 -35
- data/spec/page_spec.rb +48 -44
- data/spec/retriever_spec.rb +2 -3
- data/spec/target_spec.rb +28 -28
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 79f0b251e367f085f7b84dd83a10f6a1dfcddd3c
+  data.tar.gz: 0e9b6bc8f66b9efd14921d8f0fc5fddc45042b5f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: fe1a6c8e118378513c4a4e72adeccc94e212fd5e0d4244f56240830155e42f7b7bde80acdfabbc9fe9ee5b46687bc33d089d73c57690beadee3d04804a9435ac
+  data.tar.gz: 4e6bec31d3416293f2fb72b39cfab5602692a2b9d94cbfb6fba9a653afc6bf718e5eebf84b4a5745d01d2e55959df4e489342b0deca2060e2653bb5b31f4731e
data/bin/rr
CHANGED
@@ -1,57 +1,58 @@
 #! /usr/bin/env ruby
 require 'retriever'
 require 'optparse'

 options = {}
+optparse = OptionParser.new do |opts|
+  # Set a banner, displayed at the top
+  # of the help screen.
+  opts.banner = 'Usage: rr [MODE FLAG] [options] Target_URL'
+  options['sitemap'] = false
+  opts.on('-s', '--sitemap [FORMAT]', 'MODE FLAG: Sitemap mode') do |type|
+    options['sitemap'] = type || ''
+  end
+  options['fileharvest'] = false
+  opts.on('-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode') do |file_e|
+    options['fileharvest'] = file_e
+  end
+  options['seo'] = false
+  opts.on('-e', '--seo', 'MODE FLAG: SEO mode') do
+    options['seo'] = true
+  end
+  options['filename'] = nil
+  opts.on('-o', '--out FILENAME', 'Dump output to file') do |file|
+    options['filename'] = file
+  end
+  # Define the options, and what they do
+  options['verbose'] = false
+  opts.on('-v', '--verbose', 'Output more information') do
+    options['verbose'] = true
+  end
+  options['progress'] = false
+  opts.on('-p', '--progress', 'Output progress bar') do
+    options['progress'] = true
+  end
+  options['maxpages'] = false
+  opts.on('-l',
+          '--limit PAGE_LIMIT_#',
+          'set a max on the total number of crawled pages') do |maxp|
+    options['maxpages'] = maxp
+  end
+  options['autodown'] = false
+  opts.on('-a', '--auto', 'Automatically download all files located') do
+    options['autodown'] = true
+  end
+  # This displays the help screen, all programs are
+  # assumed to have this option.
+  opts.on('-h', '--help', 'Display this screen') do
+    puts opts
+    exit
+  end
+end

 optparse.parse!
 if ARGV[0].nil?
-  abort(
+  abort("###Missing Required Argument\nUsage: rr [mode] [options] Target_URL")
 end

 ARGV.each do|q|
@@ -61,9 +62,11 @@ ARGV.each do|q|
   puts '### Creating Sitemap' if options['sitemap']
   puts "### Outputting in format: #{options['sitemap']}" if options['sitemap']
   puts '### Performing File Harvest' if options['fileharvest']
+  if options['fileharvest']
+    puts "### Searching for filetype: #{options['fileharvest']}"
+  end
   puts '### Performing SEO Scrape' if options['seo']
-  puts "### Writing
+  puts "### Writing to file: #{options['filename']}" if options['filename']
   puts '### Being verbose'
   puts "### Stopping after #{options['maxpages']} pages"
 end
data/lib/retriever/cli.rb
CHANGED
@@ -3,19 +3,23 @@ module Retriever
   class CLI
     def initialize(url, options)
       # kick off the fetch mode of choice
+      @fetch = choose_fetch_mode(url, options)
+      @fetch.dump
+      @fetch.write if options['filename']
+      @fetch.autodownload if options['autodown'] && options['fileharvest']
+      @fetch.gen_xml if /XML/i =~ options['sitemap'].to_s
+    end
+
+    def choose_fetch_mode(url, options)
       if options['fileharvest']
+        Retriever::FetchFiles.new(url, options)
       elsif options['sitemap']
+        Retriever::FetchSitemap.new(url, options)
       elsif options['seo']
+        Retriever::FetchSEO.new(url, options)
       else
         fail '### Error: No Mode Selected'
       end
-      @fetch.dump
-      @fetch.write if options['filename']
-      @fetch.autodownload if options['autodown'] && options['fileharvest']
-      @fetch.gen_xml if /XML/i =~ options['sitemap'].to_s
     end
   end
 end
data/lib/retriever/fetch.rb
CHANGED
@@ -9,56 +9,27 @@ require 'bloomfilter-rb'
 module Retriever
   #
   class Fetch
+    HR = '###############################'
     attr_reader :max_pages, :t
     # given target URL and RR options, creates a fetch object.
     # There is no direct output
     # this is a parent class that the other fetch classes build off of.
     def initialize(url, options)
+      @data = []
       @connection_tally = {
+        success: 0,
+        error: 0,
+        error_client: 0,
+        error_server: 0
       }
-      @max_pages = options['maxpages'] ? options['maxpages'].to_i : 100
-      @v = options['verbose']
-      @output = options['filename']
-      @fh = options['fileharvest']
-      @file_ext = @fh.to_s
-      @s = options['sitemap']
-      @seo = options['seo']
-      @autodown = options['autodown']
-      #
-      if @fh
-        temp_ext_str = '.' + @file_ext + '\z'
-        @file_re = Regexp.new(temp_ext_str).freeze
-      else
-        # when FH is not true, and autodown is true
-        errlog('Cannot AUTODOWNLOAD when not in FILEHARVEST MODE') if @autodown
-      end
-      if @prgrss
-        # verbose & progressbar conflict
-        errlog('CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p') if @v
-        prgress_vars = {
-          :title => 'Pages',
-          :starting_at => 1,
-          :total => @max_pages,
-          :format => '%a |%b>%i| %c/%C %t'
-        }
-        @progressbar = ProgressBar.create(prgress_vars)
-      end
+      setup_options(options)
+      setup_progress_bar if @progress
       @t = Retriever::Target.new(url, @file_re)
-      @output = "rr-#{@t.host.split('.')[1]}" if @
-      @already_crawled =
-        :bucket => 8,
-        :raise => false
-      )
-      @already_crawled.insert(@t.target)
+      @output = "rr-#{@t.host.split('.')[1]}" if @fileharvest && !@output
+      @already_crawled = setup_bloom_filter
+      @page_one = crawl_page_one
+      @link_stack = create_link_stack
+      @temp_link_stack = []
     end

     def errlog(msg)
@@ -66,35 +37,26 @@ module Retriever
     end

     def lg(msg)
-      puts "### #{msg}" if @
+      puts "### #{msg}" if @verbose
     end

     # prints current data collection to STDOUT
     def dump
-      puts
-      if @
-        puts '
-        puts "#{@t.target} Sitemap"
-        puts "Page Count: #{@data.size}"
-      elsif @fh
-        puts "Target URL: #{@t.target}"
-        puts "Filetype: #{@file_ext}"
-        puts "File Count: #{@data.size}"
+      puts HR
+      puts "Connection Tally:\n#{@connection_tally}\n#{HR}" if @verbose
+      puts "Target URL: #{@t.target}"
+      if @sitemap
+        puts 'Sitemap'
+      elsif @fileharvest
+        puts "File harvest by type: #{@fileharvest}"
       elsif @seo
-        puts
-        puts "Page Count: #{@data.size}"
-      else
-        fail 'ERROR - Cannot dump - Mode Not Found'
+        puts 'SEO Metrics'
       end
-      puts
+      puts "Data Dump -- Object Count: #{@data.size}"
+      puts HR
       @data.each do |line|
         puts line
       end
-      puts '###############################'
       puts
     end

@@ -111,34 +73,90 @@ module Retriever
           csv << entry
         end
       end
-      puts
+      puts HR
       puts "File Created: #{@output}.csv"
       puts "Object Count: #{@data.size}"
-      puts
+      puts HR
       puts
     end

+    private
+
+    def setup_options(options)
+      @progress = options['progress']
+      @max_pages = options['maxpages'] ? options['maxpages'].to_i : 100
+      @verbose = options['verbose']
+      @output = options['filename']
+      @fileharvest = options['fileharvest']
+      @sitemap = options['sitemap']
+      @seo = options['seo']
+      @autodown = options['autodown']
+      @file_re = Regexp.new(".#{@fileharvest}\z").freeze if @fileharvest
+    end
+
+    def setup_bloom_filter
+      already_crawled = BloomFilter::Native.new(
+        size: 1_000_000,
+        hashes: 5,
+        seed: 1,
+        bucket: 8,
+        raise: false
+      )
+      already_crawled.insert(@t.target)
+      already_crawled
+    end
+
+    def setup_progress_bar
+      # verbose & progressbar conflict
+      errlog('CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME') if @verbose
+      prgress_vars = {
+        title: 'Pages',
+        starting_at: 1,
+        total: @max_pages,
+        format: '%a |%b>%i| %c/%C %t'
+      }
+      @progressbar = ProgressBar.create(prgress_vars)
+    end
+
+    def crawl_page_one
+      page_one = Retriever::Page.new(@t.source, @t)
+      lg("URL Crawled: #{@t.target}")
+      page_one
+    end
+
+    def create_link_stack
+      link_stack = @page_one.parse_internal_visitable
+      errlog("Bad URL -- #{@t.target}") unless link_stack
+      lg("#{link_stack.size - 1} links found")
+      link_stack.delete(@t.target)
+      link_stack.take(@max_pages) if (link_stack.size + 1) > @max_pages
+      link_stack
+    end
+
+    def end_crawl_notice
+      notice = "#{HR}\nENDING CRAWL\nCan't find any more links."
+      @progressbar.log(notice) if @progress
+      lg(notice)
+    end
+
     # iterates over the existing @link_stack
     # running until we reach the @max_pages value.
     def async_crawl_and_collect
       while @already_crawled.size < @max_pages
         if @link_stack.empty?
-          @progressbar.log("Can't find any more links.")
-          else
-          lg("Can't find any more links.")
-          end
+          end_crawl_notice
           break
         end
         new_links_arr = process_link_stack
+        @temp_link_stack = []
         next if new_links_arr.nil? || new_links_arr.empty?
         # set operations to see are these in our previous visited pages arr
-        @link_stack.concat(new_links_arr)
+        next if new_links_arr.empty?
+        @link_stack.concat(new_links_arr)
+        next unless @sitemap
+        @data.concat(new_links_arr)
       end
-      @progressbar.finish if @prgrss
+      @data.uniq!
     end

     # returns true is resp is ok to continue
@@ -149,8 +167,8 @@ module Retriever
       loc = hdr.location
       lg("#{url} Redirected to #{loc}")
       if t.host_re =~ loc
-        lg('--Added to
+        @temp_link_stack.push(loc) unless @already_crawled.include?(loc)
+        lg('--Added to stack for later')
         return false
       end
       lg("Redirection outside of target host. No - go. #{loc}")
@@ -159,7 +177,6 @@ module Retriever
       # lets not continue if unsuccessful connection
       unless hdr.successful?
         lg("UNSUCCESSFUL CONNECTION -- #{url}")
         @connection_tally[:error] += 1
         @connection_tally[:error_server] += 1 if hdr.server_error?
         @connection_tally[:error_client] += 1 if hdr.client_error?
@@ -168,7 +185,6 @@ module Retriever
       # let's not continue if not text/html
       unless hdr['CONTENT_TYPE'].include?('text/html')
         @already_crawled.insert(url)
-        @link_stack.delete(url)
         lg("Page Not text/html -- #{url}")
         return false
       end
@@ -176,45 +192,58 @@ module Retriever
       true
     end

+    def push_seo_to_data(url, new_page)
+      seos = [url]
+      seos.concat(new_page.parse_seo)
+      @data.push(seos)
+      lg('--page SEO scraped')
+    end
+
+    def push_files_to_data(new_page)
+      filez = new_page.parse_files(new_page.parse_internal)
+      @data.concat(filez) unless filez.empty?
+      lg("--#{filez.size} files found")
+    end
+
+    def page_from_response(url, response)
+      lg("Page Fetched: #{url}")
+      @already_crawled.insert(url)
+      if @progress && (@already_crawled.size < @max_pages)
+        @progressbar.increment
+      end
+      Retriever::Page.new(response, @t)
+    end
+
+    def new_visitable_links(current_page)
+      lg("--#{current_page.links.size} links found")
+      current_page.parse_internal_visitable
+    end
+
     # send a new wave of GET requests, using current @link_stack
+    # at end of the loop it empties link_stack
+    # puts new links into temporary stack
     def process_link_stack
-      new_stuff = []
       EM.synchrony do
         concurrency = 10
         EM::Synchrony::FiberIterator.new(@link_stack, concurrency).each do |url|
           next if @already_crawled.size >= @max_pages
           next if @already_crawled.include?(url)
           resp = EventMachine::HttpRequest.new(url).get
           next unless good_response?(resp, url)
-          seos = [url]
-          seos.concat(new_page.parse_seo)
-          @data.push(seos)
-          lg('--page SEO scraped')
-          end
-          next if new_page.links.size == 0
-          lg("--#{new_page.links.size} links found")
-          internal_links_arr = new_page.parse_internal_visitable
-          new_stuff.push(internal_links_arr)
-          if @fh
-            filez = new_page.parse_files
-            @data.concat(filez) unless filez.empty?
-            lg("--#{filez.size} files found")
-          end
+          current_page = page_from_response(url, resp.response)
+          # non-link dependent modes
+          push_seo_to_data(url, current_page) if @seo
+          next unless current_page.links.size > 0
+          @temp_link_stack.push(new_visitable_links(current_page))
+          # link dependent modes
+          next unless @fileharvest
+          push_files_to_data(current_page)
         end
-        new_stuff = new_stuff.flatten # all completed requests
         EventMachine.stop
       end
+      # empty the stack. most clean way
+      @link_stack = []
+      @temp_link_stack.flatten.uniq!
     end
   end
 end
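
Aside: the dedupe logic in setup_bloom_filter above uses bloomfilter-rb's native filter. A self-contained sketch of the same pattern (constructor arguments copied from the diff; the example URLs are hypothetical):

    require 'bloomfilter-rb'

    # O(1), memory-bounded membership checks; the trade-off is a small chance
    # of a false positive, i.e. a URL wrongly reported as already crawled.
    already_crawled = BloomFilter::Native.new(
      size: 1_000_000, hashes: 5, seed: 1, bucket: 8, raise: false
    )
    already_crawled.insert('http://example.com/')
    already_crawled.include?('http://example.com/')      # => true
    already_crawled.include?('http://example.com/other') # => false (almost always)
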
data/lib/retriever/fetchfiles.rb
CHANGED
@@ -5,29 +5,21 @@ module Retriever
   class FetchFiles < Fetch
     def initialize(url, options)
       super
-      page_one = Retriever::Page.new(@t.source, @t)
-      @link_stack = page_one.parse_internal_visitable
-      lg("URL Crawled: #{@t.target}")
-      lg("#{@link_stack.size - 1} new links found")
-      temp_file_collection = page_one.parse_files
+      temp_file_collection = @page_one.parse_files(@page_one.parse_internal)
       @data.concat(tempFileCollection) if temp_file_collection.size > 0
       lg("#{@data.size} new files found")
-      errlog("Bad URL -- #{@t.target}") unless @link_stack
-      @link_stack.delete(@t.target)

       async_crawl_and_collect
+      # done, make sure progress bar says we are done
+      @progressbar.finish if @progress
       @data.sort_by! { |x| x.length }
-      @data.uniq!
     end

     def download_file(path)
       # given valid url, downloads file to current directory in /rr-downloads/
       arr = path.split('/')
       shortname = arr.pop
-      puts "Initiating Download
+      puts "Initiating Download of: #{shortname}"
       File.open(shortname, 'wb') do |saved_file|
         open(path) do |read_file|
           saved_file.write(read_file.read)
@@ -38,33 +30,39 @@ module Retriever

     def autodownload
       # go through the fetched file URL collection and download each one.
-      puts '###################'
+      puts HR
       puts '### Initiating Autodownload...'
-      puts
-      puts "#{
-      puts
-      @data.
+      puts HR
+      puts "#{@data.count} - #{@file_ext}'s Located"
+      puts HR
+      move_to_download_dir
+      iterate_thru_collection_and_download
+      Dir.chdir('..')
+    end
+
+    private
+
+    def iterate_thru_collection_and_download
+      lenn = @data.count
+      @data.each_with_index do |entry, i|
         begin
           download_file(entry)
-          puts
-        rescue StandardError => e
-          puts 'ERROR: failed to download - #{entry}'
-          puts e.message
-          puts
+        rescue StandardError
+          puts "ERROR: failed to download - #{entry}"
         end
+        lg(" File [#{i + 1} of #{lenn}]\n")
       end
+    end
+
+    def move_to_download_dir(dir_name = 'rr-downloads')
+      if File.directory?(dir_name)
+        Dir.chdir(dir_name)
+      else
+        puts "creating #{dir_name} Directory"
+        Dir.mkdir(dir_name)
+        Dir.chdir(dir_name)
+      end
+      puts "Downloading files to local directory: '/#{dir_name}/'"
     end
   end
 end
data/lib/retriever/fetchseo.rb
CHANGED
@@ -6,19 +6,11 @@ module Retriever
     # on all unique pages found on the site
     def initialize(url, options)
       super
-      @data
-      page_one = Retriever::Page.new(@t.source, @t)
-      lg("URL Crawled: #{@t.target}")
-      @link_stack = page_one.parse_internal_visitable
-      errlog("Bad URL -- #{@t.target}") unless @link_stack
-      lg("#{@link_stack.size - 1} links found")
-      @link_stack.delete(@t.target)
-      @data.push(page_one.parse_seo)
+      @data.push(@page_one.parse_seo)

       async_crawl_and_collect
+      # done, make sure progress bar says we are done
+      @progressbar.finish if @progress
       @data.sort_by! { |x| x[0].length }
     end
   end
data/lib/retriever/fetchsitemap.rb
CHANGED
@@ -5,37 +5,38 @@ module Retriever
     # returns an array of all unique pages found on the site
     def initialize(url, options)
       super
-      @data
-      page_one = Retriever::Page.new(@t.source, @t)
-      lg("URL Crawled: #{@t.target}")
-      @link_stack = page_one.parse_internal_visitable
-      errlog("Bad URL -- #{@t.target}") unless @link_stack
-      lg("#{@link_stack.size - 1} links found")
-      @link_stack.delete(@t.target)
+      @data.push(@t.target)
       @data.concat(@link_stack)

       async_crawl_and_collect
+      # done, make sure progress bar says we are done
+      @progressbar.finish if @progress
       @data.sort_by! { |x| x.length } if @data.size > 1
       @data.uniq!
     end

+    private
+
     # produces valid XML sitemap based on page collection fetched.
     # Writes to current directory.
     def gen_xml
-      f
+      filename = @t.host.split('.')[1]
+      f = File.open("sitemap-#{filename}.xml", 'w+')
+      f << "<?xml version='1.0' encoding='UTF-8'?>"
+      f << "<urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
+      @data.each do |url|
+        f << "<url><loc>#{url}</loc></url>"
+      end
       f << '</urlset>'
       f.close
+      print_file_info(filename)
+    end
+
+    def print_file_info(filename)
+      puts HR
+      puts "File Created: sitemap-#{filename}.xml"
       puts "Object Count: #{@data.size}"
-      puts
-      puts
+      puts HR + "\n"
     end
   end
 end
data/lib/retriever/link.rb
CHANGED
@@ -1,33 +1,35 @@
+require 'addressable/uri'
 module Retriever
   #
   class Link
-    HTTP_RE = Regexp.new(/^http/i).freeze
-    DOUBLE_SLASH_RE = Regexp.new(
+    # HTTP_RE = Regexp.new(/^http/i).freeze
+    SLASH_RE = Regexp.new(%r(^/{1}[^/])).freeze
+    DOUBLE_SLASH_RE = Regexp.new(%r(^/{2}[^/])).freeze
+    WWW_DOT_RE = Regexp.new(/^www\./i).freeze
+
+    def initialize(target_scheme, target_host, this_link)
+      @link_uri = Addressable::URI.parse(this_link)
+      @scheme = target_scheme
+      @host = target_host
+      @this_link = @link_uri.to_s
     end

     def path
-      return
+      return this_link if link_uri.absolute?

-      return "
+      return "#{@scheme}://#{this_link}" if WWW_DOT_RE =~ this_link

-      return "
+      return "#{@scheme}://#{host}#{this_link}" if SLASH_RE =~ this_link

       # link begins with '//'
-      return "
+      return "#{@scheme}:#{this_link}" if DOUBLE_SLASH_RE =~ this_link

       # link uses relative path with no slashes at all
-      return "
+      return "#{@scheme}://#{host}/#{this_link}" if link_uri.relative?
     end

     private

-    attr_reader :host, :
+    attr_reader :this_link, :host, :link_uri
   end
 end
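
To make the resolution rules above concrete, a quick sketch of the new scheme-aware Link (inputs and expected outputs mirror link_spec.rb further down):

    require 'retriever'

    Retriever::Link.new('http', 'www.cnet.com', '/test.html').path
    # => "http://www.cnet.com/test.html"     (leading-slash path, SLASH_RE)
    Retriever::Link.new('http', 'www.cnet.com', 'www.cnet.com/download.exe').path
    # => "http://www.cnet.com/download.exe"  (protocol-less www. link, WWW_DOT_RE)
    Retriever::Link.new('http', 'www.cnet.com', 'cpage_18').path
    # => "http://www.cnet.com/cpage_18"      (bare relative path)
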
data/lib/retriever/{openuri-redirect-patch.rb → openuri_redirect_patch.rb}
CHANGED
@@ -1,6 +1,7 @@
+#
 module OpenURI
   # nesc patch otherwise OPENURI blocks redirects to and from https
-  def
+  def self.redirectable?(uri1, uri2)
     uri1.scheme.downcase == uri2.scheme.downcase ||
       (/\A(?:http|https)\z/i =~ uri1.scheme && /\A(?:http|https)\z/i =~ uri2.scheme)
   end
data/lib/retriever/page.rb
CHANGED
@@ -1,21 +1,40 @@
+require 'addressable/uri'
+
 module Retriever
   #
   class Page
+    HTTP_RE = Regexp.new(/^http/i).freeze
+    H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
+    H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze
+    TITLE_RE = Regexp.new(/<title>(.*)<\/title>/i).freeze
+    DESC_RE = Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\']
+                          [^>]*content=[\"]
+                          (
+                          [^\"]*
+                          )
+                          [\"]
+                          [^>]
+                          *>
+                          /ix).freeze
+    HREF_CONTENTS_RE = Regexp.new(/\shref=
+                          ['|"]
+                          (
+                          [^\s]
+                          [a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+
+                          )
+                          ['|"]
+                          [\s|\W]
+                          /ix).freeze
+    NONPAGE_EXT_RE = Regexp.new(/\.
+                          (?:css|js|png|gif|jpg|mp4|
+                          wmv|flv|mp3|wav|doc|txt|ico|xml)
+                          /ix).freeze

     attr_reader :links, :source, :t

     def initialize(source, t)
       @t = t
-      @source = source.encode('UTF-8', :
+      @source = source.encode('UTF-8', invalid: :replace, undef: :replace)
       @links = nil
     end

@@ -28,20 +47,20 @@ module Retriever
       # filter some malformed URLS that come in
       # meant to be a loose filter to catch all reasonable HREF attributes.
         link = match[0]
-        Link.new(@t.host, link).path
-      end.uniq
+        Link.new(@t.scheme, @t.host, link).path
+      end.compact.uniq
     end

     def parse_internal
-      links.select { |
+      links.select { |x| @t.host == Addressable::URI.parse(x).host }
     end

     def parse_internal_visitable
-      parse_internal.select { |
+      parse_internal.select { |x| !(NONPAGE_EXT_RE =~ x) }
     end

-    def parse_files
+    def parse_files(arr)
+      arr.select { |x| @t.file_re =~ x }
     end

     def title
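
For orientation, a short usage sketch of the Page API as exercised by page_spec.rb below (the source string and target are illustrative):

    require 'retriever'

    t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)
    page = Retriever::Page.new("<a href='/test.html'>test</a> " \
                               "<a href='www.cnet.com/download.exe'>dl</a>", t)
    page.links                            # hrefs matched by HREF_CONTENTS_RE, resolved via Link#path
    page.parse_internal                   # links whose host equals the target host
    page.parse_internal_visitable         # same, minus NONPAGE_EXT_RE filetypes (css, js, images, ...)
    page.parse_files(page.parse_internal) # links matching the target's file_re (here .exe)
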
data/lib/retriever/target.rb
CHANGED
@@ -1,21 +1,22 @@
 require 'open-uri'
+require 'addressable/uri'

 module Retriever
   #
   class Target
-    HTTP_RE
-    DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+    HTTP_RE = Regexp.new(/^http/i).freeze

-    attr_reader :host, :target, :host_re, :source, :file_re
+    attr_reader :host, :target, :host_re, :source, :file_re, :scheme

     def initialize(url, file_re = nil)
-      @target
-      @host
-      @host_re
-      @file_re
+      fail 'Bad URL' unless url.include?('.')
+      url = "http://#{url}" unless HTTP_RE =~ url
+      target_uri = Addressable::URI.parse(url)
+      @target = target_uri.to_s
+      @host = target_uri.host
+      @host_re = Regexp.new(@host.sub('www.', ''))
+      @file_re ||= file_re
+      @scheme = target_uri.scheme
     end

     def source
@@ -31,13 +32,14 @@ module Retriever
       fail 'Domain is not working. Try the non-WWW version.' if resp == ''
       fail 'Domain not working. Try HTTPS???' unless resp
       # consider using scrub from ruby 2.1? this misses some things
-      resp.encode('UTF-8', 'binary', :
+      resp.encode('UTF-8', 'binary', invalid: :replace, undef: :replace)
     end

     def resync_target_and_return_source(url)
-      new_t
+      new_t = Retriever::Target.new(url)
       @target = new_t.target
-      @host
+      @host = new_t.host
+      @scheme = new_t.scheme
       new_t.source
     end
   end
data/lib/retriever/version.rb
CHANGED
data/lib/retriever.rb
CHANGED
data/readme.md
CHANGED
@@ -4,15 +4,29 @@

 By Joe Norton

-RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader.
+RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader.

-RubyRetriever (RR) uses asynchronous HTTP requests, thanks to [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony), to crawl webpages *very quickly*. Another neat thing about RR, is it uses a ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of page's it has already crawled.
-**Use at Own Risk**
-RR does NOT respect robots.txt, and RR currently - by default - launches up to 10 parallel GET requests at once. This is a feature, do not abuse it.
+RubyRetriever (RR) uses asynchronous HTTP requests, thanks to [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony), to crawl webpages *very quickly*. Another neat thing about RR, is it uses a ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of page's it has already crawled.

 **v1.0 Update (6/07/2014)** - Includes major code changes, a lot of bug fixes. Much better in dealing with redirects, and issues with the host changing, etc. Also, added the SEO mode -- which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility -- and thus, this was update 1.0!
+mission
+-------
+RubyRetriever aims to be the best command-line crawling, and scraping package written in Ruby.
+
+features
+--------
+* Asynchronous HTTP Requests thru EM & Synchrony
+* Bloom filter for tracking pages visited.
+* 3 CLI modes: 1) Sitemap, 2) File Harvest, 3) SEO
+
+use-cases
+---------
+RubyRetriever can do multiple things for you, with a single command at the terminal RR can:
+1. Crawl your website and output a *valid XML sitemap* based on what it found.
+2. Crawl a target website and *download all files of a given filetype*.
+3. Crawl a target website and *collect important SEO information* such as page titles, meta descriptions, h1 tags, etc. and write it to CSV.

+Help & Forks Welcome!

 getting started
 -----------
data/spec/link_spec.rb
CHANGED
@@ -1,66 +1,68 @@
 require 'retriever'

-describe
+describe 'Link' do

+  t = Retriever::Target.new('http://www.cnet.com/reviews/')
+  let(:links) { Retriever::Page.new(@source, t).links }

-    <a href='http://www.cnet.com/download.exe'>download</a>
+  it 'collects links in anchor tags' do
+    @source = (<<SOURCE).strip
+    <a href='http://www.cnet.com/download.exe'>download</a>
 SOURCE

+    expect(links).to include('http://www.cnet.com/download.exe')
+  end

+  it 'collects links in link tags' do
+    @source = (<<SOURCE).strip
+    <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
 SOURCE

+    expect(links[0]).to include('formreset.css?ver=1.7.12')
+  end

+  it 'does not collect bare links (ones not in an href)' do
+    @source = (<<SOURCE).strip
     http://www.google.com
 SOURCE

+    expect(links).to_not include('http://www.google.com')
+  end

+  it 'collects only unique href links on the page' do
+    @source = (<<SOURCE).strip
     <a href='http://www.cnet.com/products/gadgets'>gadgets</a>
     <a href='http://www.cnet.com/products/gadgets'>gadgets2</a>
 SOURCE

+    expect(links.size).to eq(1)
+  end

+  it 'adds a protocol to urls missing them (www.)' do
+    @source = (<<SOURCE).strip
     <a href='www.cnet.com/download.exe'>download</a>
 SOURCE

+    expect(links).to include('http://www.cnet.com/download.exe')
+  end

+  it "doesn't care about any extra attributes on the anchor tag" do
+    @source = (<<SOURCE).strip
     <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
-    <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'
+    <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
+    </a>
 SOURCE

+    expect(links.size).to eq(1)
+  end

+  it 'returns relative urls with full path based on hostname' do
+    @source = (<<SOURCE).strip
     <a href='/test.html'>test</a>
     <a href='cpage_18'>about</a>
 SOURCE

-end
+    expect(links).to include('http://www.cnet.com/test.html',
+                             'http://www.cnet.com/cpage_18')
+  end
+end
data/spec/page_spec.rb
CHANGED
@@ -1,93 +1,97 @@
 require 'retriever/page'
 require 'retriever/fetch'

-t = Retriever::Target.new(
+t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)

-describe
+describe 'Page' do

-  describe
-    let
-    it
+  describe '#links' do
+    let(:links) { Retriever::Page.new(@source, t).links }
+    it 'collects all unique href links on the page' do
+      @source = (<<SOURCE).strip
       <a href='www.cnet.com/download.exe'>download</a>
       <a href='/test.html'>test</a>
-      <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'
+      <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
+      </a>
       <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
-      <a href='http://www.yahoo.com/test/'>yahoo</a>
+      <a href='http://www.yahoo.com/test/'>yahoo</a>
 SOURCE

       expect(links.size).to eq(4)
     end
   end

-  describe
-    let
+  describe '#parse_internal' do
+    let(:page) { Retriever::Page.new(@source, t) }
+    let(:links) { page.parse_internal }
+    it 'filters links by host' do
+      @source = (<<SOURCE).strip
       <a href='http://www.cnet.com/'>download</a>
+      <a href='http://www.yahoo.com/test/'>yahoo</a>
 SOURCE

+      expect(links.size).to eq(1)
     end
   end

-  describe
-    let
+  describe '#parse_internal_visitable' do
+    let(:page) { Retriever::Page.new(@source, t) }
+    let(:links) { page.parse_internal_visitable }
     it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
+      @source = (<<SOURCE).strip
       <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
 SOURCE
+      expect(links.size).to eq(0)
     end
   end

-  describe
-    let
+  describe '#parse_files' do
+    let(:page) { Retriever::Page.new(@source, t) }
+    let(:files) { page.parse_files(page.parse_internal) }
+    it 'filters links by filetype' do
+      @source = (<<SOURCE).strip
       <a href='www.cnet.com/download.exe'>download</a>
-      http://www.google.com
+      http://www.google.com
       <a href='/test.html'>test</a>
 SOURCE
+      expect(files.size).to eq(1)
     end
   end

-    it
+  describe '#title' do
+    let(:page) { Retriever::Page.new(@source, t) }
+    it 'returns page title' do
+      @source = (<<SOURCE).strip
       <title>test</title>
 SOURCE
+      expect(page.title).to eq('test')
     end
   end
-    it
+  describe '#desc' do
+    let(:page) { Retriever::Page.new(@source, t) }
+    it 'returns meta description' do
+      @source = (<<SOURCE).strip
       <meta name='description' content="test2 ">
 SOURCE
+      expect(page.desc).to eq('test2 ')
     end
   end
-    it
+  describe '#h1' do
+    let(:page) { Retriever::Page.new(@source, t) }
+    it 'returns h1 text' do
+      @source = (<<SOURCE).strip
       <h1>test 3</h1>
 SOURCE
+      expect(page.h1).to eq('test 3')
     end
   end
-    it
+  describe '#h2' do
+    let(:page) { Retriever::Page.new(@source, t) }
+    it 'returns h2 text' do
+      @source = (<<SOURCE).strip
       <h2> test 4 </h2>
 SOURCE
+      expect(page.h2).to eq(' test 4 ')
     end
   end
 end
data/spec/retriever_spec.rb
CHANGED
data/spec/target_spec.rb
CHANGED
@@ -1,44 +1,44 @@
 require 'retriever'
 require 'open-uri'

-t = Retriever::Target.new(
+t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)

-describe
+describe 'Target' do

+  it 'creates target var' do
+    expect(t.target).to eq('http://www.cnet.com/reviews/')
+  end

+  it 'creates host var' do
+    expect(t.host).to eq('www.cnet.com')
+  end

+  it 'creates host_re var' do
+    expect(t.host_re).to eq(/cnet.com/)
+  end

+  it 'creates file_re var (when provided)' do
+    expect(t.file_re).to eq(/\.exe\z/)
+  end

+  it 'adds protocol to Target URL if none given' do
+    expect(Retriever::Target.new('cnet.com').target).to eq('http://cnet.com')
+  end

+  it 'fails if given URL has no dot in it' do
+    expect { Retriever::Target.new('cnetcom') }.to raise_error
+  end

-  describe
+  describe '#source' do

-    it
-      expect(Retriever::Target.new(
+    it 'opens URL and returns source as String' do
+      expect(Retriever::Target.new('http://techcrunch.com/').source.class)
+        .to eq(String)
     end

-    it
-      expect{Retriever::Target.new(
+    it 'fails if target redirects to new host' do
+      expect { Retriever::Target.new('http://tinyurl.com/nkfkypa').source }
+        .to raise_error
     end
   end
-end
+end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 1.1.0
+  version: 1.2.0
 platform: ruby
 authors:
 - Joe Norton
@@ -66,6 +66,20 @@ dependencies:
     - - '>='
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: addressable
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -125,7 +139,7 @@ files:
 - lib/retriever/fetchseo.rb
 - lib/retriever/fetchsitemap.rb
 - lib/retriever/link.rb
-- lib/retriever/openuri-redirect-patch.rb
+- lib/retriever/openuri_redirect_patch.rb
 - lib/retriever/page.rb
 - lib/retriever/target.rb
 - lib/retriever/version.rb