rubyretriever 1.1.0 → 1.2.0
- checksums.yaml +4 -4
- data/bin/rr +51 -48
- data/lib/retriever/cli.rb +11 -7
- data/lib/retriever/fetch.rb +134 -105
- data/lib/retriever/fetchfiles.rb +32 -34
- data/lib/retriever/fetchseo.rb +3 -11
- data/lib/retriever/fetchsitemap.rb +19 -18
- data/lib/retriever/link.rb +17 -15
- data/lib/retriever/{openuri-redirect-patch.rb → openuri_redirect_patch.rb} +2 -1
- data/lib/retriever/page.rb +35 -16
- data/lib/retriever/target.rb +15 -13
- data/lib/retriever/version.rb +3 -2
- data/lib/retriever.rb +1 -1
- data/readme.md +19 -5
- data/spec/link_spec.rb +37 -35
- data/spec/page_spec.rb +48 -44
- data/spec/retriever_spec.rb +2 -3
- data/spec/target_spec.rb +28 -28
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: …
-  data.tar.gz: …
+  metadata.gz: 79f0b251e367f085f7b84dd83a10f6a1dfcddd3c
+  data.tar.gz: 0e9b6bc8f66b9efd14921d8f0fc5fddc45042b5f
 SHA512:
-  metadata.gz: …
-  data.tar.gz: …
+  metadata.gz: fe1a6c8e118378513c4a4e72adeccc94e212fd5e0d4244f56240830155e42f7b7bde80acdfabbc9fe9ee5b46687bc33d089d73c57690beadee3d04804a9435ac
+  data.tar.gz: 4e6bec31d3416293f2fb72b39cfab5602692a2b9d94cbfb6fba9a653afc6bf718e5eebf84b4a5745d01d2e55959df4e489342b0deca2060e2653bb5b31f4731e
data/bin/rr
CHANGED
@@ -1,57 +1,58 @@
 #! /usr/bin/env ruby
-
 require 'retriever'
 require 'optparse'
 
 options = {}
-… (old lines 7-50 not shown in this view)
+optparse = OptionParser.new do |opts|
+  # Set a banner, displayed at the top
+  # of the help screen.
+  opts.banner = 'Usage: rr [MODE FLAG] [options] Target_URL'
+  options['sitemap'] = false
+  opts.on('-s', '--sitemap [FORMAT]', 'MODE FLAG: Sitemap mode') do |type|
+    options['sitemap'] = type || ''
+  end
+  options['fileharvest'] = false
+  opts.on('-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode') do |file_e|
+    options['fileharvest'] = file_e
+  end
+  options['seo'] = false
+  opts.on('-e', '--seo', 'MODE FLAG: SEO mode') do
+    options['seo'] = true
+  end
+  options['filename'] = nil
+  opts.on('-o', '--out FILENAME', 'Dump output to file') do |file|
+    options['filename'] = file
+  end
+  # Define the options, and what they do
+  options['verbose'] = false
+  opts.on('-v', '--verbose', 'Output more information') do
+    options['verbose'] = true
+  end
+  options['progress'] = false
+  opts.on('-p', '--progress', 'Output progress bar') do
+    options['progress'] = true
+  end
+  options['maxpages'] = false
+  opts.on('-l',
+          '--limit PAGE_LIMIT_#',
+          'set a max on the total number of crawled pages') do |maxp|
+    options['maxpages'] = maxp
+  end
+  options['autodown'] = false
+  opts.on('-a', '--auto', 'Automatically download all files located') do
+    options['autodown'] = true
+  end
+  # This displays the help screen, all programs are
+  # assumed to have this option.
+  opts.on('-h', '--help', 'Display this screen') do
+    puts opts
+    exit
+  end
+end
 
 optparse.parse!
 if ARGV[0].nil?
-  abort(…
+  abort("###Missing Required Argument\nUsage: rr [mode] [options] Target_URL")
 end
 
 ARGV.each do|q|
@@ -61,9 +62,11 @@ ARGV.each do|q|
   puts '### Creating Sitemap' if options['sitemap']
   puts "### Outputting in format: #{options['sitemap']}" if options['sitemap']
   puts '### Performing File Harvest' if options['fileharvest']
-…
+  if options['fileharvest']
+    puts "### Searching for filetype: #{options['fileharvest']}"
+  end
   puts '### Performing SEO Scrape' if options['seo']
-  puts "### Writing…
+  puts "### Writing to file: #{options['filename']}" if options['filename']
   puts '### Being verbose'
   puts "### Stopping after #{options['maxpages']} pages"
 end
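Taken together, the flags above define three mode flags (-s, -f, -e), of which Retriever::CLI below selects one (file harvest first, then sitemap, then SEO), plus output controls. As a rough sketch of what invocations look like under these OptionParser definitions (the target domain and output filename are illustrative, not from this diff):

    rr --sitemap xml --progress --limit 100 example.com   # sitemap mode, XML output
    rr --files pdf --auto example.com                     # file harvest, auto-download all PDFs found
    rr --seo --out rr-example example.com                 # SEO scrape, dumped to rr-example.csv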
data/lib/retriever/cli.rb
CHANGED
@@ -3,19 +3,23 @@ module Retriever
   class CLI
     def initialize(url, options)
       # kick off the fetch mode of choice
+      @fetch = choose_fetch_mode(url, options)
+      @fetch.dump
+      @fetch.write if options['filename']
+      @fetch.autodownload if options['autodown'] && options['fileharvest']
+      @fetch.gen_xml if /XML/i =~ options['sitemap'].to_s
+    end
+
+    def choose_fetch_mode(url, options)
       if options['fileharvest']
-…
+        Retriever::FetchFiles.new(url, options)
       elsif options['sitemap']
-…
+        Retriever::FetchSitemap.new(url, options)
       elsif options['seo']
-…
+        Retriever::FetchSEO.new(url, options)
       else
         fail '### Error: No Mode Selected'
       end
-      @fetch.dump
-      @fetch.write if options['filename']
-      @fetch.autodownload if options['autodown'] && options['fileharvest']
-      @fetch.gen_xml if /XML/i =~ options['sitemap'].to_s
     end
   end
 end
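With the refactor, initialize reads as a pipeline: choose_fetch_mode picks the fetcher, then dump, write, autodownload, and gen_xml run conditionally. A minimal programmatic sketch, assuming the option keys established in data/bin/rr (the URL is illustrative):

    require 'retriever'

    # sitemap mode; gen_xml fires because /XML/i matches 'xml'
    Retriever::CLI.new('http://example.com', 'sitemap' => 'xml', 'maxpages' => '50')

Passing an options hash with no mode key raises via fail '### Error: No Mode Selected'.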
data/lib/retriever/fetch.rb
CHANGED
@@ -9,56 +9,27 @@ require 'bloomfilter-rb'
 module Retriever
   #
   class Fetch
+    HR = '###############################'
     attr_reader :max_pages, :t
     # given target URL and RR options, creates a fetch object.
     # There is no direct output
     # this is a parent class that the other fetch classes build off of.
     def initialize(url, options)
+      @data = []
       @connection_tally = {
-        :…
-        :…
-        :…
-        :…
+        success: 0,
+        error: 0,
+        error_client: 0,
+        error_server: 0
       }
-
-
-      @max_pages = options['maxpages'] ? options['maxpages'].to_i : 100
-      @v = options['verbose']
-      @output = options['filename']
-      @fh = options['fileharvest']
-      @file_ext = @fh.to_s
-      @s = options['sitemap']
-      @seo = options['seo']
-      @autodown = options['autodown']
-      #
-      if @fh
-        temp_ext_str = '.' + @file_ext + '\z'
-        @file_re = Regexp.new(temp_ext_str).freeze
-      else
-        # when FH is not true, and autodown is true
-        errlog('Cannot AUTODOWNLOAD when not in FILEHARVEST MODE') if @autodown
-      end
-      if @prgrss
-        # verbose & progressbar conflict
-        errlog('CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p') if @v
-        prgress_vars = {
-          :title => 'Pages',
-          :starting_at => 1,
-          :total => @max_pages,
-          :format => '%a |%b>%i| %c/%C %t'
-        }
-        @progressbar = ProgressBar.create(prgress_vars)
-      end
+      setup_options(options)
+      setup_progress_bar if @progress
       @t = Retriever::Target.new(url, @file_re)
-      @output = "rr-#{@t.host.split('.')[1]}" if @…
-      @already_crawled = …
-… (old lines 55-57 not shown in this view)
-        :bucket => 8,
-        :raise => false
-      )
-      @already_crawled.insert(@t.target)
+      @output = "rr-#{@t.host.split('.')[1]}" if @fileharvest && !@output
+      @already_crawled = setup_bloom_filter
+      @page_one = crawl_page_one
+      @link_stack = create_link_stack
+      @temp_link_stack = []
     end
 
     def errlog(msg)
@@ -66,35 +37,26 @@ module Retriever
     end
 
     def lg(msg)
-      puts "### #{msg}" if @…
+      puts "### #{msg}" if @verbose
     end
 
     # prints current data collection to STDOUT
     def dump
-      puts …
-      if @…
-… (old lines 76-77 not shown in this view)
-      puts '…
-… (old lines 79-80 not shown in this view)
-        puts "#{@t.target} Sitemap"
-        puts "Page Count: #{@data.size}"
-      elsif @fh
-        puts "Target URL: #{@t.target}"
-        puts "Filetype: #{@file_ext}"
-        puts "File Count: #{@data.size}"
+      puts HR
+      puts "Connection Tally:\n#{@connection_tally}\n#{HR}" if @verbose
+      puts "Target URL: #{@t.target}"
+      if @sitemap
+        puts 'Sitemap'
+      elsif @fileharvest
+        puts "File harvest by type: #{@fileharvest}"
       elsif @seo
-        puts …
-        puts "Page Count: #{@data.size}"
-      else
-        fail 'ERROR - Cannot dump - Mode Not Found'
+        puts 'SEO Metrics'
       end
-      puts …
+      puts "Data Dump -- Object Count: #{@data.size}"
+      puts HR
       @data.each do |line|
         puts line
       end
-      puts '###############################'
       puts
     end
 
@@ -111,34 +73,90 @@ module Retriever
         csv << entry
       end
     end
-    puts …
+    puts HR
     puts "File Created: #{@output}.csv"
     puts "Object Count: #{@data.size}"
-    puts …
+    puts HR
     puts
   end
 
+    private
+
+    def setup_options(options)
+      @progress = options['progress']
+      @max_pages = options['maxpages'] ? options['maxpages'].to_i : 100
+      @verbose = options['verbose']
+      @output = options['filename']
+      @fileharvest = options['fileharvest']
+      @sitemap = options['sitemap']
+      @seo = options['seo']
+      @autodown = options['autodown']
+      @file_re = Regexp.new(".#{@fileharvest}\z").freeze if @fileharvest
+    end
+
+    def setup_bloom_filter
+      already_crawled = BloomFilter::Native.new(
+        size: 1_000_000,
+        hashes: 5,
+        seed: 1,
+        bucket: 8,
+        raise: false
+      )
+      already_crawled.insert(@t.target)
+      already_crawled
+    end
+
+    def setup_progress_bar
+      # verbose & progressbar conflict
+      errlog('CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME') if @verbose
+      prgress_vars = {
+        title: 'Pages',
+        starting_at: 1,
+        total: @max_pages,
+        format: '%a |%b>%i| %c/%C %t'
+      }
+      @progressbar = ProgressBar.create(prgress_vars)
+    end
+
+    def crawl_page_one
+      page_one = Retriever::Page.new(@t.source, @t)
+      lg("URL Crawled: #{@t.target}")
+      page_one
+    end
+
+    def create_link_stack
+      link_stack = @page_one.parse_internal_visitable
+      errlog("Bad URL -- #{@t.target}") unless link_stack
+      lg("#{link_stack.size - 1} links found")
+      link_stack.delete(@t.target)
+      link_stack.take(@max_pages) if (link_stack.size + 1) > @max_pages
+      link_stack
+    end
+
+    def end_crawl_notice
+      notice = "#{HR}\nENDING CRAWL\nCan't find any more links."
+      @progressbar.log(notice) if @progress
+      lg(notice)
+    end
+
     # iterates over the existing @link_stack
     # running until we reach the @max_pages value.
     def async_crawl_and_collect
       while @already_crawled.size < @max_pages
         if @link_stack.empty?
-…
-          @progressbar.log("Can't find any more links.")
-          else
-          lg("Can't find any more links.")
-          end
+          end_crawl_notice
           break
         end
         new_links_arr = process_link_stack
+        @temp_link_stack = []
         next if new_links_arr.nil? || new_links_arr.empty?
         # set operations to see are these in our previous visited pages arr
-…
-        @link_stack.concat(new_links_arr)
-…
+        next if new_links_arr.empty?
+        @link_stack.concat(new_links_arr)
+        next unless @sitemap
+        @data.concat(new_links_arr)
       end
-
-      @progressbar.finish if @prgrss
+      @data.uniq!
     end
 
     # returns true is resp is ok to continue
@@ -149,8 +167,8 @@ module Retriever
       loc = hdr.location
       lg("#{url} Redirected to #{loc}")
       if t.host_re =~ loc
-        @…
-        lg('--Added to …
+        @temp_link_stack.push(loc) unless @already_crawled.include?(loc)
+        lg('--Added to stack for later')
         return false
       end
       lg("Redirection outside of target host. No - go. #{loc}")
@@ -159,7 +177,6 @@ module Retriever
       # lets not continue if unsuccessful connection
       unless hdr.successful?
         lg("UNSUCCESSFUL CONNECTION -- #{url}")
-
         @connection_tally[:error] += 1
         @connection_tally[:error_server] += 1 if hdr.server_error?
         @connection_tally[:error_client] += 1 if hdr.client_error?
@@ -168,7 +185,6 @@ module Retriever
       # let's not continue if not text/html
       unless hdr['CONTENT_TYPE'].include?('text/html')
         @already_crawled.insert(url)
-        @link_stack.delete(url)
         lg("Page Not text/html -- #{url}")
         return false
       end
@@ -176,45 +192,58 @@ module Retriever
       true
     end
 
+    def push_seo_to_data(url, new_page)
+      seos = [url]
+      seos.concat(new_page.parse_seo)
+      @data.push(seos)
+      lg('--page SEO scraped')
+    end
+
+    def push_files_to_data(new_page)
+      filez = new_page.parse_files(new_page.parse_internal)
+      @data.concat(filez) unless filez.empty?
+      lg("--#{filez.size} files found")
+    end
+
+    def page_from_response(url, response)
+      lg("Page Fetched: #{url}")
+      @already_crawled.insert(url)
+      if @progress && (@already_crawled.size < @max_pages)
+        @progressbar.increment
+      end
+      Retriever::Page.new(response, @t)
+    end
+
+    def new_visitable_links(current_page)
+      lg("--#{current_page.links.size} links found")
+      current_page.parse_internal_visitable
+    end
+
     # send a new wave of GET requests, using current @link_stack
+    # at end of the loop it empties link_stack
+    # puts new links into temporary stack
     def process_link_stack
-      new_stuff = []
       EM.synchrony do
         concurrency = 10
         EM::Synchrony::FiberIterator.new(@link_stack, concurrency).each do |url|
           next if @already_crawled.size >= @max_pages
           next if @already_crawled.include?(url)
-
           resp = EventMachine::HttpRequest.new(url).get
-
           next unless good_response?(resp, url)
-… (old lines 192-198 not shown in this view)
-            seos = [url]
-            seos.concat(new_page.parse_seo)
-            @data.push(seos)
-            lg('--page SEO scraped')
-          end
-          next if new_page.links.size == 0
-          lg("--#{new_page.links.size} links found")
-          internal_links_arr = new_page.parse_internal_visitable
-          new_stuff.push(internal_links_arr)
-          if @fh
-            filez = new_page.parse_files
-            @data.concat(filez) unless filez.empty?
-            lg("--#{filez.size} files found")
-          end
+          current_page = page_from_response(url, resp.response)
+          # non-link dependent modes
+          push_seo_to_data(url, current_page) if @seo
+          next unless current_page.links.size > 0
+          @temp_link_stack.push(new_visitable_links(current_page))
+          # link dependent modes
+          next unless @fileharvest
+          push_files_to_data(current_page)
         end
-        new_stuff = new_stuff.flatten # all completed requests
         EventMachine.stop
       end
-
+      # empty the stack. most clean way
+      @link_stack = []
+      @temp_link_stack.flatten.uniq!
     end
   end
 end
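The new setup_bloom_filter helper makes the dedupe strategy explicit: bloomfilter-rb's BloomFilter::Native is a probabilistic set held in constant memory, so include? may very rarely report a false positive (a page gets skipped), but a URL is never crawled twice. A standalone sketch using the same constructor arguments as above:

    require 'bloomfilter-rb'

    crawled = BloomFilter::Native.new(size: 1_000_000, hashes: 5,
                                      seed: 1, bucket: 8, raise: false)
    crawled.insert('http://example.com/')
    crawled.include?('http://example.com/')      # => true
    crawled.include?('http://example.com/other') # => false (with high probability)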
data/lib/retriever/fetchfiles.rb
CHANGED
@@ -5,29 +5,21 @@ module Retriever
   class FetchFiles < Fetch
     def initialize(url, options)
       super
-
-      page_one = Retriever::Page.new(@t.source, @t)
-      @link_stack = page_one.parse_internal_visitable
-      lg("URL Crawled: #{@t.target}")
-      lg("#{@link_stack.size - 1} new links found")
-
-      temp_file_collection = page_one.parse_files
+      temp_file_collection = @page_one.parse_files(@page_one.parse_internal)
       @data.concat(tempFileCollection) if temp_file_collection.size > 0
       lg("#{@data.size} new files found")
-      errlog("Bad URL -- #{@t.target}") unless @link_stack
-      @link_stack.delete(@t.target)
 
       async_crawl_and_collect
-
+      # done, make sure progress bar says we are done
+      @progressbar.finish if @progress
       @data.sort_by! { |x| x.length }
-      @data.uniq!
     end
 
     def download_file(path)
       # given valid url, downloads file to current directory in /rr-downloads/
       arr = path.split('/')
       shortname = arr.pop
-      puts "Initiating Download…
+      puts "Initiating Download of: #{shortname}"
       File.open(shortname, 'wb') do |saved_file|
         open(path) do |read_file|
           saved_file.write(read_file.read)
@@ -38,33 +30,39 @@ module Retriever
 
     def autodownload
       # go through the fetched file URL collection and download each one.
-
-      puts '###################'
+      puts HR
       puts '### Initiating Autodownload...'
-      puts …
-      puts "#{…
-      puts …
-… (old lines 48-55 not shown in this view)
-      @data.…
+      puts HR
+      puts "#{@data.count} - #{@file_ext}'s Located"
+      puts HR
+      move_to_download_dir
+      iterate_thru_collection_and_download
+      Dir.chdir('..')
+    end
+
+    private
+
+    def iterate_thru_collection_and_download
+      lenn = @data.count
+      @data.each_with_index do |entry, i|
         begin
           download_file(entry)
-
-
-          puts …
-        rescue StandardError => e
-          puts 'ERROR: failed to download - #{entry}'
-          puts e.message
-          puts …
+        rescue StandardError
+          puts "ERROR: failed to download - #{entry}"
         end
+        lg("  File [#{i + 1} of #{lenn}]\n")
       end
-
+    end
+
+    def move_to_download_dir(dir_name = 'rr-downloads')
+      if File.directory?(dir_name)
+        Dir.chdir(dir_name)
+      else
+        puts "creating #{dir_name} Directory"
+        Dir.mkdir(dir_name)
+        Dir.chdir(dir_name)
+      end
+      puts "Downloading files to local directory: '/#{dir_name}/'"
     end
   end
 end
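autodownload is now three steps: move_to_download_dir switches into ./rr-downloads (creating it if missing), iterate_thru_collection_and_download walks @data with a per-file rescue, and Dir.chdir('..') restores the working directory. The public download_file can also be called directly; a hypothetical one-off (URL and filename illustrative, and note the constructor itself runs a full crawl):

    fetcher = Retriever::FetchFiles.new('http://example.com', 'fileharvest' => 'pdf')
    fetcher.download_file('http://example.com/docs/whitepaper.pdf')  # writes ./whitepaper.pdf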
data/lib/retriever/fetchseo.rb
CHANGED
@@ -6,19 +6,11 @@ module Retriever
   # on all unique pages found on the site
   def initialize(url, options)
     super
-    @data…
-    page_one = Retriever::Page.new(@t.source, @t)
-    lg("URL Crawled: #{@t.target}")
-
-    @link_stack = page_one.parse_internal_visitable
-    errlog("Bad URL -- #{@t.target}") unless @link_stack
-    lg("#{@link_stack.size - 1} links found")
-    @link_stack.delete(@t.target)
-
-    @data.push(page_one.parse_seo)
+    @data.push(@page_one.parse_seo)
 
     async_crawl_and_collect
-
+    # done, make sure progress bar says we are done
+    @progressbar.finish if @progress
     @data.sort_by! { |x| x[0].length }
   end
 end
data/lib/retriever/fetchsitemap.rb
CHANGED
@@ -5,37 +5,38 @@ module Retriever
   # returns an array of all unique pages found on the site
   def initialize(url, options)
     super
-    @data…
-    page_one = Retriever::Page.new(@t.source, @t)
-    lg("URL Crawled: #{@t.target}")
-    @link_stack = page_one.parse_internal_visitable
-    errlog("Bad URL -- #{@t.target}") unless @link_stack
-    lg("#{@link_stack.size - 1} links found")
-
-    @link_stack.delete(@t.target)
+    @data.push(@t.target)
     @data.concat(@link_stack)
 
     async_crawl_and_collect
-
+    # done, make sure progress bar says we are done
+    @progressbar.finish if @progress
     @data.sort_by! { |x| x.length } if @data.size > 1
     @data.uniq!
   end
 
+  private
+
   # produces valid XML sitemap based on page collection fetched.
   # Writes to current directory.
   def gen_xml
-…
-    f…
-… (old lines 29-31 not shown in this view)
+    filename = @t.host.split('.')[1]
+    f = File.open("sitemap-#{filename}.xml", 'w+')
+    f << "<?xml version='1.0' encoding='UTF-8'?>"
+    f << "<urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
+    @data.each do |url|
+      f << "<url><loc>#{url}</loc></url>"
+    end
     f << '</urlset>'
     f.close
-
-…
+    print_file_info(filename)
+  end
+
+  def print_file_info(filename)
+    puts HR
+    puts "File Created: sitemap-#{filename}.xml"
     puts "Object Count: #{@data.size}"
-    puts …
-    puts …
+    puts HR + "\n"
   end
 end
data/lib/retriever/link.rb
CHANGED
@@ -1,33 +1,35 @@
+require 'addressable/uri'
 module Retriever
   #
   class Link
-    HTTP_RE = Regexp.new(/^http/i).freeze
-
-    DOUBLE_SLASH_RE = Regexp.new(…
-… (old lines 7-10 not shown in this view)
-    @…
-    @…
+    # HTTP_RE = Regexp.new(/^http/i).freeze
+    SLASH_RE = Regexp.new(%r(^/{1}[^/])).freeze
+    DOUBLE_SLASH_RE = Regexp.new(%r(^/{2}[^/])).freeze
+    WWW_DOT_RE = Regexp.new(/^www\./i).freeze
+
+    def initialize(target_scheme, target_host, this_link)
+      @link_uri = Addressable::URI.parse(this_link)
+      @scheme = target_scheme
+      @host = target_host
+      @this_link = @link_uri.to_s
     end
 
     def path
-      return …
+      return this_link if link_uri.absolute?
 
-      return "…
+      return "#{@scheme}://#{this_link}" if WWW_DOT_RE =~ this_link
 
-      return "…
+      return "#{@scheme}://#{host}#{this_link}" if SLASH_RE =~ this_link
 
       # link begins with '//'
-      return "…
+      return "#{@scheme}:#{this_link}" if DOUBLE_SLASH_RE =~ this_link
 
       # link uses relative path with no slashes at all
-      return "…
+      return "#{@scheme}://#{host}/#{this_link}" if link_uri.relative?
     end
 
     private
 
-    attr_reader :host, :…
+    attr_reader :this_link, :host, :link_uri
   end
 end
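Link#path now resolves each href against the target's scheme and host with Addressable instead of string surgery. Under the class as shown, the branches should resolve like this (scheme, host, and hrefs are illustrative):

    Retriever::Link.new('http', 'example.com', 'http://other.com/a').path
    # => 'http://other.com/a'          absolute URI, returned as-is
    Retriever::Link.new('http', 'example.com', 'www.example.com/a').path
    # => 'http://www.example.com/a'    WWW_DOT_RE: scheme prepended
    Retriever::Link.new('http', 'example.com', '/a').path
    # => 'http://example.com/a'        SLASH_RE: scheme + host prepended
    Retriever::Link.new('http', 'example.com', '//cdn.example.com/a').path
    # => 'http://cdn.example.com/a'    DOUBLE_SLASH_RE: protocol-relative
    Retriever::Link.new('http', 'example.com', 'a.html').path
    # => 'http://example.com/a.html'   bare relative path

Passing the scheme through, rather than hardcoding http://, is what lets crawls of https sites produce https links.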
data/lib/retriever/{openuri-redirect-patch.rb → openuri_redirect_patch.rb}
CHANGED
@@ -1,6 +1,7 @@
+#
 module OpenURI
   # nesc patch otherwise OPENURI blocks redirects to and from https
-  def …
+  def self.redirectable?(uri1, uri2)
     uri1.scheme.downcase == uri2.scheme.downcase ||
       (/\A(?:http|https)\z/i =~ uri1.scheme && /\A(?:http|https)\z/i =~ uri2.scheme)
   end
data/lib/retriever/page.rb
CHANGED
@@ -1,21 +1,40 @@
+require 'addressable/uri'
+
 module Retriever
   #
   class Page
-… (old lines 4-12 not shown in this view)
+    HTTP_RE = Regexp.new(/^http/i).freeze
+    H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
+    H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze
+    TITLE_RE = Regexp.new(/<title>(.*)<\/title>/i).freeze
+    DESC_RE = Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\']
+                          [^>]*content=[\"]
+                          (
+                          [^\"]*
+                          )
+                          [\"]
+                          [^>]
+                          *>
+                          /ix).freeze
+    HREF_CONTENTS_RE = Regexp.new(/\shref=
+                          ['|"]
+                          (
+                          [^\s]
+                          [a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+
+                          )
+                          ['|"]
+                          [\s|\W]
+                          /ix).freeze
+    NONPAGE_EXT_RE = Regexp.new(/\.
+                          (?:css|js|png|gif|jpg|mp4|
+                          wmv|flv|mp3|wav|doc|txt|ico|xml)
+                          /ix).freeze
 
     attr_reader :links, :source, :t
 
     def initialize(source, t)
       @t = t
-      @source = source.encode('UTF-8', :…
+      @source = source.encode('UTF-8', invalid: :replace, undef: :replace)
       @links = nil
     end
 
@@ -28,20 +47,20 @@ module Retriever
       # filter some malformed URLS that come in
       # meant to be a loose filter to catch all reasonable HREF attributes.
       link = match[0]
-      Link.new(@t.host, link).path
-    end.uniq
+      Link.new(@t.scheme, @t.host, link).path
+    end.compact.uniq
   end
 
   def parse_internal
-    links.select { |…
+    links.select { |x| @t.host == Addressable::URI.parse(x).host }
   end
 
   def parse_internal_visitable
-    parse_internal.select { |…
+    parse_internal.select { |x| !(NONPAGE_EXT_RE =~ x) }
   end
 
-  def parse_files
-…
+  def parse_files(arr)
+    arr.select { |x| @t.file_re =~ x }
   end
 
   def title
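With the regexes inlined as constants, Page goes from raw HTML to crawlable URLs in three steps: links scans HREF_CONTENTS_RE and absolutizes each match via Link#path, parse_internal keeps URLs whose Addressable-parsed host matches the target, and parse_internal_visitable drops NONPAGE_EXT_RE extensions (css, js, images, and so on; .pdf is not on that list). A small sketch assuming the methods above (HTML and host illustrative):

    t = Retriever::Target.new('http://www.example.com/', /\.pdf\z/)
    html = "<a href='/docs/guide.pdf'>guide</a> <a href='/style.css'>theme</a>"
    page = Retriever::Page.new(html, t)
    page.links                             # both hrefs, absolutized against www.example.com
    page.parse_internal_visitable          # => ['http://www.example.com/docs/guide.pdf'] (.css dropped)
    page.parse_files(page.parse_internal)  # => ['http://www.example.com/docs/guide.pdf'] (matches file_re)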
data/lib/retriever/target.rb
CHANGED
@@ -1,21 +1,22 @@
 require 'open-uri'
+require 'addressable/uri'
 
 module Retriever
   #
   class Target
-    HTTP_RE…
-    DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+    HTTP_RE = Regexp.new(/^http/i).freeze
 
-    attr_reader :host, :target, :host_re, :source, :file_re
+    attr_reader :host, :target, :host_re, :source, :file_re, :scheme
 
     def initialize(url, file_re = nil)
-… (old lines 12-15 not shown in this view)
-      @target…
-      @host…
-      @host_re…
-      @file_re…
+      fail 'Bad URL' unless url.include?('.')
+      url = "http://#{url}" unless HTTP_RE =~ url
+      target_uri = Addressable::URI.parse(url)
+      @target = target_uri.to_s
+      @host = target_uri.host
+      @host_re = Regexp.new(@host.sub('www.', ''))
+      @file_re ||= file_re
+      @scheme = target_uri.scheme
     end
 
     def source
@@ -31,13 +32,14 @@ module Retriever
     fail 'Domain is not working. Try the non-WWW version.' if resp == ''
     fail 'Domain not working. Try HTTPS???' unless resp
     # consider using scrub from ruby 2.1? this misses some things
-    resp.encode('UTF-8', 'binary', :…
+    resp.encode('UTF-8', 'binary', invalid: :replace, undef: :replace)
   end
 
   def resync_target_and_return_source(url)
-    new_t…
+    new_t = Retriever::Target.new(url)
     @target = new_t.target
-    @host…
+    @host = new_t.host
+    @scheme = new_t.scheme
     new_t.source
   end
 end
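Target now parses with Addressable::URI and exposes scheme, which Page and Link consume for URL resolution. The behavior implied by the constructor (and pinned down by the specs later in this diff):

    t = Retriever::Target.new('cnet.com')  # no protocol supplied
    t.target   # => 'http://cnet.com'      HTTP_RE misses, so 'http://' is prepended
    t.host     # => 'cnet.com'
    t.scheme   # => 'http'
    t.host_re  # => /cnet.com/             'www.' stripped before building the regex

    Retriever::Target.new('cnetcom')       # raises 'Bad URL': no dot in the string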
data/lib/retriever/version.rb
CHANGED
data/lib/retriever.rb
CHANGED
data/readme.md
CHANGED
@@ -4,15 +4,29 @@
 
 By Joe Norton
 
-RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader.
+RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader.
 
-RubyRetriever (RR) uses asynchronous HTTP requests, thanks to [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony), to crawl webpages *very quickly*. Another neat thing about RR, is it uses a ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of page's it has already crawled.
-
-**Use at Own Risk**
-RR does NOT respect robots.txt, and RR currently - by default - launches up to 10 parallel GET requests at once. This is a feature, do not abuse it.
+RubyRetriever (RR) uses asynchronous HTTP requests, thanks to [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony), to crawl webpages *very quickly*. Another neat thing about RR, is it uses a ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of page's it has already crawled.
 
 **v1.0 Update (6/07/2014)** - Includes major code changes, a lot of bug fixes. Much better in dealing with redirects, and issues with the host changing, etc. Also, added the SEO mode -- which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility -- and thus, this was update 1.0!
+mission
+-------
+RubyRetriever aims to be the best command-line crawling, and scraping package written in Ruby.
+
+features
+--------
+* Asynchronous HTTP Requests thru EM & Synchrony
+* Bloom filter for tracking pages visited.
+* 3 CLI modes: 1) Sitemap, 2) File Harvest, 3) SEO
+
+use-cases
+---------
+RubyRetriever can do multiple things for you, with a single command at the terminal RR can:
+1. Crawl your website and output a *valid XML sitemap* based on what it found.
+2. Crawl a target website and *download all files of a given filetype*.
+3. Crawl a target website and *collect important SEO information* such as page titles, meta descriptions, h1 tags, etc. and write it to CSV.
 
+Help & Forks Welcome!
 
 getting started
 -----------
data/spec/link_spec.rb
CHANGED
@@ -1,66 +1,68 @@
 require 'retriever'
 
-describe …
+describe 'Link' do
 
-…
-…
+  t = Retriever::Target.new('http://www.cnet.com/reviews/')
+  let(:links) { Retriever::Page.new(@source, t).links }
 
-…
-…
-  <a href='http://www.cnet.com/download.exe'>download</a>
+  it 'collects links in anchor tags' do
+    @source = (<<SOURCE).strip
+    <a href='http://www.cnet.com/download.exe'>download</a>
 SOURCE
 
-…
-…
+    expect(links).to include('http://www.cnet.com/download.exe')
+  end
 
-…
-…
-…
+  it 'collects links in link tags' do
+    @source = (<<SOURCE).strip
+    <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
 SOURCE
 
-…
-…
+    expect(links[0]).to include('formreset.css?ver=1.7.12')
+  end
 
-…
-…
+  it 'does not collect bare links (ones not in an href)' do
+    @source = (<<SOURCE).strip
 http://www.google.com
 SOURCE
 
-…
-…
+    expect(links).to_not include('http://www.google.com')
+  end
 
-…
-…
+  it 'collects only unique href links on the page' do
+    @source = (<<SOURCE).strip
 <a href='http://www.cnet.com/products/gadgets'>gadgets</a>
 <a href='http://www.cnet.com/products/gadgets'>gadgets2</a>
 SOURCE
 
-…
-…
+    expect(links.size).to eq(1)
+  end
 
-…
-…
+  it 'adds a protocol to urls missing them (www.)' do
+    @source = (<<SOURCE).strip
 <a href='www.cnet.com/download.exe'>download</a>
 SOURCE
 
-…
-…
+    expect(links).to include('http://www.cnet.com/download.exe')
+  end
 
-…
-…
+  it "doesn't care about any extra attributes on the anchor tag" do
+    @source = (<<SOURCE).strip
 <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
-<a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'…
+<a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
+</a>
 SOURCE
 
-…
-…
+    expect(links.size).to eq(1)
+  end
 
-…
-…
+  it 'returns relative urls with full path based on hostname' do
+    @source = (<<SOURCE).strip
 <a href='/test.html'>test</a>
 <a href='cpage_18'>about</a>
 SOURCE
 
-…
-…
-end
+    expect(links).to include('http://www.cnet.com/test.html',
+                             'http://www.cnet.com/cpage_18')
+  end
+end
data/spec/page_spec.rb
CHANGED
@@ -1,93 +1,97 @@
 require 'retriever/page'
 require 'retriever/fetch'
 
-t = Retriever::Target.new(…
+t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)
 
-describe …
+describe 'Page' do
 
-  describe …
-    let…
-    it …
-…
+  describe '#links' do
+    let(:links) { Retriever::Page.new(@source, t).links }
+    it 'collects all unique href links on the page' do
+      @source = (<<SOURCE).strip
 <a href='www.cnet.com/download.exe'>download</a>
 <a href='/test.html'>test</a>
-<a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'…
+<a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
+</a>
 <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
-<a href='http://www.yahoo.com/test/'>yahoo</a>
+<a href='http://www.yahoo.com/test/'>yahoo</a>
 SOURCE
 
       expect(links.size).to eq(4)
     end
   end
 
-  describe …
-    let…
-…
-…
+  describe '#parse_internal' do
+    let(:page) { Retriever::Page.new(@source, t) }
+    let(:links) { page.parse_internal }
+    it 'filters links by host' do
+      @source = (<<SOURCE).strip
 <a href='http://www.cnet.com/'>download</a>
-…
+<a href='http://www.yahoo.com/test/'>yahoo</a>
 SOURCE
 
-…
+      expect(links.size).to eq(1)
     end
   end
 
-  describe …
-    let…
+  describe '#parse_internal_visitable' do
+    let(:page) { Retriever::Page.new(@source, t) }
+    let(:links) { page.parse_internal_visitable }
     it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
-…
+      @source = (<<SOURCE).strip
 <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
 SOURCE
-…
+      expect(links.size).to eq(0)
     end
   end
 
-  describe …
-    let…
-…
-…
+  describe '#parse_files' do
+    let(:page) { Retriever::Page.new(@source, t) }
+    let(:files) { page.parse_files(page.parse_internal) }
+    it 'filters links by filetype' do
+      @source = (<<SOURCE).strip
 <a href='www.cnet.com/download.exe'>download</a>
-http://www.google.com
+http://www.google.com
 <a href='/test.html'>test</a>
 SOURCE
-…
+      expect(files.size).to eq(1)
     end
   end
 
-…
-…
-  it…
-…
+  describe '#title' do
+    let(:page) { Retriever::Page.new(@source, t) }
+    it 'returns page title' do
+      @source = (<<SOURCE).strip
 <title>test</title>
 SOURCE
-…
+      expect(page.title).to eq('test')
     end
   end
-…
-…
-  it…
-…
+  describe '#desc' do
+    let(:page) { Retriever::Page.new(@source, t) }
+    it 'returns meta description' do
+      @source = (<<SOURCE).strip
 <meta name='description' content="test2 ">
 SOURCE
-…
+      expect(page.desc).to eq('test2 ')
     end
   end
-…
-…
-  it…
-…
+  describe '#h1' do
+    let(:page) { Retriever::Page.new(@source, t) }
+    it 'returns h1 text' do
+      @source = (<<SOURCE).strip
 <h1>test 3</h1>
 SOURCE
-…
+      expect(page.h1).to eq('test 3')
     end
   end
-…
-…
-  it…
-…
+  describe '#h2' do
+    let(:page) { Retriever::Page.new(@source, t) }
+    it 'returns h2 text' do
+      @source = (<<SOURCE).strip
 <h2> test 4 </h2>
 SOURCE
-…
+      expect(page.h2).to eq(' test 4 ')
     end
   end
 end
data/spec/retriever_spec.rb
CHANGED
data/spec/target_spec.rb
CHANGED
@@ -1,44 +1,44 @@
 require 'retriever'
 require 'open-uri'
 
-t = Retriever::Target.new(…
+t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)
 
-describe …
+describe 'Target' do
 
-…
-…
-…
+  it 'creates target var' do
+    expect(t.target).to eq('http://www.cnet.com/reviews/')
+  end
 
-…
-…
-…
+  it 'creates host var' do
+    expect(t.host).to eq('www.cnet.com')
+  end
 
-…
-…
-…
+  it 'creates host_re var' do
+    expect(t.host_re).to eq(/cnet.com/)
+  end
 
-…
-…
-…
+  it 'creates file_re var (when provided)' do
+    expect(t.file_re).to eq(/\.exe\z/)
+  end
 
-…
-…
-…
+  it 'adds protocol to Target URL if none given' do
+    expect(Retriever::Target.new('cnet.com').target).to eq('http://cnet.com')
+  end
 
-…
-…
-…
+  it 'fails if given URL has no dot in it' do
+    expect { Retriever::Target.new('cnetcom') }.to raise_error
+  end
 
-  describe …
+  describe '#source' do
 
-    it …
-      expect(Retriever::Target.new(…
+    it 'opens URL and returns source as String' do
+      expect(Retriever::Target.new('http://techcrunch.com/').source.class)
+        .to eq(String)
     end
 
-    it …
-      expect{Retriever::Target.new(…
+    it 'fails if target redirects to new host' do
+      expect { Retriever::Target.new('http://tinyurl.com/nkfkypa').source }
+        .to raise_error
     end
-
   end
-
-end
+end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 1.1.0
+  version: 1.2.0
 platform: ruby
 authors:
 - Joe Norton
@@ -66,6 +66,20 @@ dependencies:
   - - '>='
   - !ruby/object:Gem::Version
     version: '0'
+- !ruby/object:Gem::Dependency
+  name: addressable
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -125,7 +139,7 @@ files:
 - lib/retriever/fetchseo.rb
 - lib/retriever/fetchsitemap.rb
 - lib/retriever/link.rb
-- lib/retriever/openuri-redirect-patch.rb
+- lib/retriever/openuri_redirect_patch.rb
 - lib/retriever/page.rb
 - lib/retriever/target.rb
 - lib/retriever/version.rb