rubyretriever 0.1.4 → 1.0.0
- checksums.yaml +4 -4
- data/bin/rr +12 -6
- data/lib/retriever/cli.rb +27 -0
- data/lib/retriever/fetch.rb +84 -45
- data/lib/retriever/fetchfiles.rb +11 -17
- data/lib/retriever/fetchseo.rb +23 -0
- data/lib/retriever/fetchsitemap.rb +10 -14
- data/lib/retriever/openuri-redirect-patch.rb +6 -0
- data/lib/retriever/page.rb +68 -0
- data/lib/retriever/target.rb +20 -9
- data/lib/retriever/version.rb +1 -1
- data/lib/retriever.rb +4 -0
- data/readme.md +18 -6
- data/spec/link_spec.rb +2 -2
- data/spec/page_spec.rb +94 -0
- data/spec/retriever_spec.rb +1 -48
- data/spec/target_spec.rb +7 -2
- metadata +8 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 80eba5c4fdf8c33a19ca35ba37b4493cf44ab57a
+  data.tar.gz: 4dbca842c4f56060e13cfe1c0acf0256321df573
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 48181c41247d85b16db74eb8b7c0a74c23c9740d00d2fd79ecfdb8435efa64d81fb34ccbc81e32911d7fe0e6942c6f7c7c9f91d39feeb434ab078f659ada1341
+  data.tar.gz: 3dace96b1bd42fa2292e8a9db3506983d688e7877b12ca30e550a81be9bad6a7de9907eb2f2b75c6263146a8721d08437dfabe06a493de057dc8a764b57c3a39
data/bin/rr
CHANGED
@@ -1,18 +1,24 @@
 #! /usr/bin/env ruby
+
 require 'retriever'
 require 'optparse'
+
 options = {}
 optparse = OptionParser.new do|opts|
   # Set a banner, displayed at the top
   # of the help screen.
   opts.banner = "Usage: rr [MODE FLAG] [options] Target_URL"
   options[:sitemap] = false
-  opts.on( '-s', '--sitemap FORMAT', 'MODE FLAG: Sitemap mode
-    options[:sitemap] = output_type
+  opts.on( '-s', '--sitemap [FORMAT]', 'MODE FLAG: Sitemap mode' ) do |output_type|
+    options[:sitemap] = output_type||''
   end
   options[:fileharvest] = false
-  opts.on( '-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode
+  opts.on( '-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode' ) do |file_ext|
     options[:fileharvest] = file_ext
+  end
+  options[:seo] = false
+  opts.on( '-e', '--seo', 'MODE FLAG: SEO mode' ) do
+    options[:seo] = true
   end
   options[:filename] = nil
   opts.on( '-o', '--out FILENAME', 'Dump output to selected filename' ) do|filename|
@@ -56,14 +62,14 @@ ARGV.each do|q|
     puts "### Outputting in format: #{options[:sitemap]}" if options[:sitemap]
     puts "### Performing File Harvest" if options[:fileharvest]
     puts "### Searching for file extension: #{options[:fileharvest]} pages" if (options[:fileharvest])
+    puts "### Performing SEO Scrape" if options[:seo]
     puts "### Writting output to filename: #{options[:filename]}" if options[:filename]
     puts "### Being verbose"
-    puts "### Stopping after #{options[:maxpages]} pages"
+    puts "### Stopping after #{options[:maxpages]} pages"
   end
   puts "###############################"
   puts "### [RubyRetriever] go fetch #{q}"
-  Retriever::
-  Retriever::FetchSitemap.new(q, options) if options[:sitemap]
+  Retriever::CLI.new(q, options)
   puts "### [RubyRetriever] is done."
   puts "###############################"
   puts
data/lib/retriever/cli.rb
ADDED
@@ -0,0 +1,27 @@
+module Retriever
+  class CLI
+    def initialize(url,options)
+
+      #kick off the fetch mode of choice
+      if options[:fileharvest]
+        @fetch = Retriever::FetchFiles.new(url, options)
+      elsif options[:sitemap]
+        @fetch = Retriever::FetchSitemap.new(url, options)
+      elsif options[:seo]
+        @fetch = Retriever::FetchSEO.new(url, options)
+      else
+        fail "### Error: No Mode Selected"
+      end
+
+      #all fetch modes
+      @fetch.dump
+      @fetch.write if options[:filename]
+
+      #fileharvest only
+      @fetch.autodownload if options[:autodown] && options[:fileharvest]
+
+      #sitemap only
+      @fetch.gen_xml if /XML/i =~ options[:sitemap].to_s
+    end
+  end
+end
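The new CLI class centralizes the mode dispatch that bin/rr previously did inline. A minimal sketch of driving it directly from Ruby, assuming the gem is installed and using the option keys defined by the bin/rr parser above; the target URL and limits here are only illustrative:

```ruby
require 'retriever'

# Mirrors what bin/rr does after option parsing: one mode key plus common options.
# :sitemap => 'CSV' routes to FetchSitemap, :maxpages caps the crawl, :filename writes a CSV.
Retriever::CLI.new('http://www.cnet.com', :sitemap => 'CSV', :maxpages => 10, :filename => 'cnet')
```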
data/lib/retriever/fetch.rb
CHANGED
@@ -9,12 +9,14 @@ require 'bloomfilter-rb'
 module Retriever
   class Fetch
     attr_reader :maxPages, :t
-    #constants
-    HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
-    NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico)/ix).freeze
 
     def initialize(url,options)
-      @
+      @connection_tally = {
+        :success => 0,
+        :error => 0,
+        :error_client => 0,
+        :error_server => 0
+      }
       #OPTIONS
       @prgrss = options[:progress] ? options[:progress] : false
       @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
@@ -23,6 +25,7 @@ module Retriever
       @fh = options[:fileharvest] ? options[:fileharvest] : false
       @file_ext = @fh.to_s
       @s = options[:sitemap] ? options[:sitemap] : false
+      @seo = options[:seo] ? true : false
       @autodown = options[:autodown] ? true : false
       #
       if @fh
@@ -30,9 +33,6 @@ module Retriever
         @file_re = Regexp.new(tempExtStr).freeze
       else
         errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
-        if !@output
-          @output = "rr-#{@t.host.split('.')[1]}"
-        end
       end
       if @prgrss
         errlog("CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p") if @v #verbose & progressbar conflict
@@ -44,8 +44,13 @@ module Retriever
         }
         @progressbar = ProgressBar.create(prgressVars)
       end
+      @t = Retriever::Target.new(url,@file_re)
       @already_crawled = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
       @already_crawled.insert(@t.target)
+      if (@fh && !@output)
+        @output = "rr-#{@t.host.split('.')[1]}"
+      end
+      fail "bad page source on target -- try HTTPS?" if !@t.source
     end
     def errlog(msg)
       raise "ERROR: #{msg}"
@@ -53,52 +58,52 @@ module Retriever
     def lg(msg)
       puts "### #{msg}" if @v
     end
-    def dump
+    def dump
       puts "###############################"
+      if @v
+        puts "Connection Tally:"
+        puts @connection_tally.to_s
+        puts "###############################"
+      end
       if @s
         puts "#{@t.target} Sitemap"
-        puts "Page Count: #{data.size}"
+        puts "Page Count: #{@data.size}"
       elsif @fh
         puts "Target URL: #{@t.target}"
        puts "Filetype: #{@file_ext}"
-        puts "File Count: #{data.size}"
+        puts "File Count: #{@data.size}"
+      elsif @seo
+        puts "#{@t.target} SEO Metrics"
+        puts "Page Count: #{@data.size}"
       else
-
+        fail "ERROR - Cannot dump - Mode Not Found"
       end
       puts "###############################"
-
+      @data.each do |line|
+        puts line
+      end
       puts "###############################"
       puts
     end
-    def write
+    def write
       if @output
+        i = 0
         CSV.open("#{@output}.csv", "w") do |csv|
-
-
-
+          if ((i == 0) && @seo)
+            csv << ['URL','Page Title','Meta Description','H1','H2']
+            i +=1
+          end
+          @data.each do |entry|
+            csv << entry
+          end
         end
         puts "###############################"
         puts "File Created: #{@output}.csv"
-        puts "Object Count: #{data.size}"
+        puts "Object Count: #{@data.size}"
         puts "###############################"
        puts
       end
     end
-    #recieves page source as string
-    #returns array of unique href links
-    def fetchLinks(doc)
-      return false if !doc
-      doc.scan(HREF_CONTENTS_RE).map do |match| #filter some malformed URLS that come in, this is meant to be a loose filter to catch all reasonable HREF attributes.
-        link = match[0]
-        Link.new(@t.host, link).path
-      end.uniq
-    end
-    def parseInternalLinks(all_links)
-      all_links.select{ |linky| (@t.host_re =~ linky) }
-    end
-    def parseInternalVisitableLinks(all_links)
-      parseInternalLinks(all_links).select{ |linky| (!(NONPAGE_EXT_RE =~linky)) }
-    end
     def async_crawl_and_collect()
       while (@already_crawled.size < @maxPages)
         if @linkStack.empty?
@@ -112,11 +117,41 @@ module Retriever
         new_links_arr = self.asyncGetWave()
         next if (new_links_arr.nil? || new_links_arr.empty?)
         new_link_arr = new_links_arr-@linkStack#set operations to see are these in our previous visited pages arr?
-        @linkStack.concat(new_links_arr)
-        @
+        @linkStack.concat(new_links_arr).uniq!
+        @data.concat(new_links_arr) if @s
       end
       @progressbar.finish if @prgrss
     end
+    def good_response?(resp, url) #returns true is resp is ok to continue process, false is we need to 'next' it
+      return false if !resp
+      if resp.response_header.redirection? #we got redirected
+        loc = resp.response_header.location
+        lg("#{url} Redirected to #{loc}")
+        if t.host_re =~ loc #if being redirected to same host, let's add to linkstack
+          @linkStack.push(loc) if !@already_crawled.include?(loc) #but only if we haven't already crawled it
+          lg("--Added to linkStack for later")
+          return false
+        end
+        lg("Redirection outside of target host. No - go. #{loc}")
+        return false
+      end
+      if (!resp.response_header.successful?) #if webpage is not text/html, let's not continue and lets also make sure we dont re-queue it
+        lg("UNSUCCESSFUL CONNECTION -- #{url}")
+        @connection_tally[:error] += 1
+        @connection_tally[:error_server] += 1 if resp.response_header.server_error?
+        @connection_tally[:error_client] += 1 if resp.response_header.client_error?
+        return false
+      end
+      if (!(resp.response_header['CONTENT_TYPE'].include?("text/html"))) #if webpage is not text/html, let's not continue and lets also make sure we dont re-queue it
+        @already_crawled.insert(url)
+        @linkStack.delete(url)
+        lg("Page Not text/html -- #{url}")
+        return false
+      end
+      @connection_tally[:success] += 1
+      return true
+    end
+
     def asyncGetWave() #send a new wave of GET requests, using current @linkStack
       new_stuff = []
       EM.synchrony do
@@ -129,20 +164,27 @@ module Retriever
           next
         end
         resp = EventMachine::HttpRequest.new(url).get
-
+        next if !good_response?(resp,url)
+        new_page = Retriever::Page.new(resp.response,@t)
+        lg("Page Fetched: #{url}")
         @already_crawled.insert(url)
         if @prgrss
           @progressbar.increment if @already_crawled.size < @maxPages
         end
-
-
-
-
+        if @seo
+          seos = [url]
+          seos.concat(new_page.parseSEO)
+          @data.push(seos)
+          lg("--page SEO scraped")
+        end
+        if new_page.links
+          lg("--#{new_page.links.size} links found")
+          internal_links_arr = new_page.parseInternalVisitable
           new_stuff.push(internal_links_arr)
           if @fh
-            filez =
-            @
-            lg("
+            filez = new_page.parseFiles
+            @data.concat(filez) if !filez.empty?
+            lg("--#{filez.size} files found")
           end
         end
       end
@@ -151,8 +193,5 @@ module Retriever
       end
       new_stuff.uniq!
     end
-    def parseFiles(all_links)
-      all_links.select{ |linky| (@file_re =~ linky)}
-    end
   end
 end
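The reworked write method now drains @data straight into a CSV, adding a header row only in SEO mode. A toy reproduction of that output layout, detached from the crawler; the rows here are invented but shaped like the [url, title, description, h1, h2] arrays that asyncGetWave pushes onto @data:

```ruby
require 'csv'

# Invented rows in the shape Fetch#write receives from SEO mode.
data = [
  ['http://www.example.com/', 'Example', 'An example page', 'Welcome', 'Details']
]

CSV.open('example-seo.csv', 'w') do |csv|
  csv << ['URL', 'Page Title', 'Meta Description', 'H1', 'H2'] # header row, SEO mode only
  data.each { |entry| csv << entry }
end
```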
data/lib/retriever/fetchfiles.rb
CHANGED
@@ -1,17 +1,16 @@
 module Retriever
   class FetchFiles < Fetch
-    attr_reader :fileStack
     def initialize(url,options)
       super
-      @
-
-      @linkStack =
+      @data = []
+      page_one = Retriever::Page.new(@t.source,@t)
+      @linkStack = page_one.parseInternalVisitable
       lg("URL Crawled: #{@t.target}")
-
+      lg("#{@linkStack.size-1} new links found")
 
-      tempFileCollection =
-      @
-
+      tempFileCollection = page_one.parseFiles
+      @data.concat(tempFileCollection) if tempFileCollection.size>0
+      lg("#{@data.size} new files found")
       errlog("Bad URL -- #{@t.target}") if !@linkStack
 
       @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
@@ -19,19 +18,14 @@ module Retriever
 
       self.async_crawl_and_collect()
 
-      @
-      @
-
-      self.dump(self.fileStack)
-      self.write(@output,self.fileStack) if @output
-      self.autodownload() if @autodown
+      @data.sort_by! {|x| x.length}
+      @data.uniq!
     end
     def download_file(path)
       arr = path.split('/')
       shortname = arr.pop
       puts "Initiating Download to: #{'/rr-downloads/' + shortname}"
       File.open(shortname, "wb") do |saved_file|
-        # the following "open" is provided by open-uri
         open(path) do |read_file|
           saved_file.write(read_file.read)
         end
@@ -39,7 +33,7 @@ module Retriever
       puts " SUCCESS: Download Complete"
     end
     def autodownload()
-      lenny = @
+      lenny = @data.count
       puts "###################"
       puts "### Initiating Autodownload..."
       puts "###################"
@@ -53,7 +47,7 @@ module Retriever
        Dir.chdir("rr-downloads")
      end
      file_counter = 0
-     @
+     @data.each do |entry|
        begin
          self.download_file(entry)
          file_counter+=1
data/lib/retriever/fetchseo.rb
ADDED
@@ -0,0 +1,23 @@
+module Retriever
+  class FetchSEO < Fetch
+    def initialize(url,options)
+      super
+      @data = []
+      page_one = Retriever::Page.new(@t.source,@t)
+      @linkStack = page_one.parseInternalVisitable
+      lg("URL Crawled: #{@t.target}")
+      lg("#{@linkStack.size-1} new links found")
+
+      @data.push(page_one.parseSEO)
+      lg("#{@data.size} pages scraped")
+      errlog("Bad URL -- #{@t.target}") if !@linkStack
+
+      @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
+      @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
+
+      self.async_crawl_and_collect()
+
+      @data.sort_by! {|x| x[0].length}
+    end
+  end
+end
data/lib/retriever/fetchsitemap.rb
CHANGED
@@ -1,38 +1,34 @@
 module Retriever
   class FetchSitemap < Fetch
-    attr_reader :sitemap
     def initialize(url,options)
       super
-      @
-
+      @data = [@t.target]
+      page_one = Retriever::Page.new(@t.source,@t)
+      @linkStack = page_one.parseInternalVisitable
       lg("URL Crawled: #{@t.target}")
-
+      lg("#{@linkStack.size-1} new links found")
       errlog("Bad URL -- #{@t.target}") if !@linkStack
 
       @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
       @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
-      @
+      @data.concat(@linkStack)
 
       self.async_crawl_and_collect()
 
-      @
-      @
-
-      self.dump(self.sitemap)
-      self.write(self.sitemap) if /CSV/i =~ @s
-      self.gen_xml(self.sitemap) if /XML/i =~ @s
+      @data.sort_by! {|x| x.length} if @data.size>1
+      @data.uniq!
     end
-    def gen_xml
+    def gen_xml
       f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
       f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
-      data.each do |url|
+      @data.each do |url|
         f << "<url><loc>#{url}</loc></url>"
       end
       f << "</urlset>"
       f.close
       puts "###############################"
       puts "File Created: sitemap-#{@t.host.split('.')[1]}.xml"
-      puts "Object Count: #{@
+      puts "Object Count: #{@data.size}"
       puts "###############################"
       puts
     end
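gen_xml now reads straight from @data and writes a single-line urlset. A toy reproduction of the string it builds, with placeholder URLs standing in for the crawled link stack:

```ruby
# Reproduces the XML that FetchSitemap#gen_xml writes to sitemap-<host>.xml;
# the URLs here are placeholders.
urls = ['http://www.example.com/', 'http://www.example.com/about']
xml  = "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
urls.each { |url| xml << "<url><loc>#{url}</loc></url>" }
xml << "</urlset>"
puts xml
```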
data/lib/retriever/page.rb
ADDED
@@ -0,0 +1,68 @@
+module Retriever
+
+  class Page
+
+    HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
+    NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico|xml)/ix).freeze
+    HTTP_RE = Regexp.new(/^http/i).freeze
+    DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+
+    TITLE_RE = Regexp.new(/<title>(.*)<\/title>/i).freeze
+    DESC_RE = Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\'][^>]*content=[\"]([^\"]*)[\"][^>]*>/i).freeze
+    H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
+    H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze
+
+    attr_reader :links, :source, :t
+
+    def initialize(source,t)
+      @t = t
+      @source = source.encode('UTF-8', :invalid => :replace, :undef => :replace)
+      @links = nil
+    end
+
+    #recieves page source as string
+    #returns array of unique href links
+    def links
+      return @links if @links
+      return false if !@source
+      @links = @source.scan(HREF_CONTENTS_RE).map do |match| #filter some malformed URLS that come in, this is meant to be a loose filter to catch all reasonable HREF attributes.
+        link = match[0]
+        Link.new(@t.host, link).path
+      end.uniq
+    end
+
+    def parseInternal
+      links.select{ |linky| (@t.host_re =~ linky) }
+    end
+
+    def parseInternalVisitable
+      parseInternal.select{ |linky| (!(NONPAGE_EXT_RE =~linky)) }
+    end
+
+    def parseFiles
+      links.select{ |linky| (@t.file_re =~ linky)}
+    end
+
+    def title
+      TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : ""
+    end
+
+    def desc
+      DESC_RE =~ @source ? @source.match(DESC_RE)[1] : ""
+    end
+
+    def h1
+      H1_RE =~ @source ? @source.match(H1_RE)[1] : ""
+    end
+
+    def h2
+      H2_RE =~ @source ? @source.match(H2_RE)[1] : ""
+    end
+
+    def parseSEO
+      return [title,desc,h1,h2]
+    end
+
+  end
+
+end
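The new Page object is what makes SEO mode possible: it wraps a page's source plus its Target and exposes the regex-based extractors. A small sketch, assuming the gem is installed; the HTML snippet is invented, and no network call happens because Target only parses the URL at construction time:

```ruby
require 'retriever'

# Invented markup containing the four fields the SEO extractors look for.
html = "<title>Widgets</title>" \
       "<meta name=\"description\" content=\"All about widgets\">" \
       "<h1>Widgets</h1><h2>Specs</h2>"

t    = Retriever::Target.new('http://www.example.com/')
page = Retriever::Page.new(html, t)

page.title    # => "Widgets"
page.parseSEO # => ["Widgets", "All about widgets", "Widgets", "Specs"]
```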
data/lib/retriever/target.rb
CHANGED
@@ -1,17 +1,22 @@
 require 'open-uri'
 
 module Retriever
+
   class Target
+
     HTTP_RE = Regexp.new(/^http/i).freeze
     DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
-
-
+
+    attr_reader :host, :target, :host_re, :source, :file_re
+
+    def initialize(url,file_re=nil)
       url = "http://#{url}" if (!(HTTP_RE =~ url))
       fail "Bad URL" if (!(/\./ =~ url))
       new_uri = URI(url)
       @target = new_uri.to_s
       @host = new_uri.host
-      @host_re = Regexp.new(@host)
+      @host_re = Regexp.new(@host.sub('www.',''))
+      @file_re ||= file_re
     end
 
     def source
@@ -19,23 +24,29 @@ module Retriever
       begin
         resp = open(@target)
       rescue StandardError => e
-        #puts e.message + " ## " + url
-        #the trap abrt is nescessary to handle the SSL error
-        #for some ungodly reason it's the only way I found to handle it
         trap("ABRT"){
           puts "#{@target} failed SSL Certification Verification"
         }
         return false
       end
-
-
+      resp_url = resp.base_uri.to_s
+      if (@target != resp_url)
+        if @host_re =~ resp_url #if redirect URL is same hose, we want to re-sync our target with the right URL
+          new_t = Retriever::Target.new(resp_url)
+          @target = new_t.target
+          @host = new_t.host
+          return new_t.source
+        end
+        fail "Domain redirecting to new host: #{resp.base_uri.to_s}" #if it's not same host, we want to fail
       end
       resp = resp.read
       if resp == ""
         fail "Domain is not working. Try the non-WWW version."
       end
-
+      fail "Domain not working. Try HTTPS???" if !resp
+      return resp.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace) #consider using scrub from ruby 2.1? this misses some things
     end
 
   end
+
 end
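Two behavioral changes to Target are visible here and in target_spec.rb below: host_re now drops a leading "www." so redirects between the bare and www hosts still count as internal, and an optional file regex rides along on the target for file-harvest mode. A quick illustration, assuming the gem is installed; no request is made until #source is called:

```ruby
require 'retriever'

t = Retriever::Target.new('www.cnet.com/reviews/', /\.exe\z/)

t.target   # => "http://www.cnet.com/reviews/" (protocol added when missing)
t.host_re  # => /cnet.com/ (leading "www." stripped)
t.file_re  # => /\.exe\z/
```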
data/lib/retriever/version.rb
CHANGED
data/lib/retriever.rb
CHANGED
@@ -1,8 +1,12 @@
 require 'retriever/fetch'
 require 'retriever/fetchfiles'
 require 'retriever/fetchsitemap'
+require 'retriever/fetchseo'
+require 'retriever/cli'
 require 'retriever/link'
 require 'retriever/target'
+require 'retriever/page'
+require 'retriever/openuri-redirect-patch'
 
 module Retriever
 
data/readme.md
CHANGED
@@ -17,7 +17,7 @@ Install the gem
 ```sh
 gem install rubyretriever
 ```
-
+
 **Example: Sitemap mode**
 ```sh
 rr --sitemap CSV --progress --limit 100 http://www.cnet.com
@@ -31,14 +31,25 @@ This would go to http://www.cnet.com and map it until it crawled a max of 100 pa
 
 **Example: File Harvesting mode**
 ```sh
-rr --files pdf --progress --limit 1000 --
+rr --files pdf --progress --limit 1000 --out hubspot http://www.hubspot.com
+```
+OR -- SAME COMMAND
+```sh
+rr -f pdf -p -l 100 http://www.hubspot.com
+```
+
+This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 100 pages, and then it would write out a list of filepaths to a csv named hubspot (based on the website host name. Optionally we could have the script then go and autodownload all the files by adding the -a/--auto flag -- however this current example would just dump to stdout a list of all the PDF's found.
+
+**Example: SEO mode**
+```sh
+rr --seo --progress --limit 100 --out cnet-seo http://www.cnet.com
 ```
 OR -- SAME COMMAND
 ```sh
-rr -
+rr -e -p -l 10 -o cnet-seo http://www.cnet.com
 ```
 
-This would go to http://www.
+This would go to http://www.cnet.com and crawl a max of 100 pages, during which it would be collecting the onpage SEO fields on those pages - currently this means [url, page title, meta description, h1 text, h2 text], and then it would write it out to a csv named cnet-seo.
 
 
 command-line arguments
@@ -47,10 +58,11 @@ Usage: rr [MODE FLAG] [OPTIONS] Target_URL
 
 Where MODE FLAG is required, and is either:
   -s, --sitemap FORMAT (only accepts CSV or XML atm)
-  -f, --files FILETYPE
+  -f, --files FILETYPE
+  -e, --seo
 
 and OPTIONS is the applicable:
-  -o, --out FILENAME *Dump
+  -o, --out FILENAME *Dump fetch data as CSV*
   -p, --progress *Outputs a progressbar*
   -v, --verbose *Output more information*
   -l, --limit PAGE_LIMIT_# *set a max on the total number of crawled pages*
data/spec/link_spec.rb
CHANGED
@@ -2,8 +2,8 @@ require 'retriever'
 
 describe "Link" do
 
-
-  let(:links) {
+  t = Retriever::Target.new("http://www.cnet.com/reviews/")
+  let(:links) { Retriever::Page.new(@source,t).links }
 
   it "collects links in anchor tags" do
     @source = (<<SOURCE).strip
data/spec/page_spec.rb
ADDED
@@ -0,0 +1,94 @@
+require 'retriever/page'
+require 'retriever/fetch'
+
+t = Retriever::Target.new("http://www.cnet.com/reviews/",/\.exe\z/)
+
+describe "Page" do
+
+  describe "#links" do
+    let (:links){Retriever::Page.new(@source,t).links}
+    it "collects all unique href links on the page" do
+      @source = (<<SOURCE).strip
+<a href='www.cnet.com/download.exe'>download</a>
+<a href='/test.html'>test</a>
+<a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'></a>
+<a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
+<a href='http://www.yahoo.com/test/'>yahoo</a>
+SOURCE
+
+      expect(links).to have(4).items
+    end
+  end
+
+  describe "#parseInternal" do
+    let (:links){Retriever::Page.new(@source,t).parseInternal}
+    it "filters links by host" do
+      @source = (<<SOURCE).strip
+<a href='http://www.cnet.com/'>download</a>
+<a href='http://www.yahoo.com/test/'>yahoo</a>
+SOURCE
+
+      expect(links).to have(1).items
+    end
+  end
+
+  describe "#parseInternalVisitable" do
+    let (:links){Retriever::Page.new(@source,t).parseInternalVisitable}
+    it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
+      @source = (<<SOURCE).strip
+<link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
+SOURCE
+      expect(links).to have(0).items
+    end
+  end
+
+  describe "#parseFiles" do
+    let (:links){Retriever::Page.new(@source,t).parseFiles}
+    it "filters links by filetype" do
+      @source = (<<SOURCE).strip
+<a href='www.cnet.com/download.exe'>download</a>
+http://www.google.com
+<a href='/test.html'>test</a>
+SOURCE
+      expect(links).to have(1).items
+    end
+  end
+
+  describe "#title" do
+    let (:page){Retriever::Page.new(@source,t)}
+    it "returns page title" do
+      @source = (<<SOURCE).strip
+<title>test</title>
+SOURCE
+      expect(page.title).to eq('test')
+    end
+  end
+  describe "#desc" do
+    let (:page){Retriever::Page.new(@source,t)}
+    it "returns meta description" do
+      @source = (<<SOURCE).strip
+<meta name='description' content="test2 ">
+SOURCE
+      expect(page.desc).to eq('test2 ')
+    end
+  end
+  describe "#h1" do
+    let (:page){Retriever::Page.new(@source,t)}
+    it "returns h1 text" do
+      @source = (<<SOURCE).strip
+<h1>test 3</h1>
+SOURCE
+      expect(page.h1).to eq('test 3')
+    end
+  end
+  describe "#h2" do
+    let (:page){Retriever::Page.new(@source,t)}
+    it "returns h2 text" do
+      @source = (<<SOURCE).strip
+<h2> test 4 </h2>
+SOURCE
+      expect(page.h2).to eq(' test 4 ')
+    end
+  end
+
+end
data/spec/retriever_spec.rb
CHANGED
@@ -1,52 +1,5 @@
 require 'retriever'
 
-r = Retriever::Fetch.new("http://www.cnet.com/reviews/",{:file_ext => "exe",:maxpages => "100"})
-test_html = "<a href='www.cnet.com/download.exe'>download</a>
-http://www.google.com
-<a href='/test.html'>test</a>
-<a href='http://www.cnet.com/products/gadgets#view-comments'>gadgets comments</a>
-<a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'></a>
-<a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
-<a href='http://www.yahoo.com/test/'>yahoo</a>
-test.com
-<link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
-<a href='cpage_18'>about</a>"
-
-links_collection = r.fetchLinks(test_html)
-
 describe "Fetch" do
-
-  describe "#fetchLinks" do
-    it "collects all unique href links on the page" do
-      expect(links_collection).to have(6).items
-    end
-  end
-
-  describe "#parseInternalLinks" do
-    let (:filtered_links) {r.parseInternalLinks(links_collection)}
-    it "filters links by host" do
-      filtered_links.each do |link|
-        expect(link).to include("www.cnet.com")
-      end
-    end
-  end
-
-  describe "#parseInternalVisitableLinks" do
-    let (:filtered_links) {r.parseInternalVisitableLinks(links_collection)}
-    it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
-      filtered_links.each do |link|
-        expect(link).to_not (include(".css",".js",".png",".gif",".jpg"))
-      end
-    end
-  end
-
-  describe "#parseFiles" do
-    let(:file_list) {r.parseFiles(links_collection)}
-    it "filters links by filetype" do
-      file_list.each do |link|
-        expect(link).to include(".exe")
-      end
-    end
-  end
-
+
 end
data/spec/target_spec.rb
CHANGED
@@ -1,7 +1,7 @@
 require 'retriever'
 require 'open-uri'
 
-t = Retriever::Target.new("http://www.cnet.com/reviews/")
+t = Retriever::Target.new("http://www.cnet.com/reviews/",/\.exe\z/)
 
 describe "Target" do
 
@@ -14,7 +14,11 @@ describe "Target" do
   end
 
   it "creates host_re var" do
-    expect(t.host_re).to eq(/
+    expect(t.host_re).to eq(/cnet.com/)
+  end
+
+  it "creates file_re var (when provided)" do
+    expect(t.file_re).to eq(/\.exe\z/)
   end
 
   it "adds protocol to Target URL if none given" do
@@ -34,6 +38,7 @@ describe "Target" do
   it "fails if target redirects to new host" do
     expect{Retriever::Target.new("http://tinyurl.com/nkfkypa").source}.to raise_error
   end
+
 end
 
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 1.0.0
 platform: ruby
 authors:
 - Joe Norton
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-
+date: 2014-06-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: em-synchrony
@@ -119,18 +119,23 @@ files:
 - LICENSE
 - bin/rr
 - lib/retriever.rb
+- lib/retriever/cli.rb
 - lib/retriever/fetch.rb
 - lib/retriever/fetchfiles.rb
+- lib/retriever/fetchseo.rb
 - lib/retriever/fetchsitemap.rb
 - lib/retriever/link.rb
+- lib/retriever/openuri-redirect-patch.rb
+- lib/retriever/page.rb
 - lib/retriever/target.rb
 - lib/retriever/version.rb
 - readme.md
 - spec/link_spec.rb
+- spec/page_spec.rb
 - spec/retriever_spec.rb
 - spec/spec_helper.rb
 - spec/target_spec.rb
-homepage: http://
+homepage: http://softwarebyjoe.com/rubyretriever/
 licenses:
 - MIT
 metadata: {}