rubyretriever 0.1.4 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/rr +12 -6
- data/lib/retriever/cli.rb +27 -0
- data/lib/retriever/fetch.rb +84 -45
- data/lib/retriever/fetchfiles.rb +11 -17
- data/lib/retriever/fetchseo.rb +23 -0
- data/lib/retriever/fetchsitemap.rb +10 -14
- data/lib/retriever/openuri-redirect-patch.rb +6 -0
- data/lib/retriever/page.rb +68 -0
- data/lib/retriever/target.rb +20 -9
- data/lib/retriever/version.rb +1 -1
- data/lib/retriever.rb +4 -0
- data/readme.md +18 -6
- data/spec/link_spec.rb +2 -2
- data/spec/page_spec.rb +94 -0
- data/spec/retriever_spec.rb +1 -48
- data/spec/target_spec.rb +7 -2
- metadata +8 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 80eba5c4fdf8c33a19ca35ba37b4493cf44ab57a
+  data.tar.gz: 4dbca842c4f56060e13cfe1c0acf0256321df573
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 48181c41247d85b16db74eb8b7c0a74c23c9740d00d2fd79ecfdb8435efa64d81fb34ccbc81e32911d7fe0e6942c6f7c7c9f91d39feeb434ab078f659ada1341
+  data.tar.gz: 3dace96b1bd42fa2292e8a9db3506983d688e7877b12ca30e550a81be9bad6a7de9907eb2f2b75c6263146a8721d08437dfabe06a493de057dc8a764b57c3a39
data/bin/rr
CHANGED
@@ -1,18 +1,24 @@
 #! /usr/bin/env ruby
+
 require 'retriever'
 require 'optparse'
+
 options = {}
 optparse = OptionParser.new do|opts|
   # Set a banner, displayed at the top
   # of the help screen.
   opts.banner = "Usage: rr [MODE FLAG] [options] Target_URL"
   options[:sitemap] = false
-  opts.on( '-s', '--sitemap FORMAT', 'MODE FLAG: Sitemap mode
-    options[:sitemap] = output_type
+  opts.on( '-s', '--sitemap [FORMAT]', 'MODE FLAG: Sitemap mode' ) do |output_type|
+    options[:sitemap] = output_type||''
   end
   options[:fileharvest] = false
-  opts.on( '-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode
+  opts.on( '-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode' ) do |file_ext|
     options[:fileharvest] = file_ext
+  end
+  options[:seo] = false
+  opts.on( '-e', '--seo', 'MODE FLAG: SEO mode' ) do
+    options[:seo] = true
   end
   options[:filename] = nil
   opts.on( '-o', '--out FILENAME', 'Dump output to selected filename' ) do|filename|
@@ -56,14 +62,14 @@ ARGV.each do|q|
   puts "### Outputting in format: #{options[:sitemap]}" if options[:sitemap]
   puts "### Performing File Harvest" if options[:fileharvest]
   puts "### Searching for file extension: #{options[:fileharvest]} pages" if (options[:fileharvest])
+  puts "### Performing SEO Scrape" if options[:seo]
   puts "### Writting output to filename: #{options[:filename]}" if options[:filename]
   puts "### Being verbose"
-  puts "### Stopping after #{options[:maxpages]} pages"
+  puts "### Stopping after #{options[:maxpages]} pages"
 end
 puts "###############################"
 puts "### [RubyRetriever] go fetch #{q}"
-  Retriever::
-  Retriever::FetchSitemap.new(q, options) if options[:sitemap]
+  Retriever::CLI.new(q, options)
 puts "### [RubyRetriever] is done."
 puts "###############################"
 puts
data/lib/retriever/cli.rb
ADDED
@@ -0,0 +1,27 @@
+module Retriever
+  class CLI
+    def initialize(url,options)
+
+      #kick off the fetch mode of choice
+      if options[:fileharvest]
+        @fetch = Retriever::FetchFiles.new(url, options)
+      elsif options[:sitemap]
+        @fetch = Retriever::FetchSitemap.new(url, options)
+      elsif options[:seo]
+        @fetch = Retriever::FetchSEO.new(url, options)
+      else
+        fail "### Error: No Mode Selected"
+      end
+
+      #all fetch modes
+      @fetch.dump
+      @fetch.write if options[:filename]
+
+      #fileharvest only
+      @fetch.autodownload if options[:autodown] && options[:fileharvest]
+
+      #sitemap only
+      @fetch.gen_xml if /XML/i =~ options[:sitemap].to_s
+    end
+  end
+end
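For orientation, here is a minimal sketch of how the new dispatcher is driven. The option keys mirror the ones parsed in bin/rr above; the URL and values are illustrative placeholders, not taken from the gem's docs.

```ruby
require 'retriever'

# Illustrative values only -- keys match those built by bin/rr above.
options = {
  :seo      => true,       # MODE FLAG, as with `rr -e`
  :maxpages => '50',       # crawl cap, as with `rr -l 50`
  :filename => 'cnet-seo'  # enables @fetch.write, as with `rr -o cnet-seo`
}

# Picks the Fetch subclass from the mode flag, dumps the results, and writes
# the CSV because :filename is set.
Retriever::CLI.new('http://www.cnet.com', options)
```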
data/lib/retriever/fetch.rb
CHANGED
@@ -9,12 +9,14 @@ require 'bloomfilter-rb'
 module Retriever
   class Fetch
     attr_reader :maxPages, :t
-    #constants
-    HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
-    NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico)/ix).freeze
 
     def initialize(url,options)
-      @
+      @connection_tally = {
+        :success => 0,
+        :error => 0,
+        :error_client => 0,
+        :error_server => 0
+      }
       #OPTIONS
       @prgrss = options[:progress] ? options[:progress] : false
       @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
@@ -23,6 +25,7 @@ module Retriever
       @fh = options[:fileharvest] ? options[:fileharvest] : false
       @file_ext = @fh.to_s
       @s = options[:sitemap] ? options[:sitemap] : false
+      @seo = options[:seo] ? true : false
       @autodown = options[:autodown] ? true : false
       #
       if @fh
@@ -30,9 +33,6 @@ module Retriever
         @file_re = Regexp.new(tempExtStr).freeze
       else
         errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
-        if !@output
-          @output = "rr-#{@t.host.split('.')[1]}"
-        end
       end
       if @prgrss
         errlog("CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p") if @v #verbose & progressbar conflict
@@ -44,8 +44,13 @@ module Retriever
         }
         @progressbar = ProgressBar.create(prgressVars)
       end
+      @t = Retriever::Target.new(url,@file_re)
       @already_crawled = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
       @already_crawled.insert(@t.target)
+      if (@fh && !@output)
+        @output = "rr-#{@t.host.split('.')[1]}"
+      end
+      fail "bad page source on target -- try HTTPS?" if !@t.source
     end
     def errlog(msg)
       raise "ERROR: #{msg}"
@@ -53,52 +58,52 @@ module Retriever
     def lg(msg)
       puts "### #{msg}" if @v
     end
-    def dump
+    def dump
       puts "###############################"
+      if @v
+        puts "Connection Tally:"
+        puts @connection_tally.to_s
+        puts "###############################"
+      end
       if @s
         puts "#{@t.target} Sitemap"
-        puts "Page Count: #{data.size}"
+        puts "Page Count: #{@data.size}"
       elsif @fh
         puts "Target URL: #{@t.target}"
        puts "Filetype: #{@file_ext}"
-        puts "File Count: #{data.size}"
+        puts "File Count: #{@data.size}"
+      elsif @seo
+        puts "#{@t.target} SEO Metrics"
+        puts "Page Count: #{@data.size}"
       else
-
+        fail "ERROR - Cannot dump - Mode Not Found"
       end
       puts "###############################"
-
+      @data.each do |line|
+        puts line
+      end
       puts "###############################"
       puts
     end
-    def write
+    def write
       if @output
+        i = 0
        CSV.open("#{@output}.csv", "w") do |csv|
-
-
-
+          if ((i == 0) && @seo)
+            csv << ['URL','Page Title','Meta Description','H1','H2']
+            i +=1
+          end
+          @data.each do |entry|
+            csv << entry
+          end
        end
        puts "###############################"
        puts "File Created: #{@output}.csv"
-        puts "Object Count: #{data.size}"
+        puts "Object Count: #{@data.size}"
        puts "###############################"
        puts
      end
    end
-    #recieves page source as string
-    #returns array of unique href links
-    def fetchLinks(doc)
-      return false if !doc
-      doc.scan(HREF_CONTENTS_RE).map do |match| #filter some malformed URLS that come in, this is meant to be a loose filter to catch all reasonable HREF attributes.
-        link = match[0]
-        Link.new(@t.host, link).path
-      end.uniq
-    end
-    def parseInternalLinks(all_links)
-      all_links.select{ |linky| (@t.host_re =~ linky) }
-    end
-    def parseInternalVisitableLinks(all_links)
-      parseInternalLinks(all_links).select{ |linky| (!(NONPAGE_EXT_RE =~linky)) }
-    end
     def async_crawl_and_collect()
       while (@already_crawled.size < @maxPages)
         if @linkStack.empty?
@@ -112,11 +117,41 @@ module Retriever
         new_links_arr = self.asyncGetWave()
         next if (new_links_arr.nil? || new_links_arr.empty?)
         new_link_arr = new_links_arr-@linkStack#set operations to see are these in our previous visited pages arr?
-        @linkStack.concat(new_links_arr)
-        @
+        @linkStack.concat(new_links_arr).uniq!
+        @data.concat(new_links_arr) if @s
       end
       @progressbar.finish if @prgrss
     end
+    def good_response?(resp, url) #returns true is resp is ok to continue process, false is we need to 'next' it
+      return false if !resp
+      if resp.response_header.redirection? #we got redirected
+        loc = resp.response_header.location
+        lg("#{url} Redirected to #{loc}")
+        if t.host_re =~ loc #if being redirected to same host, let's add to linkstack
+          @linkStack.push(loc) if !@already_crawled.include?(loc) #but only if we haven't already crawled it
+          lg("--Added to linkStack for later")
+          return false
+        end
+        lg("Redirection outside of target host. No - go. #{loc}")
+        return false
+      end
+      if (!resp.response_header.successful?) #if webpage is not text/html, let's not continue and lets also make sure we dont re-queue it
+        lg("UNSUCCESSFUL CONNECTION -- #{url}")
+        @connection_tally[:error] += 1
+        @connection_tally[:error_server] += 1 if resp.response_header.server_error?
+        @connection_tally[:error_client] += 1 if resp.response_header.client_error?
+        return false
+      end
+      if (!(resp.response_header['CONTENT_TYPE'].include?("text/html"))) #if webpage is not text/html, let's not continue and lets also make sure we dont re-queue it
+        @already_crawled.insert(url)
+        @linkStack.delete(url)
+        lg("Page Not text/html -- #{url}")
+        return false
+      end
+      @connection_tally[:success] += 1
+      return true
+    end
+
     def asyncGetWave() #send a new wave of GET requests, using current @linkStack
       new_stuff = []
       EM.synchrony do
@@ -129,20 +164,27 @@ module Retriever
           next
         end
         resp = EventMachine::HttpRequest.new(url).get
-
+        next if !good_response?(resp,url)
+        new_page = Retriever::Page.new(resp.response,@t)
+        lg("Page Fetched: #{url}")
         @already_crawled.insert(url)
         if @prgrss
           @progressbar.increment if @already_crawled.size < @maxPages
         end
-
-
-
-
+        if @seo
+          seos = [url]
+          seos.concat(new_page.parseSEO)
+          @data.push(seos)
+          lg("--page SEO scraped")
+        end
+        if new_page.links
+          lg("--#{new_page.links.size} links found")
+          internal_links_arr = new_page.parseInternalVisitable
          new_stuff.push(internal_links_arr)
          if @fh
-            filez =
-            @
-            lg("
+            filez = new_page.parseFiles
+            @data.concat(filez) if !filez.empty?
+            lg("--#{filez.size} files found")
          end
        end
      end
@@ -151,8 +193,5 @@ module Retriever
      end
      new_stuff.uniq!
    end
-    def parseFiles(all_links)
-      all_links.select{ |linky| (@file_re =~ linky)}
-    end
  end
end
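One change worth calling out: the removed helpers (fetchLinks, parseInternalLinks, parseInternalVisitableLinks, parseFiles) are not gone, they re-surface as methods on the new Retriever::Page class diffed further down. A rough before/after sketch, where html and target are placeholders for a fetched body and a Retriever::Target:

```ruby
# 0.1.4 -- link parsing lived on the Fetch instance:
#   links = fetch.fetchLinks(html)
#   fetch.parseInternalVisitableLinks(links)
#   fetch.parseFiles(links)

# 1.0.0 -- the same filtering hangs off a per-page object:
page = Retriever::Page.new(html, target)  # html, target are placeholders
page.links                    # unique hrefs scraped from the source
page.parseInternalVisitable   # same-host links, minus css/js/images/etc.
page.parseFiles               # links matching target.file_re
```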
data/lib/retriever/fetchfiles.rb
CHANGED
@@ -1,17 +1,16 @@
 module Retriever
   class FetchFiles < Fetch
-    attr_reader :fileStack
     def initialize(url,options)
       super
-      @
-
-      @linkStack =
+      @data = []
+      page_one = Retriever::Page.new(@t.source,@t)
+      @linkStack = page_one.parseInternalVisitable
       lg("URL Crawled: #{@t.target}")
-
+      lg("#{@linkStack.size-1} new links found")
 
-      tempFileCollection =
-      @
-
+      tempFileCollection = page_one.parseFiles
+      @data.concat(tempFileCollection) if tempFileCollection.size>0
+      lg("#{@data.size} new files found")
       errlog("Bad URL -- #{@t.target}") if !@linkStack
 
       @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
@@ -19,19 +18,14 @@ module Retriever
 
       self.async_crawl_and_collect()
 
-      @
-      @
-
-      self.dump(self.fileStack)
-      self.write(@output,self.fileStack) if @output
-      self.autodownload() if @autodown
+      @data.sort_by! {|x| x.length}
+      @data.uniq!
     end
     def download_file(path)
       arr = path.split('/')
       shortname = arr.pop
       puts "Initiating Download to: #{'/rr-downloads/' + shortname}"
       File.open(shortname, "wb") do |saved_file|
-        # the following "open" is provided by open-uri
         open(path) do |read_file|
           saved_file.write(read_file.read)
         end
@@ -39,7 +33,7 @@ module Retriever
     puts " SUCCESS: Download Complete"
     end
     def autodownload()
-      lenny = @
+      lenny = @data.count
       puts "###################"
       puts "### Initiating Autodownload..."
       puts "###################"
@@ -53,7 +47,7 @@ module Retriever
         Dir.chdir("rr-downloads")
       end
       file_counter = 0
-      @
+      @data.each do |entry|
        begin
          self.download_file(entry)
          file_counter+=1
data/lib/retriever/fetchseo.rb
ADDED
@@ -0,0 +1,23 @@
+module Retriever
+  class FetchSEO < Fetch
+    def initialize(url,options)
+      super
+      @data = []
+      page_one = Retriever::Page.new(@t.source,@t)
+      @linkStack = page_one.parseInternalVisitable
+      lg("URL Crawled: #{@t.target}")
+      lg("#{@linkStack.size-1} new links found")
+
+      @data.push(page_one.parseSEO)
+      lg("#{@data.size} pages scraped")
+      errlog("Bad URL -- #{@t.target}") if !@linkStack
+
+      @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
+      @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
+
+      self.async_crawl_and_collect()
+
+      @data.sort_by! {|x| x[0].length}
+    end
+  end
+end
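A hedged usage sketch for the new SEO fetcher; the option keys come from Fetch#initialize above, and the URL and page limit are placeholders:

```ruby
require 'retriever'

# Crawls up to :maxpages pages and collects one
# [url, page title, meta description, h1, h2] row per page.
fetch = Retriever::FetchSEO.new('http://www.cnet.com', :seo => true, :maxpages => '10')
fetch.dump  # prints "<target> SEO Metrics" followed by the collected rows
```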
data/lib/retriever/fetchsitemap.rb
CHANGED
@@ -1,38 +1,34 @@
 module Retriever
   class FetchSitemap < Fetch
-    attr_reader :sitemap
     def initialize(url,options)
       super
-      @
-
+      @data = [@t.target]
+      page_one = Retriever::Page.new(@t.source,@t)
+      @linkStack = page_one.parseInternalVisitable
       lg("URL Crawled: #{@t.target}")
-
+      lg("#{@linkStack.size-1} new links found")
       errlog("Bad URL -- #{@t.target}") if !@linkStack
 
       @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
       @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
-      @
+      @data.concat(@linkStack)
 
       self.async_crawl_and_collect()
 
-      @
-      @
-
-      self.dump(self.sitemap)
-      self.write(self.sitemap) if /CSV/i =~ @s
-      self.gen_xml(self.sitemap) if /XML/i =~ @s
+      @data.sort_by! {|x| x.length} if @data.size>1
+      @data.uniq!
     end
-    def gen_xml
+    def gen_xml
       f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
       f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
-      data.each do |url|
+      @data.each do |url|
        f << "<url><loc>#{url}</loc></url>"
      end
      f << "</urlset>"
      f.close
      puts "###############################"
      puts "File Created: sitemap-#{@t.host.split('.')[1]}.xml"
-      puts "Object Count: #{@
+      puts "Object Count: #{@data.size}"
      puts "###############################"
      puts
    end
data/lib/retriever/page.rb
ADDED
@@ -0,0 +1,68 @@
+module Retriever
+
+  class Page
+
+    HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
+    NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico|xml)/ix).freeze
+    HTTP_RE = Regexp.new(/^http/i).freeze
+    DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+
+    TITLE_RE = Regexp.new(/<title>(.*)<\/title>/i).freeze
+    DESC_RE = Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\'][^>]*content=[\"]([^\"]*)[\"][^>]*>/i).freeze
+    H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
+    H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze
+
+    attr_reader :links, :source, :t
+
+    def initialize(source,t)
+      @t = t
+      @source = source.encode('UTF-8', :invalid => :replace, :undef => :replace)
+      @links = nil
+    end
+
+    #recieves page source as string
+    #returns array of unique href links
+    def links
+      return @links if @links
+      return false if !@source
+      @links = @source.scan(HREF_CONTENTS_RE).map do |match| #filter some malformed URLS that come in, this is meant to be a loose filter to catch all reasonable HREF attributes.
+        link = match[0]
+        Link.new(@t.host, link).path
+      end.uniq
+    end
+
+    def parseInternal
+      links.select{ |linky| (@t.host_re =~ linky) }
+    end
+
+    def parseInternalVisitable
+      parseInternal.select{ |linky| (!(NONPAGE_EXT_RE =~linky)) }
+    end
+
+    def parseFiles
+      links.select{ |linky| (@t.file_re =~ linky)}
+    end
+
+    def title
+      TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : ""
+    end
+
+    def desc
+      DESC_RE =~ @source ? @source.match(DESC_RE)[1] : ""
+    end
+
+    def h1
+      H1_RE =~ @source ? @source.match(H1_RE)[1] : ""
+    end
+
+    def h2
+      H2_RE =~ @source ? @source.match(H2_RE)[1] : ""
+    end
+
+    def parseSEO
+      return [title,desc,h1,h2]
+    end
+
+  end
+
+end
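Page is the main new abstraction in 1.0.0, so a small sketch of it in isolation may help; the HTML string and target URL below are made up for illustration:

```ruby
require 'retriever'

# Target is only used here for its host / host_re / file_re; nothing is fetched.
t    = Retriever::Target.new('http://www.example.com/', /\.pdf\z/)
html = "<title>Hello</title> <h1>Intro</h1> <a href='/about.html'>about</a> "
page = Retriever::Page.new(html, t)

page.title     #=> "Hello"
page.h1        #=> "Intro"
page.parseSEO  #=> ["Hello", "", "Intro", ""]
page.links     # unique hrefs from the source, resolved via Retriever::Link
```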
data/lib/retriever/target.rb
CHANGED
@@ -1,17 +1,22 @@
 require 'open-uri'
 
 module Retriever
+
   class Target
+
     HTTP_RE = Regexp.new(/^http/i).freeze
     DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
-
-
+
+    attr_reader :host, :target, :host_re, :source, :file_re
+
+    def initialize(url,file_re=nil)
       url = "http://#{url}" if (!(HTTP_RE =~ url))
       fail "Bad URL" if (!(/\./ =~ url))
       new_uri = URI(url)
       @target = new_uri.to_s
       @host = new_uri.host
-      @host_re = Regexp.new(@host)
+      @host_re = Regexp.new(@host.sub('www.',''))
+      @file_re ||= file_re
     end
 
     def source
@@ -19,23 +24,29 @@ module Retriever
       begin
         resp = open(@target)
       rescue StandardError => e
-        #puts e.message + " ## " + url
-        #the trap abrt is nescessary to handle the SSL error
-        #for some ungodly reason it's the only way I found to handle it
        trap("ABRT"){
          puts "#{@target} failed SSL Certification Verification"
        }
        return false
      end
-
-
+      resp_url = resp.base_uri.to_s
+      if (@target != resp_url)
+        if @host_re =~ resp_url #if redirect URL is same hose, we want to re-sync our target with the right URL
+          new_t = Retriever::Target.new(resp_url)
+          @target = new_t.target
+          @host = new_t.host
+          return new_t.source
+        end
+        fail "Domain redirecting to new host: #{resp.base_uri.to_s}" #if it's not same host, we want to fail
      end
      resp = resp.read
      if resp == ""
        fail "Domain is not working. Try the non-WWW version."
      end
-
+      fail "Domain not working. Try HTTPS???" if !resp
+      return resp.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace) #consider using scrub from ruby 2.1? this misses some things
    end
 
  end
+
 end
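A short sketch of the Target changes in isolation; the URL and file regex are illustrative:

```ruby
require 'retriever'

t = Retriever::Target.new('www.example.com/docs', /\.pdf\z/)
t.target   #=> "http://www.example.com/docs"  (protocol added when missing)
t.host     #=> "www.example.com"
t.host_re  #=> /example.com/  ('www.' stripped, so same-host redirects still match)
t.file_re  #=> /\.pdf\z/

# Target#source now re-syncs @target/@host when the server redirects within the
# same host, and fails loudly when the redirect points at a different host.
```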
data/lib/retriever/version.rb
CHANGED
data/lib/retriever.rb
CHANGED
@@ -1,8 +1,12 @@
 require 'retriever/fetch'
 require 'retriever/fetchfiles'
 require 'retriever/fetchsitemap'
+require 'retriever/fetchseo'
+require 'retriever/cli'
 require 'retriever/link'
 require 'retriever/target'
+require 'retriever/page'
+require 'retriever/openuri-redirect-patch'
 
 module Retriever
 
data/readme.md
CHANGED
@@ -17,7 +17,7 @@ Install the gem
 ```sh
 gem install rubyretriever
 ```
-
+
 **Example: Sitemap mode**
 ```sh
 rr --sitemap CSV --progress --limit 100 http://www.cnet.com
@@ -31,14 +31,25 @@ This would go to http://www.cnet.com and map it until it crawled a max of 100 pa
 
 **Example: File Harvesting mode**
 ```sh
-rr --files pdf --progress --limit 1000 --
+rr --files pdf --progress --limit 1000 --out hubspot http://www.hubspot.com
+```
+OR -- SAME COMMAND
+```sh
+rr -f pdf -p -l 100 http://www.hubspot.com
+```
+
+This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 100 pages, and then it would write out a list of filepaths to a csv named hubspot (based on the website host name. Optionally we could have the script then go and autodownload all the files by adding the -a/--auto flag -- however this current example would just dump to stdout a list of all the PDF's found.
+
+**Example: SEO mode**
+```sh
+rr --seo --progress --limit 100 --out cnet-seo http://www.cnet.com
 ```
 OR -- SAME COMMAND
 ```sh
-rr -
+rr -e -p -l 10 -o cnet-seo http://www.cnet.com
 ```
 
-This would go to http://www.
+This would go to http://www.cnet.com and crawl a max of 100 pages, during which it would be collecting the onpage SEO fields on those pages - currently this means [url, page title, meta description, h1 text, h2 text], and then it would write it out to a csv named cnet-seo.
 
 
 command-line arguments
@@ -47,10 +58,11 @@ Usage: rr [MODE FLAG] [OPTIONS] Target_URL
 
 Where MODE FLAG is required, and is either:
   -s, --sitemap FORMAT (only accepts CSV or XML atm)
-  -f, --files FILETYPE
+  -f, --files FILETYPE
+  -e, --seo
 
 and OPTIONS is the applicable:
-  -o, --out FILENAME *Dump
+  -o, --out FILENAME *Dump fetch data as CSV*
   -p, --progress *Outputs a progressbar*
   -v, --verbose *Output more information*
   -l, --limit PAGE_LIMIT_# *set a max on the total number of crawled pages*
data/spec/link_spec.rb
CHANGED
@@ -2,8 +2,8 @@ require 'retriever'
 
 describe "Link" do
 
-
-  let(:links) {
+  t = Retriever::Target.new("http://www.cnet.com/reviews/")
+  let(:links) { Retriever::Page.new(@source,t).links }
 
   it "collects links in anchor tags" do
     @source = (<<SOURCE).strip
data/spec/page_spec.rb
ADDED
@@ -0,0 +1,94 @@
+require 'retriever/page'
+require 'retriever/fetch'
+
+t = Retriever::Target.new("http://www.cnet.com/reviews/",/\.exe\z/)
+
+describe "Page" do
+
+  describe "#links" do
+    let (:links){Retriever::Page.new(@source,t).links}
+    it "collects all unique href links on the page" do
+      @source = (<<SOURCE).strip
+      <a href='www.cnet.com/download.exe'>download</a>
+      <a href='/test.html'>test</a>
+      <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'></a>
+      <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
+      <a href='http://www.yahoo.com/test/'>yahoo</a>
+SOURCE
+
+      expect(links).to have(4).items
+    end
+  end
+
+  describe "#parseInternal" do
+    let (:links){Retriever::Page.new(@source,t).parseInternal}
+    it "filters links by host" do
+      @source = (<<SOURCE).strip
+      <a href='http://www.cnet.com/'>download</a>
+      <a href='http://www.yahoo.com/test/'>yahoo</a>
+SOURCE
+
+      expect(links).to have(1).items
+    end
+  end
+
+  describe "#parseInternalVisitable" do
+    let (:links){Retriever::Page.new(@source,t).parseInternalVisitable}
+    it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
+      @source = (<<SOURCE).strip
+      <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
+SOURCE
+      expect(links).to have(0).items
+    end
+  end
+
+  describe "#parseFiles" do
+    let (:links){Retriever::Page.new(@source,t).parseFiles}
+    it "filters links by filetype" do
+      @source = (<<SOURCE).strip
+      <a href='www.cnet.com/download.exe'>download</a>
+      http://www.google.com
+      <a href='/test.html'>test</a>
+SOURCE
+      expect(links).to have(1).items
+    end
+  end
+
+  describe "#title" do
+    let (:page){Retriever::Page.new(@source,t)}
+    it "returns page title" do
+      @source = (<<SOURCE).strip
+      <title>test</title>
+SOURCE
+      expect(page.title).to eq('test')
+    end
+  end
+  describe "#desc" do
+    let (:page){Retriever::Page.new(@source,t)}
+    it "returns meta description" do
+      @source = (<<SOURCE).strip
+      <meta name='description' content="test2 ">
+SOURCE
+      expect(page.desc).to eq('test2 ')
+    end
+  end
+  describe "#h1" do
+    let (:page){Retriever::Page.new(@source,t)}
+    it "returns h1 text" do
+      @source = (<<SOURCE).strip
+      <h1>test 3</h1>
+SOURCE
+      expect(page.h1).to eq('test 3')
+    end
+  end
+  describe "#h2" do
+    let (:page){Retriever::Page.new(@source,t)}
+    it "returns h2 text" do
+      @source = (<<SOURCE).strip
+      <h2> test 4 </h2>
+SOURCE
+      expect(page.h2).to eq(' test 4 ')
+    end
+  end
+
+end
data/spec/retriever_spec.rb
CHANGED
@@ -1,52 +1,5 @@
 require 'retriever'
 
-r = Retriever::Fetch.new("http://www.cnet.com/reviews/",{:file_ext => "exe",:maxpages => "100"})
-test_html = "<a href='www.cnet.com/download.exe'>download</a>
-http://www.google.com
-<a href='/test.html'>test</a>
-<a href='http://www.cnet.com/products/gadgets#view-comments'>gadgets comments</a>
-<a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'></a>
-<a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
-<a href='http://www.yahoo.com/test/'>yahoo</a>
-test.com
-<link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
-<a href='cpage_18'>about</a>"
-
-links_collection = r.fetchLinks(test_html)
-
 describe "Fetch" do
-
-  describe "#fetchLinks" do
-    it "collects all unique href links on the page" do
-      expect(links_collection).to have(6).items
-    end
-  end
-
-  describe "#parseInternalLinks" do
-    let (:filtered_links) {r.parseInternalLinks(links_collection)}
-    it "filters links by host" do
-      filtered_links.each do |link|
-        expect(link).to include("www.cnet.com")
-      end
-    end
-  end
-
-  describe "#parseInternalVisitableLinks" do
-    let (:filtered_links) {r.parseInternalVisitableLinks(links_collection)}
-    it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
-      filtered_links.each do |link|
-        expect(link).to_not (include(".css",".js",".png",".gif",".jpg"))
-      end
-    end
-  end
-
-  describe "#parseFiles" do
-    let(:file_list) {r.parseFiles(links_collection)}
-    it "filters links by filetype" do
-      file_list.each do |link|
-        expect(link).to include(".exe")
-      end
-    end
-  end
-
+
 end
data/spec/target_spec.rb
CHANGED
@@ -1,7 +1,7 @@
 require 'retriever'
 require 'open-uri'
 
-t = Retriever::Target.new("http://www.cnet.com/reviews/")
+t = Retriever::Target.new("http://www.cnet.com/reviews/",/\.exe\z/)
 
 describe "Target" do
 
@@ -14,7 +14,11 @@ describe "Target" do
   end
 
   it "creates host_re var" do
-    expect(t.host_re).to eq(/
+    expect(t.host_re).to eq(/cnet.com/)
+  end
+
+  it "creates file_re var (when provided)" do
+    expect(t.file_re).to eq(/\.exe\z/)
   end
 
   it "adds protocol to Target URL if none given" do
@@ -34,6 +38,7 @@ describe "Target" do
   it "fails if target redirects to new host" do
     expect{Retriever::Target.new("http://tinyurl.com/nkfkypa").source}.to raise_error
   end
+
 end
 
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 0.1.4
+  version: 1.0.0
 platform: ruby
 authors:
 - Joe Norton
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-
+date: 2014-06-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: em-synchrony
@@ -119,18 +119,23 @@ files:
 - LICENSE
 - bin/rr
 - lib/retriever.rb
+- lib/retriever/cli.rb
 - lib/retriever/fetch.rb
 - lib/retriever/fetchfiles.rb
+- lib/retriever/fetchseo.rb
 - lib/retriever/fetchsitemap.rb
 - lib/retriever/link.rb
+- lib/retriever/openuri-redirect-patch.rb
+- lib/retriever/page.rb
 - lib/retriever/target.rb
 - lib/retriever/version.rb
 - readme.md
 - spec/link_spec.rb
+- spec/page_spec.rb
 - spec/retriever_spec.rb
 - spec/spec_helper.rb
 - spec/target_spec.rb
-homepage: http://
+homepage: http://softwarebyjoe.com/rubyretriever/
 licenses:
 - MIT
 metadata: {}