rubyretriever 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +20 -0
- data/bin/rr +80 -0
- data/lib/retriever.rb +19 -0
- data/lib/retriever/fetch.rb +196 -0
- data/lib/retriever/fetchfiles.rb +70 -0
- data/lib/retriever/fetchsitemap.rb +25 -0
- data/lib/retriever/version.rb +3 -0
- data/readme.md +79 -0
- data/spec/retriever_spec.rb +65 -0
- data/spec/spec_helper.rb +17 -0
- metadata +153 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: e4e4773d62ec74a55bb3d9afc78622229e7db559
+  data.tar.gz: a2186b11bb3eabdec8c312e0b3365961c97805de
+SHA512:
+  metadata.gz: ae392a910a3c7a6f2b3f9097d82978ba35ba413da46f4e5ce30b9e728bac2fb40f61c3a110ea59b9f59d75d79b0fcf85a8a783597aeaccd31c833b2c3753bd4d
+  data.tar.gz: 72fe87613059ccae6022c65ad5db70445cc4470028d081e918ffb9dfc122adb5dd8c9967abe2622a8daaeea59d878340183597a73768f32dd42679b7b159d64d
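The digests above cover the two archives packed inside the published `.gem` file. As an illustration only (not part of the package), the following sketch recomputes them from a locally fetched copy; the filename and the use of Ruby's bundled `rubygems/package` tar reader are assumptions about a standard RubyGems download.

```ruby
# Illustrative sketch: recompute the SHA1/SHA512 digests listed in checksums.yaml
# from a locally downloaded gem file (assumed filename). A .gem is a tar archive
# whose members include metadata.gz and data.tar.gz.
require 'digest'
require 'rubygems/package'

File.open('rubyretriever-0.0.8.gem', 'rb') do |io|
  Gem::Package::TarReader.new(io).each do |entry|
    next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
    body = entry.read
    puts "#{entry.full_name} SHA1:   #{Digest::SHA1.hexdigest(body)}"
    puts "#{entry.full_name} SHA512: #{Digest::SHA512.hexdigest(body)}"
  end
end
```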
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
+2014 (c) Joseph Michael Norton - 'Joe Norton' - SoftwareByJoe.com
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/bin/rr
ADDED
@@ -0,0 +1,80 @@
+#! /usr/bin/env ruby
+require_relative('../lib/retriever.rb')
+options = {}
+optparse = OptionParser.new do|opts|
+  # Set a banner, displayed at the top
+  # of the help screen.
+  opts.banner = "Usage: rr [options] Target_URL"
+
+  options[:filename] = nil
+  opts.on( '-o', '--out FILENAME', 'Dump output to selected filename' ) do|filename|
+    options[:filename] = filename
+  end
+  # Define the options, and what they do
+  options[:verbose] = false
+  opts.on( '-v', '--verbose', 'Output more information' ) do
+    options[:verbose] = true
+  end
+
+  options[:progress] = false
+  opts.on( '-p', '--progressbar', 'Output more information' ) do
+    options[:progress] = true
+  end
+
+  options[:sitemap] = false
+  opts.on( '-s', '--sitemap', 'Crawl site and output sitemap' ) do
+    options[:sitemap] = true
+  end
+
+  options[:fileharvest] = false
+  opts.on( '-f', '--files', 'Crawl site and collect links for files found' ) do
+    options[:fileharvest] = true
+  end
+
+  options[:maxpages] = false
+  opts.on( '-l', '--limit PAGE_LIMIT_#', 'set a max on the total number of crawled pages' ) do |maxpages|
+    options[:maxpages] = maxpages
+  end
+
+  options[:file_ext] = false
+  opts.on( '-e', '--ext FILE_EXTENSION', 'set a file extension to look for on crawled pages' ) do |file_ext|
+    options[:file_ext] = file_ext
+  end
+
+  options[:autodown] = false
+  opts.on( '-a', '--autodown', 'Automatically download all files of filetype located' ) do
+    options[:autodown] = true
+  end
+
+  # This displays the help screen, all programs are
+  # assumed to have this option.
+  opts.on( '-h', '--help', 'Display this screen' ) do
+    puts opts
+    exit
+  end
+end
+
+optparse.parse!
+if ARGV[0].nil?
+  abort("###Missing Required Argument\nUsage: rr [mode] [options] Target_URL")
+end
+
+ARGV.each do|q|
+  if options[:verbose]
+    puts "###############################"
+    puts "### [RubyRetriever]"
+    puts "### Creating Sitemap" if options[:sitemap]
+    puts "### Performing File Harvest" if options[:fileharvest]
+    puts "### Searching for file extension: #{options[:file_ext]} pages" if (options[:file_ext])
+    puts "### Writting output to filename: #{options[:filename]}" if options[:filename]
+    puts "### Being verbose"
+    puts "### Stopping after #{options[:maxpages]} pages" if options[:maxpages]
+  end
+  puts "###############################"
+  puts "### [RubyRetriever] go fetch #{q}"
+  test = Retriever::FetchFiles.new(q, options) if options[:fileharvest]
+  test = Retriever::FetchSitemap.new(q, options) if options[:sitemap]
+  puts "### [RubyRetriever] is done."
+  puts "###############################"
+  puts
+end
data/lib/retriever.rb
ADDED
@@ -0,0 +1,19 @@
+##################################################################
+#####RubyRetriever -- web crawler and file harvester
+#####created by Joe Norton
+#####http://softwarebyjoe.com
+##LICENSING: GNU GPLv3 License##################################
+#! usr/bin/ruby
+require 'em-synchrony'
+require 'em-synchrony/em-http'
+require 'em-synchrony/fiber_iterator'
+require 'ruby-progressbar'
+require 'open-uri'
+require 'optparse'
+require 'uri'
+require 'csv'
+require 'bloomfilter-rb'
+
+require 'retriever/fetch'
+require 'retriever/fetchfiles'
+require 'retriever/fetchsitemap'
data/lib/retriever/fetch.rb
ADDED
@@ -0,0 +1,196 @@
+module Retriever
+  class Fetch
+    attr_reader :target, :host, :host_re, :maxPages
+    #constants
+    HTTP_RE = Regexp.new(/^http/i).freeze
+    HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
+    NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico)/ix).freeze
+    SINGLE_SLASH_RE = Regexp.new(/^\/{1}[^\/]/).freeze
+    DOUBLE_SLASH_RE = Regexp.new(/^\/{2}[^\/]/).freeze
+    NO_SLASH_PAGE_RE = Regexp.new(/^[a-z0-9\-\_\=\?\.]+\z/ix).freeze
+    DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+
+    def initialize(url,options)
+      new_uri = URI(url)
+      @target = new_uri.to_s
+      @host = new_uri.host
+      #OPTIONS
+      @prgrss = options[:progress] ? options[:progress] : false
+      @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
+      @v= options[:verbose] ? true : false
+      @output=options[:filename] ? options[:filename] : false
+      @fh = options[:fileharvest] ? true : false
+      @s = options[:sitemap] ? true : false
+      @file_ext = options[:file_ext] ? options[:file_ext] : false
+      @autodown = options[:autodown] ? true : false
+      #
+      @host_re = Regexp.new(host).freeze
+      if @fh
+        errlog("Please provide a FILETYPE. It is required for file harvest mode.") if !@file_ext
+        tempExtStr = "."+@file_ext+'\z'
+        @file_re = Regexp.new(tempExtStr).freeze
+      else
+        errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
+      end
+      if @prgrss
+        errlog("CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p") if @v #verbose & progressbar conflict
+        prgressVars = {
+          :title => "Pages Crawled",
+          :starting_at => 1,
+          :total => @maxPages,
+          :format => '%a |%b>%i| %c/%C %t',
+        }
+        @progressbar = ProgressBar.create(prgressVars)
+      end
+      @already_crawled = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
+      @already_crawled.insert(@target)
+    end
+    def errlog(msg)
+      raise "ERROR: #{msg}"
+    end
+    def lg(msg)
+      puts "### #{msg}" if @v
+    end
+    def dump(data)
+      puts "###############################"
+      if @s
+        puts "#{@target} Sitemap"
+        puts "Page Count: #{data.size}"
+      elsif @fh
+        puts "Target URL: #{@target}"
+        puts "Filetype: #{@file_ext}"
+        puts "File Count: #{data.size}"
+      else
+        puts "ERROR"
+      end
+      puts "###############################"
+      puts data
+      puts "###############################"
+      puts
+    end
+    def write(data)
+      if @output
+        CSV.open("#{@output}.csv", "w") do |csv|
+          data.each do |entry|
+            csv << [entry]
+          end
+        end
+        puts "###############################"
+        puts "File Created: #{filename}.csv"
+        puts "Object Count: #{data.size}"
+        puts "###############################"
+        puts
+      end
+    end
+    def fetchPage(url)
+      resp = false
+      EM.synchrony do
+        begin
+          resp = EventMachine::HttpRequest.new(url).get
+        rescue StandardError => e
+          #puts e.message + " ## " + url
+          #the trap abrt is nescessary to handle the SSL error
+          #for some ungodly reason it's the only way I found to handle it
+          trap("ABRT"){
+            puts "#{url} failed SSL Certification Verification"
+          }
+          return false
+        end
+        lg("URL Crawled: #{url}")
+        EventMachine.stop
+      end
+      if resp.response == ""
+        errlog("Domain is not working. Try the non-WWW version.")
+      end
+      return resp.response.encode('UTF-8', :invalid => :replace, :undef => :replace) #.force_encoding('UTF-8') #ran into issues with some sites without forcing UTF8 encoding, and also issues with it. Not sure atm.
+    end
+    #recieves page source as string
+    #returns array of unique href links
+    def fetchLinks(doc)
+      return false if !doc
+      linkArray = []
+      doc.scan(HREF_CONTENTS_RE) do |arr| #filter some malformed URLS that come in, this is meant to be a loose filter to catch all reasonable HREF attributes.
+        link = arr[0]
+        if (!(HTTP_RE =~ link))
+          if (DUB_DUB_DUB_DOT_RE =~ link)
+            link = "http://#{link}"
+          elsif SINGLE_SLASH_RE =~ link #link uses relative path
+            link = "http://#{@host}"+link #appending hostname to relative paths
+          elsif DOUBLE_SLASH_RE =~ link #link begins with '//' (maybe a messed up link?)
+            link = "http:#{link}" #appending current url to relative paths
+          elsif (NO_SLASH_PAGE_RE =~ link) #link uses relative path with no slashes at all, people actually this - imagine that.
+            link = "http://#{@host}"+"/"+link #appending hostname and slashy to create full paths
+          else
+            next
+          end
+        end
+        linkArray.push(link)
+      end
+      linkArray.uniq!
+    end
+    def parseInternalLinks(all_links)
+      if all_links
+        all_links.select{ |linky| (@host_re =~ linky && (!(NONPAGE_EXT_RE =~linky)))}
+      else
+        return false
+      end
+    end
+    def async_crawl_and_collect()
+      while (@already_crawled.size < @maxPages)
+        if @linkStack.empty?
+          if @prgrss
+            @progressbar.log("Can't find any more links. Site might be completely mapped.")
+          else
+            lg("Can't find any more links. Site might be completely mapped.")
+          end
+          break;
+        end
+        #puts "New loop"
+        #puts @linkStack
+        new_links_arr = self.asyncGetWave()
+        next if (new_links_arr.nil? || new_links_arr.empty?)
+        new_link_arr = new_links_arr-@linkStack#set operations to see are these in our previous visited pages arr?
+        @linkStack.concat(new_links_arr)
+        @sitemap.concat(new_links_arr) if @s
+      end
+    end
+    def asyncGetWave() #send a new wave of GET requests, using current @linkStack
+      new_stuff = []
+      EM.synchrony do
+        lenny = 0
+        concurrency = 10
+        EM::Synchrony::FiberIterator.new(@linkStack, concurrency).each do |url|
+          next if (@already_crawled.size >= @maxPages)
+          if @already_crawled.include?(url)
+            @linkStack.delete(url)
+            next
+          else
+            @already_crawled.insert(url)
+          end
+          resp = EventMachine::HttpRequest.new(url).get
+          lg("URL Crawled: #{url}")
+          if @prgrss
+            @progressbar.increment if @already_crawled.size < @maxPages
+          end
+          new_links_arr = self.fetchLinks(resp.response)
+          if new_links_arr
+            lg("#{new_links_arr.size} new links found")
+            internal_links_arr = self.parseInternalLinks(new_links_arr)
+            new_stuff.push(internal_links_arr)
+            if @fh
+              filez = self.parseFiles(new_links_arr)
+              @fileStack.concat(filez) if !filez.empty?
+              lg("#{filez.size} files found")
+            end
+          end
+        end
+        new_stuff = new_stuff.flatten # all completed requests
+        EventMachine.stop
+      end
+      new_stuff.uniq!
+    end
+    def parseFiles(all_links)
+      all_links.select{ |linky| (@file_re =~ linky)}
+    end
+  end
+end
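`Fetch` is the base class both crawl modes build on; the spec file later in this diff drives it directly against an in-memory HTML string. A minimal sketch of that kind of direct use, with a placeholder host and made-up links (not part of the gem's shipped files):

```ruby
# Minimal sketch (placeholder URL and links): using Retriever::Fetch directly,
# the way data/spec/retriever_spec.rb does, without kicking off a crawl.
# Assumes the gem and its dependencies are installed.
require 'retriever'

html = "<a href='/about.html'>about</a> " \
       "<a href='http://example.com/setup.exe'>download</a> " \
       "<a href='http://example.com/setup.exe'>mirror</a>"

fetcher = Retriever::Fetch.new("http://example.com/",
                               { :fileharvest => true, :file_ext => "exe", :maxpages => "10" })

links = fetcher.fetchLinks(html)            # hrefs normalized to absolute URLs
pages = fetcher.parseInternalLinks(links)   # same-host links, asset extensions filtered out
files = fetcher.parseFiles(links)           # links ending in the requested extension

puts pages
puts files
```

Note that `fetchLinks` returns the result of `Array#uniq!`, which is `nil` when nothing was removed, so the sample HTML above deliberately repeats one link; the spec's `test_html` also contains a duplicate.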
data/lib/retriever/fetchfiles.rb
ADDED
@@ -0,0 +1,70 @@
+module Retriever
+  class FetchFiles < Fetch
+    attr_reader :fileStack
+    def initialize(url,options)
+      super
+      @fileStack = []
+      all_links = self.fetchLinks(fetchPage(@target))
+      @linkStack = self.parseInternalLinks(all_links)
+      self.lg("#{@linkStack.size-1} new links found")
+
+      tempFileCollection = self.parseFiles(all_links)
+      @fileStack.concat(tempFileCollection) if tempFileCollection.size>0
+      self.lg("#{@fileStack.size} new files found")
+      errlog("Bad URL -- #{@target}") if !@linkStack
+
+      @linkStack.delete(@target) if @linkStack.include?(@target)
+      @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
+
+      self.async_crawl_and_collect()
+
+      @fileStack.sort_by! {|x| x.length}
+      @fileStack.uniq!
+
+      self.dump(self.fileStack)
+      self.write(@output,self.fileStack) if @output
+      self.autodownload()
+    end
+    def download_file(path)
+      arr = path.split('/')
+      shortname = arr.pop
+      puts "Initiating Download to: #{'/rr-downloads/' + shortname}"
+      File.open(shortname, "wb") do |saved_file|
+        # the following "open" is provided by open-uri
+        open(path) do |read_file|
+          saved_file.write(read_file.read)
+        end
+      end
+      puts " SUCCESS: Download Complete"
+    end
+    def autodownload()
+      lenny = @fileStack.count
+      puts "###################"
+      puts "### Initiating Autodownload..."
+      puts "###################"
+      puts "#{lenny} - #{@file_ext}'s Located"
+      puts "###################"
+      if File::directory?("rr-downloads")
+        Dir.chdir("rr-downloads")
+      else
+        puts "creating rr-downloads Directory"
+        Dir.mkdir("rr-downloads")
+        Dir.chdir("rr-downloads")
+      end
+      file_counter = 0
+      @fileStack.each do |entry|
+        begin
+          self.download_file(entry)
+          file_counter+=1
+          lg(" File [#{file_counter} of #{lenny}]")
+          puts
+        rescue StandardError => e
+          puts "ERROR: failed to download - #{entry}"
+          puts e.message
+          puts
+        end
+      end
+      Dir.chdir("..")
+    end
+  end
+end
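`FetchFiles` does all of its work in the constructor: it seeds the link stack, runs the crawl, prints the harvested file list, and then calls `autodownload`, which saves everything into `./rr-downloads`. A hedged sketch of the programmatic equivalent of the CLI's file-harvest mode, keyed the same way `data/bin/rr` builds its options hash (the target URL is a placeholder):

```ruby
# Sketch only: roughly what `rr -f -e pdf -l 100 -v http://example.com` sets up,
# based on the options hash built in data/bin/rr. The URL is a placeholder.
# Constructing FetchFiles crawls, prints the harvested file links, and then
# downloads them into ./rr-downloads.
require 'retriever'

Retriever::FetchFiles.new("http://example.com", {
  :fileharvest => true,   # -f / --files mode flag
  :file_ext    => "pdf",  # -e / --ext
  :maxpages    => "100",  # -l / --limit (Fetch#initialize calls .to_i on it)
  :verbose     => true    # -v / --verbose
})
```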
data/lib/retriever/fetchsitemap.rb
ADDED
@@ -0,0 +1,25 @@
+module Retriever
+  class FetchSitemap < Fetch
+    attr_reader :sitemap
+    def initialize(url,options)
+      super
+      @sitemap = [@target]
+      @linkStack = self.parseInternalLinks(self.fetchLinks(fetchPage(@target)))
+      self.lg("#{@linkStack.size-1} new links found")
+      errlog("Bad URL -- #{@target}") if !@linkStack
+
+      @linkStack.delete(@target) if @linkStack.include?(@target)
+      @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
+      @sitemap.concat(@linkStack)
+
+      self.async_crawl_and_collect()
+
+      @sitemap.sort_by! {|x| x.length} if @sitemap.size>1
+      @sitemap.uniq!
+      @sitemap = @sitemap.take(@maxPages) if (@sitemap.size+1 > @maxPages)
+
+      self.dump(self.sitemap)
+      self.write(@output,self.sitemap) if @output
+    end
+  end
+end
data/readme.md
ADDED
@@ -0,0 +1,79 @@
+RubyRetriever [](http://badge.fury.io/rb/RubyRetriever)
+==============
+
+Now an official RubyGem! --make sure to use camel-casing--
+```sh
+gem install RubyRetriever
+```
+
+Update (5/25):
+Version 0.06 - Switches to using a Bloom Filter to keep track of past 'visited pages'. I saw this in [Arachnid] (https://github.com/dchuk/Arachnid) and realized it's a much better idea for performance and implemented it immediately. Hat tip [dchuk] (https://github.com/dchuk/)
+
+About
+=====
+
+RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader, and all around nice buddy to have around.
+Soon to add some high level scraping options.
+
+RubyRetriever uses aynchronous HTTP requests, thanks to eventmachine and Synchrony fibers, to crawl webpages *very quickly*.
+
+This is the 2nd or 3rd reincarnation of the RubyRetriever autodownloader project. It started out as a executable autodownloader, intended for malware research. From there it has morphed to become a more well-rounded web-crawler and general purpose file harvesting utility.
+
+RubyRetriever does NOT respect robots.txt, and RubyRetriever currently - by default - launches up to 10 parallel GET requests at once. This is a feature, do not abuse it. Use at own risk.
+
+
+HOW IT WORKS
+-----------
+```sh
+gem install RubyRetriever
+rr [MODE] [OPTIONS] Target_URL
+```
+
+**Site Mapper**
+```sh
+rr --sitemap --progress --limit 1000 --output cnet http://www.cnet.com
+```
+OR -- SAME COMMAND
+```sh
+rr -s -p -l 1000 -o cnet http://www.cnet.com
+```
+
+This would go to http://www.cnet.com and map it until it crawled a max of 1,000 pages, and then it would write it out to a csv named cnet.
+
+**File Harvesting**
+```sh
+rr --files --ext pdf --progress --limit 1000 --output hubspot http://www.hubspot.com
+```
+OR -- SAME COMMAND
+```sh
+rr -f -e pdf -p -l 1000 -o hubspot http://www.hubspot.com
+```
+
+This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 1,000 pages, and then it would write out a list of filepaths to a csv named hubspot, and then it would go ahead and try and download each of those files to a new 'rr-downloads' folder
+
+
+command-line arguments
+-----------------------
+Usage: rr [MODE] [OPTIONS] Target_URL
+
+Where MODE FLAG is either:
+  -s, --sitemap
+  -f, --files
+
+and OPTIONS is the applicable:
+  -o, --out FILENAME *Dump output to selected filename*
+  -p, --progress *Outputs a progressbar*
+  -v, --verbose *Output more information*
+  -l, --limit PAGE_LIMIT_# *set a max on the total number of crawled pages*
+  -e, --ext FILE_EXTENSION *set a file extension to look for on crawled pages*
+  -h, --help *Display this screen*
+
+Current Requirements
+------------
+em-synchrony
+ruby-progressbar
+bloomfilter-rb
+
+License
+-------
+See included 'LICENSE' file. It's the MIT license.
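The readme's examples drive the gem through the `rr` executable; the same modes can also be reached from Ruby by constructing the classes that `data/bin/rr` dispatches to. A small sketch of the site-mapper case, mirroring the readme's `rr -s -p -l 1000 -o cnet http://www.cnet.com` example with the `-o` CSV option left out to keep the sketch minimal:

```ruby
# Sketch only: the programmatic counterpart of the readme's site-mapper command,
# using the same option keys that data/bin/rr passes through. The crawl, output
# dump, and any CSV write all happen inside the constructor.
require 'retriever'

Retriever::FetchSitemap.new("http://www.cnet.com", {
  :sitemap  => true,    # -s / --sitemap mode flag
  :progress => true,    # -p / --progressbar
  :maxpages => "1000"   # -l / --limit
})
```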
data/spec/retriever_spec.rb
ADDED
@@ -0,0 +1,65 @@
+require_relative '../lib/retriever'
+
+r = Retriever::Fetch.new("http://www.cnet.com/reviews/",{:file_ext => "exe",:maxpages => "100"})
+test_html = "<a href='www.cnet.com/download.exe'>download</a>
+http://www.google.com
+<a href='/test.html'>test</a>
+<a href='http://www.cnet.com/products/gadgets#view-comments'>gadgets comments</a>
+<a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'></a>
+<a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
+<a href='http://www.yahoo.com/test/'>yahoo</a>
+test.com
+<link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
+<a href='cpage_18'>about</a>"
+
+doc = r.fetchPage(r.target)
+links_collection = r.fetchLinks(test_html)
+filtered_links = r.parseInternalLinks(links_collection)
+file_list = r.parseFiles(links_collection)
+
+describe "Fetch" do
+
+  describe "#new" do
+    it "sets target, host, and max page vars" do
+      expect(r.target).to eq("http://www.cnet.com/reviews/")
+      expect(r.host).to eq("www.cnet.com")
+      expect(r.maxPages).to eq(100)
+    end
+  end
+
+  describe "#fetchPage" do
+    it "opens URL and returns source as String" do
+      expect(doc.class).to eq(String)
+    end
+  end
+
+  describe "#fetchLinks" do
+    it "collects all unique href links on the page" do
+      expect(links_collection).to have(6).items
+    end
+    it "returns relative urls with full path based on hostname" do
+      expect(links_collection).to include("http://www.cnet.com/test.html","http://www.cnet.com/cpage_18")
+    end
+  end
+
+  describe "#parseInternalLinks" do
+    it "filters links by host" do
+      filtered_links.each do |link|
+        expect(link).to include(r.host)
+      end
+    end
+    it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
+      filtered_links.each do |link|
+        expect(link).to_not (include(".css",".js",".png",".gif",".jpg"))
+      end
+    end
+  end
+  describe "#parseFiles" do
+    it "filters links by filetype" do
+      file_list.each do |link|
+        expect(link).to include(".exe")
+      end
+    end
+  end
+
+end
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,17 @@
+# This file was generated by the `rspec --init` command. Conventionally, all
+# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
+# Require this file using `require "spec_helper"` to ensure that it is only
+# loaded once.
+#
+# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+RSpec.configure do |config|
+  config.treat_symbols_as_metadata_keys_with_true_values = true
+  config.run_all_when_everything_filtered = true
+  config.filter_run :focus
+
+  # Run specs in random order to surface order dependencies. If you find an
+  # order dependency and want to debug it, you can fix the order by providing
+  # the seed, which is printed after each run.
+  # --seed 1234
+  config.order = 'random'
+end
metadata
ADDED
@@ -0,0 +1,153 @@
+--- !ruby/object:Gem::Specification
+name: rubyretriever
+version: !ruby/object:Gem::Version
+  version: 0.0.8
+platform: ruby
+authors:
+- Joe Norton
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2014-05-25 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: em-synchrony
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: em-http-request
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: ruby-progressbar
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: bloomfilter-rb
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.6'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.6'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '10.3'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '10.3'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '2.14'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '2.14'
+description: General purpose web crawler, site mapper, and file harvester
+email:
+- joe@softwarebyjoe.com
+executables:
+- rr
+extensions: []
+extra_rdoc_files: []
+files:
+- LICENSE
+- bin/rr
+- lib/retriever.rb
+- lib/retriever/fetch.rb
+- lib/retriever/fetchfiles.rb
+- lib/retriever/fetchsitemap.rb
+- lib/retriever/version.rb
+- readme.md
+- spec/retriever_spec.rb
+- spec/spec_helper.rb
+homepage: http://github.com/joenorton/rubyretriever
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: 1.8.6
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: 1.3.6
+requirements: []
+rubyforge_project: rubyretriever
+rubygems_version: 2.2.2
+signing_key:
+specification_version: 4
+summary: Ruby Web Crawler & File Harvester
+test_files: []