rubyretriever 0.0.8
- checksums.yaml +7 -0
- data/LICENSE +20 -0
- data/bin/rr +80 -0
- data/lib/retriever.rb +19 -0
- data/lib/retriever/fetch.rb +196 -0
- data/lib/retriever/fetchfiles.rb +70 -0
- data/lib/retriever/fetchsitemap.rb +25 -0
- data/lib/retriever/version.rb +3 -0
- data/readme.md +79 -0
- data/spec/retriever_spec.rb +65 -0
- data/spec/spec_helper.rb +17 -0
- metadata +153 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: e4e4773d62ec74a55bb3d9afc78622229e7db559
+  data.tar.gz: a2186b11bb3eabdec8c312e0b3365961c97805de
+SHA512:
+  metadata.gz: ae392a910a3c7a6f2b3f9097d82978ba35ba413da46f4e5ce30b9e728bac2fb40f61c3a110ea59b9f59d75d79b0fcf85a8a783597aeaccd31c833b2c3753bd4d
+  data.tar.gz: 72fe87613059ccae6022c65ad5db70445cc4470028d081e918ffb9dfc122adb5dd8c9967abe2622a8daaeea59d878340183597a73768f32dd42679b7b159d64d
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
+2014 (c) Joseph Michael Norton - 'Joe Norton' - SoftwareByJoe.com
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/bin/rr
ADDED
@@ -0,0 +1,80 @@
+#! /usr/bin/env ruby
+require_relative('../lib/retriever.rb')
+options = {}
+optparse = OptionParser.new do|opts|
+  # Set a banner, displayed at the top
+  # of the help screen.
+  opts.banner = "Usage: rr [options] Target_URL"
+
+  options[:filename] = nil
+  opts.on( '-o', '--out FILENAME', 'Dump output to selected filename' ) do|filename|
+    options[:filename] = filename
+  end
+  # Define the options, and what they do
+  options[:verbose] = false
+  opts.on( '-v', '--verbose', 'Output more information' ) do
+    options[:verbose] = true
+  end
+
+  options[:progress] = false
+  opts.on( '-p', '--progressbar', 'Output a progressbar' ) do
+    options[:progress] = true
+  end
+
+  options[:sitemap] = false
+  opts.on( '-s', '--sitemap', 'Crawl site and output sitemap' ) do
+    options[:sitemap] = true
+  end
+
+  options[:fileharvest] = false
+  opts.on( '-f', '--files', 'Crawl site and collect links for files found' ) do
+    options[:fileharvest] = true
+  end
+
+  options[:maxpages] = false
+  opts.on( '-l', '--limit PAGE_LIMIT_#', 'set a max on the total number of crawled pages' ) do |maxpages|
+    options[:maxpages] = maxpages
+  end
+
+  options[:file_ext] = false
+  opts.on( '-e', '--ext FILE_EXTENSION', 'set a file extension to look for on crawled pages' ) do |file_ext|
+    options[:file_ext] = file_ext
+  end
+
+  options[:autodown] = false
+  opts.on( '-a', '--autodown', 'Automatically download all files of filetype located' ) do
+    options[:autodown] = true
+  end
+
+  # This displays the help screen, all programs are
+  # assumed to have this option.
+  opts.on( '-h', '--help', 'Display this screen' ) do
+    puts opts
+    exit
+  end
+end
+
+optparse.parse!
+if ARGV[0].nil?
+  abort("###Missing Required Argument\nUsage: rr [mode] [options] Target_URL")
+end
+
+ARGV.each do|q|
+  if options[:verbose]
+    puts "###############################"
+    puts "### [RubyRetriever]"
+    puts "### Creating Sitemap" if options[:sitemap]
+    puts "### Performing File Harvest" if options[:fileharvest]
+    puts "### Searching for file extension: #{options[:file_ext]} pages" if (options[:file_ext])
+    puts "### Writing output to filename: #{options[:filename]}" if options[:filename]
+    puts "### Being verbose"
+    puts "### Stopping after #{options[:maxpages]} pages" if options[:maxpages]
+  end
+  puts "###############################"
+  puts "### [RubyRetriever] go fetch #{q}"
+  test = Retriever::FetchFiles.new(q, options) if options[:fileharvest]
+  test = Retriever::FetchSitemap.new(q, options) if options[:sitemap]
+  puts "### [RubyRetriever] is done."
+  puts "###############################"
+  puts
+end
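For orientation, this is roughly the options hash the OptionParser block above would hand to the Retriever classes for a file-harvest run such as `rr -f -e pdf -p -l 100 -o hubspot http://www.hubspot.com`. A sketch: the keys and defaults mirror bin/rr, and the page limit stays a string because Fetch#initialize calls to_i on it.

```ruby
# Illustrative only: the hash bin/rr builds and passes to Retriever::FetchFiles.
options = {
  :filename    => "hubspot", # -o / --out
  :verbose     => false,
  :progress    => true,      # -p / --progressbar
  :sitemap     => false,
  :fileharvest => true,      # -f / --files
  :maxpages    => "100",     # -l / --limit, converted with to_i inside Fetch#initialize
  :file_ext    => "pdf",     # -e / --ext
  :autodown    => false
}
```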
data/lib/retriever.rb
ADDED
@@ -0,0 +1,19 @@
+##################################################################
+#####RubyRetriever -- web crawler and file harvester
+#####created by Joe Norton
+#####http://softwarebyjoe.com
+##LICENSING: GNU GPLv3 License##################################
+#! usr/bin/ruby
+require 'em-synchrony'
+require 'em-synchrony/em-http'
+require 'em-synchrony/fiber_iterator'
+require 'ruby-progressbar'
+require 'open-uri'
+require 'optparse'
+require 'uri'
+require 'csv'
+require 'bloomfilter-rb'
+
+require 'retriever/fetch'
+require 'retriever/fetchfiles'
+require 'retriever/fetchsitemap'
data/lib/retriever/fetch.rb
ADDED
@@ -0,0 +1,196 @@
+module Retriever
+  class Fetch
+    attr_reader :target, :host, :host_re, :maxPages
+    #constants
+    HTTP_RE = Regexp.new(/^http/i).freeze
+    HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
+    NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico)/ix).freeze
+    SINGLE_SLASH_RE = Regexp.new(/^\/{1}[^\/]/).freeze
+    DOUBLE_SLASH_RE = Regexp.new(/^\/{2}[^\/]/).freeze
+    NO_SLASH_PAGE_RE = Regexp.new(/^[a-z0-9\-\_\=\?\.]+\z/ix).freeze
+    DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+
+    def initialize(url,options)
+      new_uri = URI(url)
+      @target = new_uri.to_s
+      @host = new_uri.host
+      #OPTIONS
+      @prgrss = options[:progress] ? options[:progress] : false
+      @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
+      @v = options[:verbose] ? true : false
+      @output = options[:filename] ? options[:filename] : false
+      @fh = options[:fileharvest] ? true : false
+      @s = options[:sitemap] ? true : false
+      @file_ext = options[:file_ext] ? options[:file_ext] : false
+      @autodown = options[:autodown] ? true : false
+      #
+      @host_re = Regexp.new(host).freeze
+      if @fh
+        errlog("Please provide a FILETYPE. It is required for file harvest mode.") if !@file_ext
+        tempExtStr = "."+@file_ext+'\z'
+        @file_re = Regexp.new(tempExtStr).freeze
+      else
+        errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
+      end
+      if @prgrss
+        errlog("CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p") if @v #verbose & progressbar conflict
+        prgressVars = {
+          :title => "Pages Crawled",
+          :starting_at => 1,
+          :total => @maxPages,
+          :format => '%a |%b>%i| %c/%C %t',
+        }
+        @progressbar = ProgressBar.create(prgressVars)
+      end
+      @already_crawled = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
+      @already_crawled.insert(@target)
+    end
+    def errlog(msg)
+      raise "ERROR: #{msg}"
+    end
+    def lg(msg)
+      puts "### #{msg}" if @v
+    end
+    def dump(data)
+      puts "###############################"
+      if @s
+        puts "#{@target} Sitemap"
+        puts "Page Count: #{data.size}"
+      elsif @fh
+        puts "Target URL: #{@target}"
+        puts "Filetype: #{@file_ext}"
+        puts "File Count: #{data.size}"
+      else
+        puts "ERROR"
+      end
+      puts "###############################"
+      puts data
+      puts "###############################"
+      puts
+    end
+    def write(data)
+      if @output
+        CSV.open("#{@output}.csv", "w") do |csv|
+          data.each do |entry|
+            csv << [entry]
+          end
+        end
+        puts "###############################"
+        puts "File Created: #{@output}.csv"
+        puts "Object Count: #{data.size}"
+        puts "###############################"
+        puts
+      end
+    end
+    def fetchPage(url)
+      resp = false
+      EM.synchrony do
+        begin
+          resp = EventMachine::HttpRequest.new(url).get
+        rescue StandardError => e
+          #puts e.message + " ## " + url
+          #the trap on ABRT is necessary to handle the SSL error
+          #for some ungodly reason it's the only way I found to handle it
+          trap("ABRT"){
+            puts "#{url} failed SSL Certification Verification"
+          }
+          return false
+        end
+        lg("URL Crawled: #{url}")
+        EventMachine.stop
+      end
+      if resp.response == ""
+        errlog("Domain is not working. Try the non-WWW version.")
+      end
+      return resp.response.encode('UTF-8', :invalid => :replace, :undef => :replace) #.force_encoding('UTF-8') #ran into issues with some sites without forcing UTF-8 encoding, and also issues with it. Not sure atm.
+    end
+    #receives page source as string
+    #returns array of unique href links
+    def fetchLinks(doc)
+      return false if !doc
+      linkArray = []
+      doc.scan(HREF_CONTENTS_RE) do |arr| #filter some malformed URLs that come in; this is meant to be a loose filter to catch all reasonable HREF attributes.
+        link = arr[0]
+        if (!(HTTP_RE =~ link))
+          if (DUB_DUB_DUB_DOT_RE =~ link)
+            link = "http://#{link}"
+          elsif SINGLE_SLASH_RE =~ link #link uses relative path
+            link = "http://#{@host}"+link #appending hostname to relative paths
+          elsif DOUBLE_SLASH_RE =~ link #link begins with '//' (protocol-relative, maybe a messed up link?)
+            link = "http:#{link}" #prepending current scheme to protocol-relative paths
+          elsif (NO_SLASH_PAGE_RE =~ link) #link uses relative path with no slashes at all; people actually do this - imagine that.
+            link = "http://#{@host}"+"/"+link #appending hostname and slash to create full paths
+          else
+            next
+          end
+        end
+        linkArray.push(link)
+      end
+      linkArray.uniq
+    end
+    def parseInternalLinks(all_links)
+      if all_links
+        all_links.select{ |linky| (@host_re =~ linky && (!(NONPAGE_EXT_RE =~ linky)))}
+      else
+        return false
+      end
+    end
+    def async_crawl_and_collect()
+      while (@already_crawled.size < @maxPages)
+        if @linkStack.empty?
+          if @prgrss
+            @progressbar.log("Can't find any more links. Site might be completely mapped.")
+          else
+            lg("Can't find any more links. Site might be completely mapped.")
+          end
+          break;
+        end
+        #puts "New loop"
+        #puts @linkStack
+        new_links_arr = self.asyncGetWave()
+        next if (new_links_arr.nil? || new_links_arr.empty?)
+        new_links_arr = new_links_arr - @linkStack #set difference: drop links already queued in previous waves
+        @linkStack.concat(new_links_arr)
+        @sitemap.concat(new_links_arr) if @s
+      end
+    end
+    def asyncGetWave() #send a new wave of GET requests, using current @linkStack
+      new_stuff = []
+      EM.synchrony do
+        lenny = 0
+        concurrency = 10
+        EM::Synchrony::FiberIterator.new(@linkStack, concurrency).each do |url|
+          next if (@already_crawled.size >= @maxPages)
+          if @already_crawled.include?(url)
+            @linkStack.delete(url)
+            next
+          else
+            @already_crawled.insert(url)
+          end
+          resp = EventMachine::HttpRequest.new(url).get
+          lg("URL Crawled: #{url}")
+          if @prgrss
+            @progressbar.increment if @already_crawled.size < @maxPages
+          end
+          new_links_arr = self.fetchLinks(resp.response)
+          if new_links_arr
+            lg("#{new_links_arr.size} new links found")
+            internal_links_arr = self.parseInternalLinks(new_links_arr)
+            new_stuff.push(internal_links_arr)
+            if @fh
+              filez = self.parseFiles(new_links_arr)
+              @fileStack.concat(filez) if !filez.empty?
+              lg("#{filez.size} files found")
+            end
+          end
+        end
+        new_stuff = new_stuff.flatten # all completed requests
+        EventMachine.stop
+      end
+      new_stuff.uniq
+    end
+    def parseFiles(all_links)
+      all_links.select{ |linky| (@file_re =~ linky)}
+    end
+  end
+end
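As a minimal sketch of the link helpers above, patterned on the bundled spec (the host and HTML snippet are illustrative; no crawling happens until async_crawl_and_collect is called):

```ruby
require 'retriever'

fetcher = Retriever::Fetch.new("http://www.example.com/", {})

html = "<a href='/about'>about</a> <a href='/about'>about again</a> " \
       "<a href='http://www.example.com/logo.png'>logo</a> "

links    = fetcher.fetchLinks(html)          # relative hrefs are expanded against the host
internal = fetcher.parseInternalLinks(links) # keeps on-host links, drops assets like .png
puts internal                                # => http://www.example.com/about
```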
data/lib/retriever/fetchfiles.rb
ADDED
@@ -0,0 +1,70 @@
+module Retriever
+  class FetchFiles < Fetch
+    attr_reader :fileStack
+    def initialize(url,options)
+      super
+      @fileStack = []
+      all_links = self.fetchLinks(fetchPage(@target))
+      @linkStack = self.parseInternalLinks(all_links)
+      self.lg("#{@linkStack.size-1} new links found")
+
+      tempFileCollection = self.parseFiles(all_links)
+      @fileStack.concat(tempFileCollection) if tempFileCollection.size>0
+      self.lg("#{@fileStack.size} new files found")
+      errlog("Bad URL -- #{@target}") if !@linkStack
+
+      @linkStack.delete(@target) if @linkStack.include?(@target)
+      @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
+
+      self.async_crawl_and_collect()
+
+      @fileStack.sort_by! {|x| x.length}
+      @fileStack.uniq!
+
+      self.dump(self.fileStack)
+      self.write(self.fileStack) if @output
+      self.autodownload()
+    end
+    def download_file(path)
+      arr = path.split('/')
+      shortname = arr.pop
+      puts "Initiating Download to: #{'/rr-downloads/' + shortname}"
+      File.open(shortname, "wb") do |saved_file|
+        # the following "open" is provided by open-uri
+        open(path) do |read_file|
+          saved_file.write(read_file.read)
+        end
+      end
+      puts " SUCCESS: Download Complete"
+    end
+    def autodownload()
+      lenny = @fileStack.count
+      puts "###################"
+      puts "### Initiating Autodownload..."
+      puts "###################"
+      puts "#{lenny} - #{@file_ext}'s Located"
+      puts "###################"
+      if File::directory?("rr-downloads")
+        Dir.chdir("rr-downloads")
+      else
+        puts "creating rr-downloads Directory"
+        Dir.mkdir("rr-downloads")
+        Dir.chdir("rr-downloads")
+      end
+      file_counter = 0
+      @fileStack.each do |entry|
+        begin
+          self.download_file(entry)
+          file_counter+=1
+          lg(" File [#{file_counter} of #{lenny}]")
+          puts
+        rescue StandardError => e
+          puts "ERROR: failed to download - #{entry}"
+          puts e.message
+          puts
+        end
+      end
+      Dir.chdir("..")
+    end
+  end
+end
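FetchFiles does all of its work inside #initialize -- crawl, dump, CSV write, then autodownload into ./rr-downloads -- so programmatic use is just construction, the same way bin/rr invokes it. A sketch with an illustrative target and options:

```ruby
require 'retriever'

# Harvest links to .pdf files across up to 50 crawled pages, print the list,
# write example.csv, and download the found files into ./rr-downloads.
Retriever::FetchFiles.new("http://www.example.com",
  {:fileharvest => true, :file_ext => "pdf", :maxpages => "50", :filename => "example"})
```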
data/lib/retriever/fetchsitemap.rb
ADDED
@@ -0,0 +1,25 @@
+module Retriever
+  class FetchSitemap < Fetch
+    attr_reader :sitemap
+    def initialize(url,options)
+      super
+      @sitemap = [@target]
+      @linkStack = self.parseInternalLinks(self.fetchLinks(fetchPage(@target)))
+      self.lg("#{@linkStack.size-1} new links found")
+      errlog("Bad URL -- #{@target}") if !@linkStack
+
+      @linkStack.delete(@target) if @linkStack.include?(@target)
+      @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
+      @sitemap.concat(@linkStack)
+
+      self.async_crawl_and_collect()
+
+      @sitemap.sort_by! {|x| x.length} if @sitemap.size>1
+      @sitemap.uniq!
+      @sitemap = @sitemap.take(@maxPages) if (@sitemap.size+1 > @maxPages)
+
+      self.dump(self.sitemap)
+      self.write(self.sitemap) if @output
+    end
+  end
+end
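FetchSitemap follows the same pattern: construction runs the crawl and emits the results. A sketch, again with illustrative values:

```ruby
require 'retriever'

# Map up to 100 internal pages starting from the target, print the page list,
# and write example-sitemap.csv.
Retriever::FetchSitemap.new("http://www.example.com",
  {:sitemap => true, :maxpages => "100", :filename => "example-sitemap"})
```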
data/readme.md
ADDED
@@ -0,0 +1,79 @@
+RubyRetriever [![Gem Version](https://badge.fury.io/rb/RubyRetriever.svg)](http://badge.fury.io/rb/RubyRetriever)
+==============
+
+Now an official RubyGem! --make sure to use camel-casing--
+```sh
+gem install RubyRetriever
+```
+
+Update (5/25):
+Version 0.06 - Switches to using a Bloom Filter to keep track of past 'visited pages'. I saw this in [Arachnid](https://github.com/dchuk/Arachnid) and realized it's a much better idea for performance, so I implemented it immediately. Hat tip [dchuk](https://github.com/dchuk/)
+
+About
+=====
+
+RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader, and all around nice buddy to have around.
+Soon to add some high-level scraping options.
+
+RubyRetriever uses asynchronous HTTP requests, thanks to eventmachine and Synchrony fibers, to crawl webpages *very quickly*.
+
+This is the 2nd or 3rd reincarnation of the RubyRetriever autodownloader project. It started out as an executable autodownloader, intended for malware research. From there it has morphed into a more well-rounded web crawler and general-purpose file-harvesting utility.
+
+RubyRetriever does NOT respect robots.txt, and RubyRetriever currently - by default - launches up to 10 parallel GET requests at once. This is a feature; do not abuse it. Use at your own risk.
+
+
+HOW IT WORKS
+-----------
+```sh
+gem install RubyRetriever
+rr [MODE] [OPTIONS] Target_URL
+```
+
+**Site Mapper**
+```sh
+rr --sitemap --progress --limit 1000 --output cnet http://www.cnet.com
+```
+OR -- SAME COMMAND
+```sh
+rr -s -p -l 1000 -o cnet http://www.cnet.com
+```
+
+This would go to http://www.cnet.com and map it until it crawled a max of 1,000 pages, and then write the results out to a CSV named cnet.
+
+**File Harvesting**
+```sh
+rr --files --ext pdf --progress --limit 1000 --output hubspot http://www.hubspot.com
+```
+OR -- SAME COMMAND
+```sh
+rr -f -e pdf -p -l 1000 -o hubspot http://www.hubspot.com
+```
+
+This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 1,000 pages, then write out the list of filepaths to a CSV named hubspot, and finally try to download each of those files to a new 'rr-downloads' folder.
+
+
+command-line arguments
+-----------------------
+Usage: rr [MODE] [OPTIONS] Target_URL
+
+Where MODE FLAG is either:
+-s, --sitemap
+-f, --files
+
+and OPTIONS is the applicable:
+-o, --out FILENAME *Dump output to selected filename*
+-p, --progress *Outputs a progressbar*
+-v, --verbose *Output more information*
+-l, --limit PAGE_LIMIT_# *set a max on the total number of crawled pages*
+-e, --ext FILE_EXTENSION *set a file extension to look for on crawled pages*
+-h, --help *Display this screen*
+
+Current Requirements
+------------
+em-synchrony
+ruby-progressbar
+bloomfilter-rb
+
+License
+-------
+See included 'LICENSE' file. It's the MIT license.
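The readme's two key claims -- Synchrony-fiber concurrency and a Bloom filter for the visited set -- boil down to the pattern below, shown standalone rather than through the gem's classes. The URLs and concurrency level are illustrative; the calls mirror the ones fetch.rb uses.

```ruby
require 'em-synchrony'
require 'em-synchrony/em-http'
require 'em-synchrony/fiber_iterator'
require 'bloomfilter-rb'

urls    = ["http://www.example.com/", "http://www.example.com/about"]
visited = BloomFilter::Native.new(:size => 1_000_000, :hashes => 5, :raise => false)

EM.synchrony do
  # Up to 10 GET requests in flight at once, each running on its own fiber.
  EM::Synchrony::FiberIterator.new(urls, 10).each do |url|
    next if visited.include?(url)
    visited.insert(url)
    resp = EventMachine::HttpRequest.new(url).get
    puts "#{url} -> #{resp.response.bytesize} bytes"
  end
  EventMachine.stop
end
```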
data/spec/retriever_spec.rb
ADDED
@@ -0,0 +1,65 @@
+require_relative '../lib/retriever'
+
+r = Retriever::Fetch.new("http://www.cnet.com/reviews/",{:file_ext => "exe",:maxpages => "100"})
+test_html = "<a href='www.cnet.com/download.exe'>download</a>
+http://www.google.com
+<a href='/test.html'>test</a>
+<a href='http://www.cnet.com/products/gadgets#view-comments'>gadgets comments</a>
+<a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'></a>
+<a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
+<a href='http://www.yahoo.com/test/'>yahoo</a>
+test.com
+<link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
+<a href='cpage_18'>about</a>"
+
+doc = r.fetchPage(r.target)
+links_collection = r.fetchLinks(test_html)
+filtered_links = r.parseInternalLinks(links_collection)
+file_list = r.parseFiles(links_collection)
+
+describe "Fetch" do
+
+  describe "#new" do
+    it "sets target, host, and max page vars" do
+      expect(r.target).to eq("http://www.cnet.com/reviews/")
+      expect(r.host).to eq("www.cnet.com")
+      expect(r.maxPages).to eq(100)
+    end
+  end
+
+  describe "#fetchPage" do
+    it "opens URL and returns source as String" do
+      expect(doc.class).to eq(String)
+    end
+  end
+
+  describe "#fetchLinks" do
+    it "collects all unique href links on the page" do
+      expect(links_collection).to have(6).items
+    end
+    it "returns relative urls with full path based on hostname" do
+      expect(links_collection).to include("http://www.cnet.com/test.html","http://www.cnet.com/cpage_18")
+    end
+  end
+
+  describe "#parseInternalLinks" do
+    it "filters links by host" do
+      filtered_links.each do |link|
+        expect(link).to include(r.host)
+      end
+    end
+    it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
+      filtered_links.each do |link|
+        expect(link).to_not (include(".css",".js",".png",".gif",".jpg"))
+      end
+    end
+  end
+  describe "#parseFiles" do
+    it "filters links by filetype" do
+      file_list.each do |link|
+        expect(link).to include(".exe")
+      end
+    end
+  end
+
+end
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,17 @@
+# This file was generated by the `rspec --init` command. Conventionally, all
+# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
+# Require this file using `require "spec_helper"` to ensure that it is only
+# loaded once.
+#
+# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+RSpec.configure do |config|
+  config.treat_symbols_as_metadata_keys_with_true_values = true
+  config.run_all_when_everything_filtered = true
+  config.filter_run :focus
+
+  # Run specs in random order to surface order dependencies. If you find an
+  # order dependency and want to debug it, you can fix the order by providing
+  # the seed, which is printed after each run.
+  # --seed 1234
+  config.order = 'random'
+end
metadata
ADDED
@@ -0,0 +1,153 @@
+--- !ruby/object:Gem::Specification
+name: rubyretriever
+version: !ruby/object:Gem::Version
+  version: 0.0.8
+platform: ruby
+authors:
+- Joe Norton
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2014-05-25 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: em-synchrony
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: em-http-request
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: ruby-progressbar
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: bloomfilter-rb
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.6'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.6'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '10.3'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '10.3'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '2.14'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '2.14'
+description: General purpose web crawler, site mapper, and file harvester
+email:
+- joe@softwarebyjoe.com
+executables:
+- rr
+extensions: []
+extra_rdoc_files: []
+files:
+- LICENSE
+- bin/rr
+- lib/retriever.rb
+- lib/retriever/fetch.rb
+- lib/retriever/fetchfiles.rb
+- lib/retriever/fetchsitemap.rb
+- lib/retriever/version.rb
+- readme.md
+- spec/retriever_spec.rb
+- spec/spec_helper.rb
+homepage: http://github.com/joenorton/rubyretriever
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: 1.8.6
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: 1.3.6
+requirements: []
+rubyforge_project: rubyretriever
+rubygems_version: 2.2.2
+signing_key:
+specification_version: 4
+summary: Ruby Web Crawler & File Harvester
+test_files: []