rubyretriever 0.0.8

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+ metadata.gz: e4e4773d62ec74a55bb3d9afc78622229e7db559
+ data.tar.gz: a2186b11bb3eabdec8c312e0b3365961c97805de
+ SHA512:
+ metadata.gz: ae392a910a3c7a6f2b3f9097d82978ba35ba413da46f4e5ce30b9e728bac2fb40f61c3a110ea59b9f59d75d79b0fcf85a8a783597aeaccd31c833b2c3753bd4d
+ data.tar.gz: 72fe87613059ccae6022c65ad5db70445cc4470028d081e918ffb9dfc122adb5dd8c9967abe2622a8daaeea59d878340183597a73768f32dd42679b7b159d64d
data/LICENSE ADDED
@@ -0,0 +1,20 @@
+ 2014 (c) Joseph Michael Norton - 'Joe Norton' - SoftwareByJoe.com
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/bin/rr ADDED
@@ -0,0 +1,80 @@
+ #! /usr/bin/env ruby
+ require_relative('../lib/retriever.rb')
+ options = {}
+ optparse = OptionParser.new do |opts|
+   # Set a banner, displayed at the top
+   # of the help screen.
+   opts.banner = "Usage: rr [options] Target_URL"
+
+   options[:filename] = nil
+   opts.on( '-o', '--out FILENAME', 'Dump output to selected filename' ) do |filename|
+     options[:filename] = filename
+   end
+   # Define the options, and what they do
+   options[:verbose] = false
+   opts.on( '-v', '--verbose', 'Output more information' ) do
+     options[:verbose] = true
+   end
+
+   options[:progress] = false
+   opts.on( '-p', '--progressbar', 'Outputs a progressbar' ) do
+     options[:progress] = true
+   end
+
+   options[:sitemap] = false
+   opts.on( '-s', '--sitemap', 'Crawl site and output sitemap' ) do
+     options[:sitemap] = true
+   end
+
+   options[:fileharvest] = false
+   opts.on( '-f', '--files', 'Crawl site and collect links for files found' ) do
+     options[:fileharvest] = true
+   end
+
+   options[:maxpages] = false
+   opts.on( '-l', '--limit PAGE_LIMIT_#', 'set a max on the total number of crawled pages' ) do |maxpages|
+     options[:maxpages] = maxpages
+   end
+
+   options[:file_ext] = false
+   opts.on( '-e', '--ext FILE_EXTENSION', 'set a file extension to look for on crawled pages' ) do |file_ext|
+     options[:file_ext] = file_ext
+   end
+
+   options[:autodown] = false
+   opts.on( '-a', '--autodown', 'Automatically download all files of filetype located' ) do
+     options[:autodown] = true
+   end
+
+   # This displays the help screen, all programs are
+   # assumed to have this option.
+   opts.on( '-h', '--help', 'Display this screen' ) do
+     puts opts
+     exit
+   end
+ end
+
+ optparse.parse!
+ if ARGV[0].nil?
+   abort("###Missing Required Argument\nUsage: rr [mode] [options] Target_URL")
+ end
+
+ ARGV.each do |q|
+   if options[:verbose]
+     puts "###############################"
+     puts "### [RubyRetriever]"
+     puts "### Creating Sitemap" if options[:sitemap]
+     puts "### Performing File Harvest" if options[:fileharvest]
+     puts "### Searching for file extension: #{options[:file_ext]} pages" if (options[:file_ext])
+     puts "### Writing output to filename: #{options[:filename]}" if options[:filename]
+     puts "### Being verbose"
+     puts "### Stopping after #{options[:maxpages]} pages" if options[:maxpages]
+   end
+   puts "###############################"
+   puts "### [RubyRetriever] go fetch #{q}"
+   test = Retriever::FetchFiles.new(q, options) if options[:fileharvest]
+   test = Retriever::FetchSitemap.new(q, options) if options[:sitemap]
+   puts "### [RubyRetriever] is done."
+   puts "###############################"
+   puts
+ end
data/lib/retriever.rb ADDED
@@ -0,0 +1,19 @@
+ ##################################################################
+ #####RubyRetriever -- web crawler and file harvester
+ #####created by Joe Norton
+ #####http://softwarebyjoe.com
+ ##LICENSING: GNU GPLv3 License##################################
+ #! usr/bin/ruby
+ require 'em-synchrony'
+ require 'em-synchrony/em-http'
+ require 'em-synchrony/fiber_iterator'
+ require 'ruby-progressbar'
+ require 'open-uri'
+ require 'optparse'
+ require 'uri'
+ require 'csv'
+ require 'bloomfilter-rb'
+
+ require 'retriever/fetch'
+ require 'retriever/fetchfiles'
+ require 'retriever/fetchsitemap'
data/lib/retriever/fetch.rb ADDED
@@ -0,0 +1,196 @@
+ module Retriever
+   class Fetch
+     attr_reader :target, :host, :host_re, :maxPages
+     # constants
+     HTTP_RE = Regexp.new(/^http/i).freeze
+     HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
+     NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico)/ix).freeze
+     SINGLE_SLASH_RE = Regexp.new(/^\/{1}[^\/]/).freeze
+     DOUBLE_SLASH_RE = Regexp.new(/^\/{2}[^\/]/).freeze
+     NO_SLASH_PAGE_RE = Regexp.new(/^[a-z0-9\-\_\=\?\.]+\z/ix).freeze
+     DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+
+     def initialize(url,options)
+       new_uri = URI(url)
+       @target = new_uri.to_s
+       @host = new_uri.host
+       # OPTIONS
+       @prgrss = options[:progress] ? options[:progress] : false
+       @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
+       @v = options[:verbose] ? true : false
+       @output = options[:filename] ? options[:filename] : false
+       @fh = options[:fileharvest] ? true : false
+       @s = options[:sitemap] ? true : false
+       @file_ext = options[:file_ext] ? options[:file_ext] : false
+       @autodown = options[:autodown] ? true : false
+       #
+       @host_re = Regexp.new(host).freeze
+       if @fh
+         errlog("Please provide a FILETYPE. It is required for file harvest mode.") if !@file_ext
+         tempExtStr = "."+@file_ext+'\z'
+         @file_re = Regexp.new(tempExtStr).freeze
+       else
+         errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
+       end
+       if @prgrss
+         errlog("CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p") if @v #verbose & progressbar conflict
+         prgressVars = {
+           :title => "Pages Crawled",
+           :starting_at => 1,
+           :total => @maxPages,
+           :format => '%a |%b>%i| %c/%C %t',
+         }
+         @progressbar = ProgressBar.create(prgressVars)
+       end
+       @already_crawled = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
+       @already_crawled.insert(@target)
+     end
+     def errlog(msg)
+       raise "ERROR: #{msg}"
+     end
+     def lg(msg)
+       puts "### #{msg}" if @v
+     end
+     def dump(data)
+       puts "###############################"
+       if @s
+         puts "#{@target} Sitemap"
+         puts "Page Count: #{data.size}"
+       elsif @fh
+         puts "Target URL: #{@target}"
+         puts "Filetype: #{@file_ext}"
+         puts "File Count: #{data.size}"
+       else
+         puts "ERROR"
+       end
+       puts "###############################"
+       puts data
+       puts "###############################"
+       puts
+     end
+     def write(data)
+       if @output
+         CSV.open("#{@output}.csv", "w") do |csv|
+           data.each do |entry|
+             csv << [entry]
+           end
+         end
+         puts "###############################"
+         puts "File Created: #{@output}.csv"
+         puts "Object Count: #{data.size}"
+         puts "###############################"
+         puts
+       end
+     end
+     def fetchPage(url)
+       resp = false
+       EM.synchrony do
+         begin
+           resp = EventMachine::HttpRequest.new(url).get
+         rescue StandardError => e
+           #puts e.message + " ## " + url
+           #the trap ABRT is necessary to handle the SSL error
+           #for some ungodly reason it's the only way I found to handle it
+           trap("ABRT"){
+             puts "#{url} failed SSL Certification Verification"
+           }
+           return false
+         end
+         lg("URL Crawled: #{url}")
+         EventMachine.stop
+       end
+       if resp.response == ""
+         errlog("Domain is not working. Try the non-WWW version.")
+       end
+       return resp.response.encode('UTF-8', :invalid => :replace, :undef => :replace) #.force_encoding('UTF-8') #ran into issues with some sites without forcing UTF-8 encoding, and also issues with forcing it. Not sure atm.
+     end
+     # receives page source as string
+     # returns array of unique href links
+     def fetchLinks(doc)
+       return false if !doc
+       linkArray = []
+       doc.scan(HREF_CONTENTS_RE) do |arr| #filter some malformed URLS that come in, this is meant to be a loose filter to catch all reasonable HREF attributes.
+         link = arr[0]
+         if (!(HTTP_RE =~ link))
+           if (DUB_DUB_DUB_DOT_RE =~ link)
+             link = "http://#{link}"
+           elsif SINGLE_SLASH_RE =~ link #link uses relative path
+             link = "http://#{@host}"+link #appending hostname to relative paths
+           elsif DOUBLE_SLASH_RE =~ link #link begins with '//' (maybe a messed up link?)
+             link = "http:#{link}" #prepending protocol to protocol-relative paths
+           elsif (NO_SLASH_PAGE_RE =~ link) #link uses relative path with no slashes at all, people actually do this - imagine that.
+             link = "http://#{@host}"+"/"+link #appending hostname and slash to create full paths
+           else
+             next
+           end
+         end
+         linkArray.push(link)
+       end
+       linkArray.uniq # return the deduped list (uniq! would return nil when nothing was removed)
+     end
+     def parseInternalLinks(all_links)
+       if all_links
+         all_links.select{ |linky| (@host_re =~ linky && (!(NONPAGE_EXT_RE =~ linky)))}
+       else
+         return false
+       end
+     end
+     def async_crawl_and_collect()
+       while (@already_crawled.size < @maxPages)
+         if @linkStack.empty?
+           if @prgrss
+             @progressbar.log("Can't find any more links. Site might be completely mapped.")
+           else
+             lg("Can't find any more links. Site might be completely mapped.")
+           end
+           break
+         end
+         #puts "New loop"
+         #puts @linkStack
+         new_links_arr = self.asyncGetWave()
+         next if (new_links_arr.nil? || new_links_arr.empty?)
+         new_links_arr = new_links_arr - @linkStack #set operation: drop links we already have queued
+         @linkStack.concat(new_links_arr)
+         @sitemap.concat(new_links_arr) if @s
+       end
+     end
+     def asyncGetWave() #send a new wave of GET requests, using current @linkStack
+       new_stuff = []
+       EM.synchrony do
+         lenny = 0
+         concurrency = 10
+         EM::Synchrony::FiberIterator.new(@linkStack, concurrency).each do |url|
+           next if (@already_crawled.size >= @maxPages)
+           if @already_crawled.include?(url)
+             @linkStack.delete(url)
+             next
+           else
+             @already_crawled.insert(url)
+           end
+           resp = EventMachine::HttpRequest.new(url).get
+           lg("URL Crawled: #{url}")
+           if @prgrss
+             @progressbar.increment if @already_crawled.size < @maxPages
+           end
+           new_links_arr = self.fetchLinks(resp.response)
+           if new_links_arr
+             lg("#{new_links_arr.size} new links found")
+             internal_links_arr = self.parseInternalLinks(new_links_arr)
+             new_stuff.push(internal_links_arr)
+             if @fh
+               filez = self.parseFiles(new_links_arr)
+               @fileStack.concat(filez) if !filez.empty?
+               lg("#{filez.size} files found")
+             end
+           end
+         end
+         new_stuff = new_stuff.flatten # all completed requests
+         EventMachine.stop
+       end
+       new_stuff.uniq
+     end
+     def parseFiles(all_links)
+       all_links.select{ |linky| (@file_re =~ linky)}
+     end
+   end
+ end
data/lib/retriever/fetchfiles.rb ADDED
@@ -0,0 +1,70 @@
+ module Retriever
+   class FetchFiles < Fetch
+     attr_reader :fileStack
+     def initialize(url,options)
+       super
+       @fileStack = []
+       all_links = self.fetchLinks(fetchPage(@target))
+       @linkStack = self.parseInternalLinks(all_links)
+       self.lg("#{@linkStack.size-1} new links found")
+
+       tempFileCollection = self.parseFiles(all_links)
+       @fileStack.concat(tempFileCollection) if tempFileCollection.size>0
+       self.lg("#{@fileStack.size} new files found")
+       errlog("Bad URL -- #{@target}") if !@linkStack
+
+       @linkStack.delete(@target) if @linkStack.include?(@target)
+       @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
+
+       self.async_crawl_and_collect()
+
+       @fileStack.sort_by! {|x| x.length}
+       @fileStack.uniq!
+
+       self.dump(self.fileStack)
+       self.write(self.fileStack) if @output #Fetch#write takes only the data; it writes to @output itself
+       self.autodownload()
+     end
+     def download_file(path)
+       arr = path.split('/')
+       shortname = arr.pop
+       puts "Initiating Download to: #{'/rr-downloads/' + shortname}"
+       File.open(shortname, "wb") do |saved_file|
+         # the following "open" is provided by open-uri
+         open(path) do |read_file|
+           saved_file.write(read_file.read)
+         end
+       end
+       puts " SUCCESS: Download Complete"
+     end
+     def autodownload()
+       lenny = @fileStack.count
+       puts "###################"
+       puts "### Initiating Autodownload..."
+       puts "###################"
+       puts "#{lenny} - #{@file_ext}'s Located"
+       puts "###################"
+       if File::directory?("rr-downloads")
+         Dir.chdir("rr-downloads")
+       else
+         puts "creating rr-downloads Directory"
+         Dir.mkdir("rr-downloads")
+         Dir.chdir("rr-downloads")
+       end
+       file_counter = 0
+       @fileStack.each do |entry|
+         begin
+           self.download_file(entry)
+           file_counter+=1
+           lg(" File [#{file_counter} of #{lenny}]")
+           puts
+         rescue StandardError => e
+           puts "ERROR: failed to download - #{entry}"
+           puts e.message
+           puts
+         end
+       end
+       Dir.chdir("..")
+     end
+   end
+ end
data/lib/retriever/fetchsitemap.rb ADDED
@@ -0,0 +1,25 @@
+ module Retriever
+   class FetchSitemap < Fetch
+     attr_reader :sitemap
+     def initialize(url,options)
+       super
+       @sitemap = [@target]
+       @linkStack = self.parseInternalLinks(self.fetchLinks(fetchPage(@target)))
+       self.lg("#{@linkStack.size-1} new links found")
+       errlog("Bad URL -- #{@target}") if !@linkStack
+
+       @linkStack.delete(@target) if @linkStack.include?(@target)
+       @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
+       @sitemap.concat(@linkStack)
+
+       self.async_crawl_and_collect()
+
+       @sitemap.sort_by! {|x| x.length} if @sitemap.size>1
+       @sitemap.uniq!
+       @sitemap = @sitemap.take(@maxPages) if (@sitemap.size+1 > @maxPages)
+
+       self.dump(self.sitemap)
+       self.write(self.sitemap) if @output #Fetch#write takes only the data; it writes to @output itself
+     end
+   end
+ end
data/lib/retriever/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Retriever
+   VERSION = '0.0.8'
+ end
data/readme.md ADDED
@@ -0,0 +1,79 @@
+ RubyRetriever [![Gem Version](https://badge.fury.io/rb/RubyRetriever.svg)](http://badge.fury.io/rb/RubyRetriever)
+ ==============
+
+ Now an official RubyGem! --make sure to use camel-casing--
+ ```sh
+ gem install RubyRetriever
+ ```
+
+ Update (5/25):
+ Version 0.06 - Switches to using a Bloom Filter to keep track of already-visited pages. I saw this in [Arachnid](https://github.com/dchuk/Arachnid) and realized it's a much better idea for performance, so I implemented it immediately. Hat tip [dchuk](https://github.com/dchuk/)
+
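Under the hood, the visited-page set is a native bloom filter rather than an array. A minimal sketch of the pattern used in lib/retriever/fetch.rb (the example URLs here are placeholders):

```ruby
require 'bloomfilter-rb'

# fast, constant-memory membership test for "have we crawled this URL yet?"
already_crawled = BloomFilter::Native.new(:size => 1_000_000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)

already_crawled.insert("http://www.example.com/")
already_crawled.include?("http://www.example.com/")      #=> true
already_crawled.include?("http://www.example.com/other") #=> false (with high probability)
```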
+ About
+ =====
+
+ RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader, and an all-around nice buddy to have around.
+ Soon to add some high-level scraping options.
+
+ RubyRetriever uses asynchronous HTTP requests, thanks to EventMachine and Synchrony fibers, to crawl webpages *very quickly*.
+
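Concretely, each crawl wave fires its GET requests inside a single EventMachine reactor, one fiber per request. A simplified sketch of the pattern behind Fetch#asyncGetWave (the URL list here is a placeholder):

```ruby
require 'em-synchrony'
require 'em-synchrony/em-http'
require 'em-synchrony/fiber_iterator'

urls = ["http://www.example.com/", "http://www.example.com/about"]
concurrency = 10 # RubyRetriever currently launches up to 10 parallel GETs

EM.synchrony do
  # iterate the URL list with up to `concurrency` fibers in flight at once
  EM::Synchrony::FiberIterator.new(urls, concurrency).each do |url|
    resp = EventMachine::HttpRequest.new(url).get
    puts "#{url} -> #{resp.response.length} bytes"
  end
  EventMachine.stop
end
```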
+ This is the 2nd or 3rd reincarnation of the RubyRetriever autodownloader project. It started out as an executable autodownloader, intended for malware research. From there it has morphed into a more well-rounded web crawler and general-purpose file-harvesting utility.
+
+ RubyRetriever does NOT respect robots.txt, and by default it launches up to 10 parallel GET requests at once. This is a feature; do not abuse it. Use at your own risk.
+
+
+ HOW IT WORKS
+ -----------
+ ```sh
+ gem install RubyRetriever
+ rr [MODE] [OPTIONS] Target_URL
+ ```
+
+ **Site Mapper**
+ ```sh
+ rr --sitemap --progress --limit 1000 --output cnet http://www.cnet.com
+ ```
+ OR -- SAME COMMAND
+ ```sh
+ rr -s -p -l 1000 -o cnet http://www.cnet.com
+ ```
+
+ This would crawl http://www.cnet.com, mapping it until it hit the 1,000-page limit, and then write the results to a CSV named cnet.
+
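The same crawl can also be driven from Ruby instead of the shell; bin/rr simply builds an options hash and hands it to the fetcher. A sketch of the programmatic equivalent (same target and filename as above):

```ruby
require 'retriever'

# equivalent of: rr -s -p -l 1000 -o cnet http://www.cnet.com
Retriever::FetchSitemap.new("http://www.cnet.com",
                            :sitemap  => true,
                            :progress => true,
                            :maxpages => "1000",
                            :filename => "cnet")
```

The constructor runs the entire crawl and writes cnet.csv itself, mirroring what the executable does.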
+ **File Harvesting**
+ ```sh
+ rr --files --ext pdf --progress --limit 1000 --output hubspot http://www.hubspot.com
+ ```
+ OR -- SAME COMMAND
+ ```sh
+ rr -f -e pdf -p -l 1000 -o hubspot http://www.hubspot.com
+ ```
+
+ This would crawl http://www.hubspot.com looking for filetype:PDF until it hit the 1,000-page limit, write the list of file paths to a CSV named hubspot, and then try to download each of those files into a new 'rr-downloads' folder.
+
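And a sketch of the programmatic equivalent of the file-harvest command above:

```ruby
require 'retriever'

# equivalent of: rr -f -e pdf -p -l 1000 -o hubspot http://www.hubspot.com
Retriever::FetchFiles.new("http://www.hubspot.com",
                          :fileharvest => true,
                          :file_ext    => "pdf",
                          :progress    => true,
                          :maxpages    => "1000",
                          :filename    => "hubspot")
```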
+
+ command-line arguments
+ -----------------------
+ Usage: rr [MODE] [OPTIONS] Target_URL
+
+ Where MODE FLAG is either:
+ -s, --sitemap
+ -f, --files
+
+ and OPTIONS are any of the following:
+ -o, --out FILENAME *Dump output to selected filename*
+ -p, --progress *Outputs a progressbar*
+ -v, --verbose *Output more information*
+ -l, --limit PAGE_LIMIT_# *set a max on the total number of crawled pages*
+ -e, --ext FILE_EXTENSION *set a file extension to look for on crawled pages*
+ -h, --help *Display this screen*
+
+ Current Requirements
+ ------------
+ em-synchrony
+ em-http-request
+ ruby-progressbar
+ bloomfilter-rb
+
+ License
+ -------
+ See included 'LICENSE' file. It's the MIT license.
data/spec/retriever_spec.rb ADDED
@@ -0,0 +1,65 @@
+ require_relative '../lib/retriever'
+
+ r = Retriever::Fetch.new("http://www.cnet.com/reviews/",{:file_ext => "exe",:maxpages => "100"})
+ test_html = "<a href='www.cnet.com/download.exe'>download</a>
+ http://www.google.com
+ <a href='/test.html'>test</a>
+ <a href='http://www.cnet.com/products/gadgets#view-comments'>gadgets comments</a>
+ <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'></a>
+ <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
+ <a href='http://www.yahoo.com/test/'>yahoo</a>
+ test.com
+ <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
+ <a href='cpage_18'>about</a>"
+
+ doc = r.fetchPage(r.target)
+ links_collection = r.fetchLinks(test_html)
+ filtered_links = r.parseInternalLinks(links_collection)
+ file_list = r.parseFiles(links_collection)
+
+ describe "Fetch" do
+
+   describe "#new" do
+     it "sets target, host, and max page vars" do
+       expect(r.target).to eq("http://www.cnet.com/reviews/")
+       expect(r.host).to eq("www.cnet.com")
+       expect(r.maxPages).to eq(100)
+     end
+   end
+
+   describe "#fetchPage" do
+     it "opens URL and returns source as String" do
+       expect(doc.class).to eq(String)
+     end
+   end
+
+   describe "#fetchLinks" do
+     it "collects all unique href links on the page" do
+       expect(links_collection).to have(6).items
+     end
+     it "returns relative urls with full path based on hostname" do
+       expect(links_collection).to include("http://www.cnet.com/test.html","http://www.cnet.com/cpage_18")
+     end
+   end
+
+   describe "#parseInternalLinks" do
+     it "filters links by host" do
+       filtered_links.each do |link|
+         expect(link).to include(r.host)
+       end
+     end
+     it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
+       filtered_links.each do |link|
+         expect(link).to_not include(".css",".js",".png",".gif",".jpg")
+       end
+     end
+   end
+   describe "#parseFiles" do
+     it "filters links by filetype" do
+       file_list.each do |link|
+         expect(link).to include(".exe")
+       end
+     end
+   end
+
+ end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,17 @@
+ # This file was generated by the `rspec --init` command. Conventionally, all
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
+ # Require this file using `require "spec_helper"` to ensure that it is only
+ # loaded once.
+ #
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+ RSpec.configure do |config|
+   config.treat_symbols_as_metadata_keys_with_true_values = true
+   config.run_all_when_everything_filtered = true
+   config.filter_run :focus
+
+   # Run specs in random order to surface order dependencies. If you find an
+   # order dependency and want to debug it, you can fix the order by providing
+   # the seed, which is printed after each run.
+   # --seed 1234
+   config.order = 'random'
+ end
metadata ADDED
@@ -0,0 +1,153 @@
+ --- !ruby/object:Gem::Specification
+ name: rubyretriever
+ version: !ruby/object:Gem::Version
+   version: 0.0.8
+ platform: ruby
+ authors:
+ - Joe Norton
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-05-25 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: em-synchrony
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: em-http-request
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: ruby-progressbar
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: bloomfilter-rb
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1.6'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1.6'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '10.3'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '10.3'
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '2.14'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '2.14'
+ description: General purpose web crawler, site mapper, and file harvester
+ email:
+ - joe@softwarebyjoe.com
+ executables:
+ - rr
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - LICENSE
+ - bin/rr
+ - lib/retriever.rb
+ - lib/retriever/fetch.rb
+ - lib/retriever/fetchfiles.rb
+ - lib/retriever/fetchsitemap.rb
+ - lib/retriever/version.rb
+ - readme.md
+ - spec/retriever_spec.rb
+ - spec/spec_helper.rb
+ homepage: http://github.com/joenorton/rubyretriever
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: 1.8.6
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: 1.3.6
+ requirements: []
+ rubyforge_project: rubyretriever
+ rubygems_version: 2.2.2
+ signing_key:
+ specification_version: 4
+ summary: Ruby Web Crawler & File Harvester
+ test_files: []