rubyretriever 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: e4e4773d62ec74a55bb3d9afc78622229e7db559
+   data.tar.gz: a2186b11bb3eabdec8c312e0b3365961c97805de
+ SHA512:
+   metadata.gz: ae392a910a3c7a6f2b3f9097d82978ba35ba413da46f4e5ce30b9e728bac2fb40f61c3a110ea59b9f59d75d79b0fcf85a8a783597aeaccd31c833b2c3753bd4d
+   data.tar.gz: 72fe87613059ccae6022c65ad5db70445cc4470028d081e918ffb9dfc122adb5dd8c9967abe2622a8daaeea59d878340183597a73768f32dd42679b7b159d64d
data/LICENSE ADDED
@@ -0,0 +1,20 @@
+ 2014 (c) Joseph Michael Norton - 'Joe Norton' - SoftwareByJoe.com
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/bin/rr ADDED
@@ -0,0 +1,80 @@
+ #! /usr/bin/env ruby
+ require_relative('../lib/retriever.rb')
+ options = {}
+ optparse = OptionParser.new do |opts|
+   # Set a banner, displayed at the top
+   # of the help screen.
+   opts.banner = "Usage: rr [options] Target_URL"
+
+   options[:filename] = nil
+   opts.on( '-o', '--out FILENAME', 'Dump output to selected filename' ) do |filename|
+     options[:filename] = filename
+   end
+   # Define the options, and what they do
+   options[:verbose] = false
+   opts.on( '-v', '--verbose', 'Output more information' ) do
+     options[:verbose] = true
+   end
+
+   options[:progress] = false
+   opts.on( '-p', '--progressbar', 'Output a progressbar' ) do
+     options[:progress] = true
+   end
+
+   options[:sitemap] = false
+   opts.on( '-s', '--sitemap', 'Crawl site and output sitemap' ) do
+     options[:sitemap] = true
+   end
+
+   options[:fileharvest] = false
+   opts.on( '-f', '--files', 'Crawl site and collect links for files found' ) do
+     options[:fileharvest] = true
+   end
+
+   options[:maxpages] = false
+   opts.on( '-l', '--limit PAGE_LIMIT_#', 'set a max on the total number of crawled pages' ) do |maxpages|
+     options[:maxpages] = maxpages
+   end
+
+   options[:file_ext] = false
+   opts.on( '-e', '--ext FILE_EXTENSION', 'set a file extension to look for on crawled pages' ) do |file_ext|
+     options[:file_ext] = file_ext
+   end
+
+   options[:autodown] = false
+   opts.on( '-a', '--autodown', 'Automatically download all files of filetype located' ) do
+     options[:autodown] = true
+   end
+
+   # This displays the help screen, all programs are
+   # assumed to have this option.
+   opts.on( '-h', '--help', 'Display this screen' ) do
+     puts opts
+     exit
+   end
+ end
+
+ optparse.parse!
+ if ARGV[0].nil?
+   abort("###Missing Required Argument\nUsage: rr [mode] [options] Target_URL")
+ end
+
+ ARGV.each do |q|
+   if options[:verbose]
+     puts "###############################"
+     puts "### [RubyRetriever]"
+     puts "### Creating Sitemap" if options[:sitemap]
+     puts "### Performing File Harvest" if options[:fileharvest]
+     puts "### Searching for file extension: #{options[:file_ext]} pages" if (options[:file_ext])
+     puts "### Writing output to filename: #{options[:filename]}" if options[:filename]
+     puts "### Being verbose"
+     puts "### Stopping after #{options[:maxpages]} pages" if options[:maxpages]
+   end
+   puts "###############################"
+   puts "### [RubyRetriever] go fetch #{q}"
+   test = Retriever::FetchFiles.new(q, options) if options[:fileharvest]
+   test = Retriever::FetchSitemap.new(q, options) if options[:sitemap]
+   puts "### [RubyRetriever] is done."
+   puts "###############################"
+   puts
+ end
data/lib/retriever.rb ADDED
@@ -0,0 +1,19 @@
+ ##################################################################
+ #####RubyRetriever -- web crawler and file harvester
+ #####created by Joe Norton
+ #####http://softwarebyjoe.com
+ ##LICENSING: MIT License##################################
+ #! usr/bin/ruby
+ require 'em-synchrony'
+ require 'em-synchrony/em-http'
+ require 'em-synchrony/fiber_iterator'
+ require 'ruby-progressbar'
+ require 'open-uri'
+ require 'optparse'
+ require 'uri'
+ require 'csv'
+ require 'bloomfilter-rb'
+
+ require 'retriever/fetch'
+ require 'retriever/fetchfiles'
+ require 'retriever/fetchsitemap'
data/lib/retriever/fetch.rb ADDED
@@ -0,0 +1,196 @@
+ module Retriever
+   class Fetch
+     attr_reader :target, :host, :host_re, :maxPages
+     #constants
+     HTTP_RE = Regexp.new(/^http/i).freeze
+     HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
+     NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico)/ix).freeze
+     SINGLE_SLASH_RE = Regexp.new(/^\/{1}[^\/]/).freeze
+     DOUBLE_SLASH_RE = Regexp.new(/^\/{2}[^\/]/).freeze
+     NO_SLASH_PAGE_RE = Regexp.new(/^[a-z0-9\-\_\=\?\.]+\z/ix).freeze
+     DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+
+     def initialize(url,options)
+       new_uri = URI(url)
+       @target = new_uri.to_s
+       @host = new_uri.host
+       #OPTIONS
+       @prgrss = options[:progress] ? options[:progress] : false
+       @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
+       @v = options[:verbose] ? true : false
+       @output = options[:filename] ? options[:filename] : false
+       @fh = options[:fileharvest] ? true : false
+       @s = options[:sitemap] ? true : false
+       @file_ext = options[:file_ext] ? options[:file_ext] : false
+       @autodown = options[:autodown] ? true : false
+       #
+       @host_re = Regexp.new(host).freeze
+       if @fh
+         errlog("Please provide a FILETYPE. It is required for file harvest mode.") if !@file_ext
+         tempExtStr = "."+@file_ext+'\z'
+         @file_re = Regexp.new(tempExtStr).freeze
+       else
+         errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
+       end
+       if @prgrss
+         errlog("CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p") if @v #verbose & progressbar conflict
+         prgressVars = {
+           :title => "Pages Crawled",
+           :starting_at => 1,
+           :total => @maxPages,
+           :format => '%a |%b>%i| %c/%C %t',
+         }
+         @progressbar = ProgressBar.create(prgressVars)
+       end
+       @already_crawled = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
+       @already_crawled.insert(@target)
+     end
+     def errlog(msg)
+       raise "ERROR: #{msg}"
+     end
+     def lg(msg)
+       puts "### #{msg}" if @v
+     end
+     def dump(data)
+       puts "###############################"
+       if @s
+         puts "#{@target} Sitemap"
+         puts "Page Count: #{data.size}"
+       elsif @fh
+         puts "Target URL: #{@target}"
+         puts "Filetype: #{@file_ext}"
+         puts "File Count: #{data.size}"
+       else
+         puts "ERROR"
+       end
+       puts "###############################"
+       puts data
+       puts "###############################"
+       puts
+     end
+     def write(data)
+       if @output
+         CSV.open("#{@output}.csv", "w") do |csv|
+           data.each do |entry|
+             csv << [entry]
+           end
+         end
+         puts "###############################"
+         puts "File Created: #{@output}.csv"
+         puts "Object Count: #{data.size}"
+         puts "###############################"
+         puts
+       end
+     end
+     def fetchPage(url)
+       resp = false
+       EM.synchrony do
+         begin
+           resp = EventMachine::HttpRequest.new(url).get
+         rescue StandardError => e
+           #puts e.message + " ## " + url
+           #the trap ABRT is necessary to handle the SSL error
+           #for some ungodly reason it's the only way I found to handle it
+           trap("ABRT"){
+             puts "#{url} failed SSL Certification Verification"
+           }
+           return false
+         end
+         lg("URL Crawled: #{url}")
+         EventMachine.stop
+       end
+       if resp.response == ""
+         errlog("Domain is not working. Try the non-WWW version.")
+       end
+       return resp.response.encode('UTF-8', :invalid => :replace, :undef => :replace) #ran into issues with some sites without forcing UTF-8 encoding, and also issues with it. Not sure atm.
+     end
+     #receives page source as string
+     #returns array of unique href links
+     def fetchLinks(doc)
+       return false if !doc
+       linkArray = []
+       doc.scan(HREF_CONTENTS_RE) do |arr| #filter some malformed URLs that come in; this is meant to be a loose filter to catch all reasonable HREF attributes.
+         link = arr[0]
+         if (!(HTTP_RE =~ link))
+           if (DUB_DUB_DUB_DOT_RE =~ link)
+             link = "http://#{link}"
+           elsif SINGLE_SLASH_RE =~ link #link uses relative path
+             link = "http://#{@host}"+link #prepending hostname to relative paths
+           elsif DOUBLE_SLASH_RE =~ link #link begins with '//' (protocol-relative, or maybe a messed up link?)
+             link = "http:#{link}" #prepending protocol to protocol-relative paths
+           elsif (NO_SLASH_PAGE_RE =~ link) #link uses relative path with no slashes at all; people actually do this - imagine that.
+             link = "http://#{@host}"+"/"+link #prepending hostname and slash to create full paths
+           else
+             next
+           end
+         end
+         linkArray.push(link)
+       end
+       linkArray.uniq!
+     end
+     def parseInternalLinks(all_links)
+       if all_links
+         all_links.select{ |linky| (@host_re =~ linky && (!(NONPAGE_EXT_RE =~ linky)))}
+       else
+         return false
+       end
+     end
+     def async_crawl_and_collect()
+       while (@already_crawled.size < @maxPages)
+         if @linkStack.empty?
+           if @prgrss
+             @progressbar.log("Can't find any more links. Site might be completely mapped.")
+           else
+             lg("Can't find any more links. Site might be completely mapped.")
+           end
+           break;
+         end
+         #puts "New loop"
+         #puts @linkStack
+         new_links_arr = self.asyncGetWave()
+         next if (new_links_arr.nil? || new_links_arr.empty?)
+         new_links_arr = new_links_arr - @linkStack #set operation to drop links already queued in a previous wave
+         @linkStack.concat(new_links_arr)
+         @sitemap.concat(new_links_arr) if @s
+       end
+     end
+     def asyncGetWave() #send a new wave of GET requests, using current @linkStack
+       new_stuff = []
+       EM.synchrony do
+         lenny = 0
+         concurrency = 10
+         EM::Synchrony::FiberIterator.new(@linkStack, concurrency).each do |url|
+           next if (@already_crawled.size >= @maxPages)
+           if @already_crawled.include?(url)
+             @linkStack.delete(url)
+             next
+           else
+             @already_crawled.insert(url)
+           end
+           resp = EventMachine::HttpRequest.new(url).get
+           lg("URL Crawled: #{url}")
+           if @prgrss
+             @progressbar.increment if @already_crawled.size < @maxPages
+           end
+           new_links_arr = self.fetchLinks(resp.response)
+           if new_links_arr
+             lg("#{new_links_arr.size} new links found")
+             internal_links_arr = self.parseInternalLinks(new_links_arr)
+             new_stuff.push(internal_links_arr)
+             if @fh
+               filez = self.parseFiles(new_links_arr)
+               @fileStack.concat(filez) if !filez.empty?
+               lg("#{filez.size} files found")
+             end
+           end
+         end
+         new_stuff = new_stuff.flatten # all completed requests
+         EventMachine.stop
+       end
+       new_stuff.uniq!
+     end
+     def parseFiles(all_links)
+       all_links.select{ |linky| (@file_re =~ linky)}
+     end
+   end
+ end
data/lib/retriever/fetchfiles.rb ADDED
@@ -0,0 +1,70 @@
+ module Retriever
+   class FetchFiles < Fetch
+     attr_reader :fileStack
+     def initialize(url,options)
+       super
+       @fileStack = []
+       all_links = self.fetchLinks(fetchPage(@target))
+       @linkStack = self.parseInternalLinks(all_links)
+       self.lg("#{@linkStack.size-1} new links found")
+
+       tempFileCollection = self.parseFiles(all_links)
+       @fileStack.concat(tempFileCollection) if tempFileCollection.size>0
+       self.lg("#{@fileStack.size} new files found")
+       errlog("Bad URL -- #{@target}") if !@linkStack
+
+       @linkStack.delete(@target) if @linkStack.include?(@target)
+       @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
+
+       self.async_crawl_and_collect()
+
+       @fileStack.sort_by! {|x| x.length}
+       @fileStack.uniq!
+
+       self.dump(self.fileStack)
+       self.write(self.fileStack) if @output
+       self.autodownload()
+     end
+     def download_file(path)
+       arr = path.split('/')
+       shortname = arr.pop
+       puts "Initiating Download to: #{'/rr-downloads/' + shortname}"
+       File.open(shortname, "wb") do |saved_file|
+         # the following "open" is provided by open-uri
+         open(path) do |read_file|
+           saved_file.write(read_file.read)
+         end
+       end
+       puts " SUCCESS: Download Complete"
+     end
+     def autodownload()
+       lenny = @fileStack.count
+       puts "###################"
+       puts "### Initiating Autodownload..."
+       puts "###################"
+       puts "#{lenny} - #{@file_ext}'s Located"
+       puts "###################"
+       if File::directory?("rr-downloads")
+         Dir.chdir("rr-downloads")
+       else
+         puts "creating rr-downloads Directory"
+         Dir.mkdir("rr-downloads")
+         Dir.chdir("rr-downloads")
+       end
+       file_counter = 0
+       @fileStack.each do |entry|
+         begin
+           self.download_file(entry)
+           file_counter+=1
+           lg(" File [#{file_counter} of #{lenny}]")
+           puts
+         rescue StandardError => e
+           puts "ERROR: failed to download - #{entry}"
+           puts e.message
+           puts
+         end
+       end
+       Dir.chdir("..")
+     end
+   end
+ end
data/lib/retriever/fetchsitemap.rb ADDED
@@ -0,0 +1,25 @@
+ module Retriever
+   class FetchSitemap < Fetch
+     attr_reader :sitemap
+     def initialize(url,options)
+       super
+       @sitemap = [@target]
+       @linkStack = self.parseInternalLinks(self.fetchLinks(fetchPage(@target)))
+       self.lg("#{@linkStack.size-1} new links found")
+       errlog("Bad URL -- #{@target}") if !@linkStack
+
+       @linkStack.delete(@target) if @linkStack.include?(@target)
+       @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
+       @sitemap.concat(@linkStack)
+
+       self.async_crawl_and_collect()
+
+       @sitemap.sort_by! {|x| x.length} if @sitemap.size>1
+       @sitemap.uniq!
+       @sitemap = @sitemap.take(@maxPages) if (@sitemap.size+1 > @maxPages)
+
+       self.dump(self.sitemap)
+       self.write(self.sitemap) if @output
+     end
+   end
+ end
data/lib/retriever/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Retriever
+   VERSION = '0.0.8'
+ end
data/readme.md ADDED
@@ -0,0 +1,79 @@
+ RubyRetriever [![Gem Version](https://badge.fury.io/rb/RubyRetriever.svg)](http://badge.fury.io/rb/RubyRetriever)
+ ==============
+
+ Now an official RubyGem! --make sure to use camel-casing--
+ ```sh
+ gem install RubyRetriever
+ ```
+
+ Update (5/25):
+ Version 0.06 - Switches to using a Bloom Filter to keep track of past 'visited pages'. I saw this in [Arachnid](https://github.com/dchuk/Arachnid) and realized it's a much better idea for performance, and implemented it immediately. Hat tip [dchuk](https://github.com/dchuk/).
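
For reference, the visited-page check is just a Bloom filter membership test. The sketch below isolates that pattern with the same constructor arguments that `lib/retriever/fetch.rb` passes to bloomfilter-rb; the URLs are placeholders.

```ruby
require 'bloomfilter-rb'

# Probabilistic "have we crawled this URL yet?" set, sized as in Fetch#initialize.
visited = BloomFilter::Native.new(:size => 1_000_000, :hashes => 5,
                                  :seed => 1, :bucket => 8, :raise => false)

visited.insert("http://www.cnet.com/")
visited.include?("http://www.cnet.com/")      #=> true
visited.include?("http://www.cnet.com/news/") #=> false (with high probability)
```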
+
+ About
+ =====
+
+ RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader, and an all-around nice buddy to have around.
+ Soon to add some high-level scraping options.
+
+ RubyRetriever uses asynchronous HTTP requests, thanks to eventmachine and Synchrony fibers, to crawl webpages *very quickly*.
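
The crawl waves are driven by em-synchrony's `FiberIterator`, the same pattern `Fetch#asyncGetWave` uses internally. A minimal standalone sketch, assuming the em-synchrony and em-http-request gems are installed (the URL list is made up; 10 matches the concurrency hard-coded in the crawler):

```ruby
require 'em-synchrony'
require 'em-synchrony/em-http'
require 'em-synchrony/fiber_iterator'

urls = %w[http://www.cnet.com/ http://www.cnet.com/reviews/ http://www.cnet.com/news/]

EM.synchrony do
  concurrency = 10
  EM::Synchrony::FiberIterator.new(urls, concurrency).each do |url|
    resp = EventMachine::HttpRequest.new(url).get # yields the fiber while waiting
    puts "#{url} -> #{resp.response.length} bytes"
  end
  EventMachine.stop
end
```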
+
+ This is the 2nd or 3rd reincarnation of the RubyRetriever autodownloader project. It started out as an executable autodownloader, intended for malware research. From there it has morphed to become a more well-rounded web crawler and general-purpose file harvesting utility.
+
+ RubyRetriever does NOT respect robots.txt, and RubyRetriever currently - by default - launches up to 10 parallel GET requests at once. This is a feature, do not abuse it. Use at your own risk.
+
+ HOW IT WORKS
+ -----------
+ ```sh
+ gem install RubyRetriever
+ rr [MODE] [OPTIONS] Target_URL
+ ```
+
+ **Site Mapper**
+ ```sh
+ rr --sitemap --progress --limit 1000 --output cnet http://www.cnet.com
+ ```
+ OR -- SAME COMMAND
+ ```sh
+ rr -s -p -l 1000 -o cnet http://www.cnet.com
+ ```
+
+ This would go to http://www.cnet.com and map it until it crawled a max of 1,000 pages, and then it would write the results out to a CSV named cnet.
+
+ **File Harvesting**
+ ```sh
+ rr --files --ext pdf --progress --limit 1000 --output hubspot http://www.hubspot.com
+ ```
+ OR -- SAME COMMAND
+ ```sh
+ rr -f -e pdf -p -l 1000 -o hubspot http://www.hubspot.com
+ ```
+
+ This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 1,000 pages, then write out a list of filepaths to a CSV named hubspot, and then try to download each of those files to a new 'rr-downloads' folder.
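
Both modes can also be started from Ruby instead of the `rr` binary. A rough sketch of the equivalent calls, using the same option keys that `bin/rr` builds (the URLs and filenames are only examples; the crawl runs inside the constructor):

```ruby
require 'retriever'

# Site map of up to 100 pages, dumped to cnet.csv
Retriever::FetchSitemap.new("http://www.cnet.com",
                            :sitemap => true, :maxpages => "100", :filename => "cnet")

# PDF harvest; :fileharvest requires :file_ext
Retriever::FetchFiles.new("http://www.hubspot.com",
                          :fileharvest => true, :file_ext => "pdf", :maxpages => "100")
```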
+
+
+ command-line arguments
+ -----------------------
+ Usage: rr [MODE] [OPTIONS] Target_URL
+
+ Where MODE FLAG is either:
+ -s, --sitemap
+ -f, --files
+
+ and OPTIONS is the applicable:
+ -o, --out FILENAME *Dump output to selected filename*
+ -p, --progressbar *Outputs a progressbar*
+ -v, --verbose *Output more information*
+ -l, --limit PAGE_LIMIT_# *set a max on the total number of crawled pages*
+ -e, --ext FILE_EXTENSION *set a file extension to look for on crawled pages*
+ -h, --help *Display this screen*
+
+ Current Requirements
+ ------------
+ em-synchrony
+ ruby-progressbar
+ bloomfilter-rb
+
+ License
+ -------
+ See included 'LICENSE' file. It's the MIT license.
data/spec/retriever_spec.rb ADDED
@@ -0,0 +1,65 @@
+ require_relative '../lib/retriever'
+
+ r = Retriever::Fetch.new("http://www.cnet.com/reviews/",{:file_ext => "exe",:maxpages => "100"})
+ test_html = "<a href='www.cnet.com/download.exe'>download</a>
+ http://www.google.com
+ <a href='/test.html'>test</a>
+ <a href='http://www.cnet.com/products/gadgets#view-comments'>gadgets comments</a>
+ <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'></a>
+ <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
+ <a href='http://www.yahoo.com/test/'>yahoo</a>
+ test.com
+ <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
+ <a href='cpage_18'>about</a>"
+
+ doc = r.fetchPage(r.target)
+ links_collection = r.fetchLinks(test_html)
+ filtered_links = r.parseInternalLinks(links_collection)
+ file_list = r.parseFiles(links_collection)
+
+ describe "Fetch" do
+
+   describe "#new" do
+     it "sets target, host, and max page vars" do
+       expect(r.target).to eq("http://www.cnet.com/reviews/")
+       expect(r.host).to eq("www.cnet.com")
+       expect(r.maxPages).to eq(100)
+     end
+   end
+
+   describe "#fetchPage" do
+     it "opens URL and returns source as String" do
+       expect(doc.class).to eq(String)
+     end
+   end
+
+   describe "#fetchLinks" do
+     it "collects all unique href links on the page" do
+       expect(links_collection).to have(6).items
+     end
+     it "returns relative urls with full path based on hostname" do
+       expect(links_collection).to include("http://www.cnet.com/test.html","http://www.cnet.com/cpage_18")
+     end
+   end
+
+   describe "#parseInternalLinks" do
+     it "filters links by host" do
+       filtered_links.each do |link|
+         expect(link).to include(r.host)
+       end
+     end
+     it "filters out 'unvisitable' URLs like JS, stylesheets, images" do
+       filtered_links.each do |link|
+         expect(link).to_not (include(".css",".js",".png",".gif",".jpg"))
+       end
+     end
+   end
+   describe "#parseFiles" do
+     it "filters links by filetype" do
+       file_list.each do |link|
+         expect(link).to include(".exe")
+       end
+     end
+   end
+
+ end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,17 @@
+ # This file was generated by the `rspec --init` command. Conventionally, all
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
+ # Require this file using `require "spec_helper"` to ensure that it is only
+ # loaded once.
+ #
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+ RSpec.configure do |config|
+   config.treat_symbols_as_metadata_keys_with_true_values = true
+   config.run_all_when_everything_filtered = true
+   config.filter_run :focus
+
+   # Run specs in random order to surface order dependencies. If you find an
+   # order dependency and want to debug it, you can fix the order by providing
+   # the seed, which is printed after each run.
+   # --seed 1234
+   config.order = 'random'
+ end
metadata ADDED
@@ -0,0 +1,153 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rubyretriever
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.8
5
+ platform: ruby
6
+ authors:
7
+ - Joe Norton
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-05-25 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: em-synchrony
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: em-http-request
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: ruby-progressbar
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: bloomfilter-rb
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: bundler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ~>
74
+ - !ruby/object:Gem::Version
75
+ version: '1.6'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ~>
81
+ - !ruby/object:Gem::Version
82
+ version: '1.6'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ~>
88
+ - !ruby/object:Gem::Version
89
+ version: '10.3'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ~>
95
+ - !ruby/object:Gem::Version
96
+ version: '10.3'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rspec
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ~>
102
+ - !ruby/object:Gem::Version
103
+ version: '2.14'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ~>
109
+ - !ruby/object:Gem::Version
110
+ version: '2.14'
111
+ description: General purpose web crawler, site mapper, and file harvester
112
+ email:
113
+ - joe@softwarebyjoe.com
114
+ executables:
115
+ - rr
116
+ extensions: []
117
+ extra_rdoc_files: []
118
+ files:
119
+ - LICENSE
120
+ - bin/rr
121
+ - lib/retriever.rb
122
+ - lib/retriever/fetch.rb
123
+ - lib/retriever/fetchfiles.rb
124
+ - lib/retriever/fetchsitemap.rb
125
+ - lib/retriever/version.rb
126
+ - readme.md
127
+ - spec/retriever_spec.rb
128
+ - spec/spec_helper.rb
129
+ homepage: http://github.com/joenorton/rubyretriever
130
+ licenses:
131
+ - MIT
132
+ metadata: {}
133
+ post_install_message:
134
+ rdoc_options: []
135
+ require_paths:
136
+ - lib
137
+ required_ruby_version: !ruby/object:Gem::Requirement
138
+ requirements:
139
+ - - '>='
140
+ - !ruby/object:Gem::Version
141
+ version: 1.8.6
142
+ required_rubygems_version: !ruby/object:Gem::Requirement
143
+ requirements:
144
+ - - '>='
145
+ - !ruby/object:Gem::Version
146
+ version: 1.3.6
147
+ requirements: []
148
+ rubyforge_project: rubyretriever
149
+ rubygems_version: 2.2.2
150
+ signing_key:
151
+ specification_version: 4
152
+ summary: Ruby Web Crawler & File Harvester
153
+ test_files: []