rubyretriever 1.0.3 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 0aa827221b6c3034f4463c376b29e47b740580e6
- data.tar.gz: c800c5820d62e45c140dea2d94140a3a9636aeff
+ metadata.gz: 02c2b2530b3b83eb12325443c3c9214d977d8a56
+ data.tar.gz: f1a8b163ae3c3caed750eacb7724c2c38693ccd2
  SHA512:
- metadata.gz: fba8ef21412309bdfe3435caf8fe4ec01d197cce5cb1698fc9536bc8127bcd9c45d51c6a908bb642c5bfb3ff9caca1e73f98c78669036dc94c38704412a0461a
- data.tar.gz: 092058e59d4c591be1d5ceab99dfa4219f86929897d2d1ab25859d03bf918622f4289ab8bca5fff4e87d6b9af228af80ac129deec1122e18d2f33db38544dea6
+ metadata.gz: 68c56e76fff7cee17b2e48251413df199cbe46df84bb8b51f0333510d9507627f59714bd7c4fb4a502796cefa76d362b9dbe912ea1a4d419b356f0de403e606a
+ data.tar.gz: 2f149643ba65999c783bf17bbcbffc92648eec3979f610df2fdd101c55a5a58607c2087f47911d65457d41567f5f574b79ad0a25cfac82e61c7255c1170e3e64
data/bin/rr CHANGED
@@ -4,73 +4,73 @@ require 'retriever'
  require 'optparse'

  options = {}
- optparse = OptionParser.new do|opts|
+ optparse = OptionParser.new do |opts|
  # Set a banner, displayed at the top
  # of the help screen.
- opts.banner = "Usage: rr [MODE FLAG] [options] Target_URL"
- options[:sitemap] = false
- opts.on( '-s', '--sitemap [FORMAT]', 'MODE FLAG: Sitemap mode' ) do |output_type|
- options[:sitemap] = output_type||''
+ opts.banner = 'Usage: rr [MODE FLAG] [options] Target_URL'
+ options['sitemap'] = false
+ opts.on('-s', '--sitemap [FORMAT]', 'MODE FLAG: Sitemap mode') do |output_type|
+ options['sitemap'] = output_type || ''
  end
- options[:fileharvest] = false
- opts.on( '-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode' ) do |file_ext|
- options[:fileharvest] = file_ext
+ options['fileharvest'] = false
+ opts.on('-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode') do |file_ext|
+ options['fileharvest'] = file_ext
  end
- options[:seo] = false
- opts.on( '-e', '--seo', 'MODE FLAG: SEO mode' ) do
- options[:seo] = true
+ options['seo'] = false
+ opts.on('-e', '--seo', 'MODE FLAG: SEO mode') do
+ options['seo'] = true
  end
- options[:filename] = nil
- opts.on( '-o', '--out FILENAME', 'Dump output to selected filename' ) do|filename|
- options[:filename] = filename
+ options['filename'] = nil
+ opts.on('-o', '--out FILENAME', 'Dump output to selected filename') do |filename|
+ options['filename'] = filename
  end
  # Define the options, and what they do
- options[:verbose] = false
- opts.on( '-v', '--verbose', 'Output more information' ) do
- options[:verbose] = true
+ options['verbose'] = false
+ opts.on('-v', '--verbose', 'Output more information') do
+ options['verbose'] = true
  end
- options[:progress] = false
- opts.on( '-p', '--progress', 'Output progress bar' ) do
- options[:progress] = true
+ options['progress'] = false
+ opts.on('-p', '--progress', 'Output progress bar') do
+ options['progress'] = true
  end
- options[:maxpages] = false
- opts.on( '-l', '--limit PAGE_LIMIT_#', 'set a max on the total number of crawled pages' ) do |maxpages|
+ options['maxpages'] = false
+ opts.on('-l', '--limit PAGE_LIMIT_#', 'set a max on the total number of crawled pages') do |maxpages|
  options[:maxpages] = maxpages
  end
- options[:autodown] = false
- opts.on( '-a', '--auto', 'Automatically download all files of filetype located' ) do
+ options['autodown'] = false
+ opts.on('-a', '--auto', 'Automatically download all files of filetype located') do
  options[:autodown] = true
  end
  # This displays the help screen, all programs are
  # assumed to have this option.
- opts.on( '-h', '--help', 'Display this screen' ) do
+ opts.on('-h', '--help', 'Display this screen') do
  puts opts
  exit
  end
  end
-
- optparse.parse!
+
+ optparse.parse!
  if ARGV[0].nil?
- abort("###Missing Required Argument\nUsage: rr [mode] [options] Target_URL")
+ abort('###Missing Required Argument\nUsage: rr [mode] [options] Target_URL')
  end

  ARGV.each do|q|
  if options[:verbose]
- puts "###############################"
- puts "### [RubyRetriever]"
- puts "### Creating Sitemap" if options[:sitemap]
- puts "### Outputting in format: #{options[:sitemap]}" if options[:sitemap]
- puts "### Performing File Harvest" if options[:fileharvest]
- puts "### Searching for file extension: #{options[:fileharvest]} pages" if (options[:fileharvest])
- puts "### Performing SEO Scrape" if options[:seo]
- puts "### Writing output to filename: #{options[:filename]}" if options[:filename]
- puts "### Being verbose"
- puts "### Stopping after #{options[:maxpages]} pages"
+ puts '###############################'
+ puts '### [RubyRetriever]'
+ puts '### Creating Sitemap' if options['sitemap']
+ puts "### Outputting in format: #{options['sitemap']}" if options['sitemap']
+ puts '### Performing File Harvest' if options['fileharvest']
+ puts "### Searching for file extension: #{options['fileharvest']} pages" if options['fileharvest']
+ puts '### Performing SEO Scrape' if options['seo']
+ puts "### Writing output to filename: #{options['filename']}" if options['filename']
+ puts '### Being verbose'
+ puts "### Stopping after #{options['maxpages']} pages"
  end
- puts "###############################"
+ puts '###############################'
  puts "### [RubyRetriever] go fetch #{q}"
  Retriever::CLI.new(q, options)
- puts "### [RubyRetriever] is done."
- puts "###############################"
+ puts '### [RubyRetriever] is done.'
+ puts '###############################'
  puts
  end
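
The option parser above defines the full command-line surface of the new bin/rr. For reference, an illustrative invocation of each mode; the target URL and output filename are placeholders:

    rr --sitemap xml --progress http://www.example.com
    rr --files pdf --auto --out example-pdfs http://www.example.com
    rr --seo --verbose http://www.example.com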
data/lib/retriever/cli.rb CHANGED
@@ -1,27 +1,21 @@
  module Retriever
- class CLI
- def initialize(url,options)
-
- #kick off the fetch mode of choice
- if options[:fileharvest]
- @fetch = Retriever::FetchFiles.new(url, options)
- elsif options[:sitemap]
- @fetch = Retriever::FetchSitemap.new(url, options)
- elsif options[:seo]
- @fetch = Retriever::FetchSEO.new(url, options)
- else
- fail "### Error: No Mode Selected"
- end
-
- #all fetch modes
- @fetch.dump
- @fetch.write if options[:filename]
-
- #fileharvest only
- @fetch.autodownload if options[:autodown] && options[:fileharvest]
-
- #sitemap only
- @fetch.gen_xml if /XML/i =~ options[:sitemap].to_s
- end
- end
- end
+ #
+ class CLI
+ def initialize(url, options)
+ # kick off the fetch mode of choice
+ if options['fileharvest']
+ @fetch = Retriever::FetchFiles.new(url, options)
+ elsif options['sitemap']
+ @fetch = Retriever::FetchSitemap.new(url, options)
+ elsif options['seo']
+ @fetch = Retriever::FetchSEO.new(url, options)
+ else
+ fail '### Error: No Mode Selected'
+ end
+ @fetch.dump
+ @fetch.write if options['filename']
+ @fetch.autodownload if options['autodown'] && options['fileharvest']
+ @fetch.gen_xml if /XML/i =~ options['sitemap'].to_s
+ end
+ end
+ end
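
The rewritten CLI keys the options hash with strings rather than symbols. A minimal programmatic sketch of the same dispatch, assuming the API exactly as it appears in this diff; the URL and option values are placeholders:

    require 'retriever'

    # Equivalent of `rr --seo --verbose http://www.example.com`:
    # 'seo' selects Retriever::FetchSEO, then dump prints the results.
    options = { 'seo' => true, 'verbose' => true, 'filename' => nil }
    Retriever::CLI.new('http://www.example.com', options)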
@@ -7,191 +7,214 @@ require 'csv'
  require 'bloomfilter-rb'

  module Retriever
- class Fetch
- attr_reader :maxPages, :t
+ #
+ class Fetch
+ attr_reader :max_pages, :t
+ # given target URL and RR options, creates a fetch object.
+ # There is no direct output
+ # this is a parent class that the other fetch classes build off of.
+ def initialize(url, options)
+ @connection_tally = {
+ :success => 0,
+ :error => 0,
+ :error_client => 0,
+ :error_server => 0
+ }
+ # OPTIONS
+ @prgrss = options['progress']
+ @max_pages = options['maxpages'] ? options['maxpages'].to_i : 100
+ @v = options['verbose']
+ @output = options['filename']
+ @fh = options['fileharvest']
+ @file_ext = @fh.to_s
+ @s = options['sitemap']
+ @seo = options['seo']
+ @autodown = options['autodown']
+ #
+ if @fh
+ temp_ext_str = '.' + @file_ext + '\z'
+ @file_re = Regexp.new(temp_ext_str).freeze
+ else
+ # when FH is not true, and autodown is true
+ errlog('Cannot AUTODOWNLOAD when not in FILEHARVEST MODE') if @autodown
+ end
+ if @prgrss
+ # verbose & progressbar conflict
+ errlog('CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p') if @v
+ prgress_vars = {
+ :title => 'Pages',
+ :starting_at => 1,
+ :total => @max_pages,
+ :format => '%a |%b>%i| %c/%C %t'
+ }
+ @progressbar = ProgressBar.create(prgress_vars)
+ end
+ @t = Retriever::Target.new(url, @file_re)
+ @output = "rr-#{@t.host.split('.')[1]}" if @fh && !@output
+ @already_crawled = BloomFilter::Native.new(
+ :size => 1_000_000,
+ :hashes => 5,
+ :seed => 1,
+ :bucket => 8,
+ :raise => false
+ )
+ @already_crawled.insert(@t.target)
+ end

- def initialize(url,options) #given target URL and RR options, creates a fetch object. There is no direct output, this is a parent class that the other fetch classes build off of.
- @connection_tally = {
- :success => 0,
- :error => 0,
- :error_client => 0,
- :error_server => 0
- }
- #OPTIONS
- @prgrss = options[:progress] ? options[:progress] : false
- @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
- @v= options[:verbose] ? true : false
- @output=options[:filename] ? options[:filename] : false
- @fh = options[:fileharvest] ? options[:fileharvest] : false
- @file_ext = @fh.to_s
- @s = options[:sitemap] ? options[:sitemap] : false
- @seo = options[:seo] ? true : false
- @autodown = options[:autodown] ? true : false
- #
- if @fh
- tempExtStr = "."+@file_ext+'\z'
- @file_re = Regexp.new(tempExtStr).freeze
- else
- errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
- end
- if @prgrss
- errlog("CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p") if @v #verbose & progressbar conflict
- prgressVars = {
- :title => "Pages Crawled",
- :starting_at => 1,
- :total => @maxPages,
- :format => '%a |%b>%i| %c/%C %t',
- }
- @progressbar = ProgressBar.create(prgressVars)
- end
- @t = Retriever::Target.new(url,@file_re)
- @already_crawled = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
- @already_crawled.insert(@t.target)
- if (@fh && !@output)
- @output = "rr-#{@t.host.split('.')[1]}"
- end
- fail "bad page source on target -- try HTTPS?" if !@t.source
- end
- def errlog(msg)
- raise "ERROR: #{msg}"
- end
- def lg(msg)
- puts "### #{msg}" if @v
- end
- def dump #prints current data collection to STDOUT, meant for CLI use.
- puts "###############################"
- if @v
- puts "Connection Tally:"
- puts @connection_tally.to_s
- puts "###############################"
- end
- if @s
- puts "#{@t.target} Sitemap"
- puts "Page Count: #{@data.size}"
- elsif @fh
- puts "Target URL: #{@t.target}"
- puts "Filetype: #{@file_ext}"
- puts "File Count: #{@data.size}"
- elsif @seo
- puts "#{@t.target} SEO Metrics"
- puts "Page Count: #{@data.size}"
- else
- fail "ERROR - Cannot dump - Mode Not Found"
- end
- puts "###############################"
- @data.each do |line|
- puts line
- end
- puts "###############################"
- puts
- end
- def write #writes current data collection out to CSV in current directory
- if @output
- i = 0
- CSV.open("#{@output}.csv", "w") do |csv|
- if ((i == 0) && @seo)
- csv << ['URL','Page Title','Meta Description','H1','H2']
- i +=1
- end
- @data.each do |entry|
- csv << entry
- end
- end
- puts "###############################"
- puts "File Created: #{@output}.csv"
- puts "Object Count: #{@data.size}"
- puts "###############################"
- puts
- end
- end
- def async_crawl_and_collect() #iterates over the excisting @linkStack, running asyncGetWave on it until we reach the @maxPages value.
- while (@already_crawled.size < @maxPages)
- if @linkStack.empty?
- if @prgrss
- @progressbar.log("Can't find any more links. Site might be completely mapped.")
- else
- lg("Can't find any more links. Site might be completely mapped.")
- end
- break;
- end
- new_links_arr = self.asyncGetWave()
- next if (new_links_arr.nil? || new_links_arr.empty?)
- new_link_arr = new_links_arr-@linkStack #set operations to see are these in our previous visited pages arr?
- @linkStack.concat(new_links_arr).uniq!
- @data.concat(new_links_arr) if @s
- end
- @progressbar.finish if @prgrss #if we are done, let's make sure progress bar says we are done
- end
- def good_response?(resp, url) #returns true is resp is ok to continue process, false is we need to 'next' it
- return false if !resp
- if resp.response_header.redirection? #we got redirected
- loc = resp.response_header.location
- lg("#{url} Redirected to #{loc}")
- if t.host_re =~ loc #if being redirected to same host, let's add to linkstack
- @linkStack.push(loc) if !@already_crawled.include?(loc) #but only if we haven't already crawled it
- lg("--Added to linkStack for later")
- return false
- end
- lg("Redirection outside of target host. No - go. #{loc}")
- return false
- end
- if (!resp.response_header.successful?) #if webpage is not text/html, let's not continue and lets also make sure we dont re-queue it
- lg("UNSUCCESSFUL CONNECTION -- #{url}")
- @connection_tally[:error] += 1
- @connection_tally[:error_server] += 1 if resp.response_header.server_error?
- @connection_tally[:error_client] += 1 if resp.response_header.client_error?
- return false
- end
- if (!(resp.response_header['CONTENT_TYPE'].include?("text/html"))) #if webpage is not text/html, let's not continue and lets also make sure we dont re-queue it
- @already_crawled.insert(url)
- @linkStack.delete(url)
- lg("Page Not text/html -- #{url}")
- return false
- end
- @connection_tally[:success] += 1
- return true
- end
+ def errlog(msg)
+ fail "ERROR: #{msg}"
+ end

- def asyncGetWave() #send a new wave of GET requests, using current @linkStack
- new_stuff = []
- EM.synchrony do
- lenny = 0
- concurrency = 10
- EM::Synchrony::FiberIterator.new(@linkStack, concurrency).each do |url|
- next if (@already_crawled.size >= @maxPages)
- if @already_crawled.include?(url)
- @linkStack.delete(url)
- next
- end
- resp = EventMachine::HttpRequest.new(url).get
- next if !good_response?(resp,url)
- new_page = Retriever::Page.new(resp.response,@t)
- lg("Page Fetched: #{url}")
- @already_crawled.insert(url)
- if @prgrss
- @progressbar.increment if @already_crawled.size < @maxPages
- end
- if @seo
- seos = [url]
- seos.concat(new_page.parseSEO)
- @data.push(seos)
- lg("--page SEO scraped")
- end
- if new_page.links
- lg("--#{new_page.links.size} links found")
- internal_links_arr = new_page.parseInternalVisitable
- new_stuff.push(internal_links_arr)
- if @fh
- filez = new_page.parseFiles
- @data.concat(filez) if !filez.empty?
- lg("--#{filez.size} files found")
- end
- end
- end
- new_stuff = new_stuff.flatten # all completed requests
- EventMachine.stop
- end
- new_stuff.uniq!
- end
- end
- end
+ def lg(msg)
+ puts "### #{msg}" if @v
+ end
+
+ # prints current data collection to STDOUT
+ def dump
+ puts '###############################'
+ if @v
+ puts 'Connection Tally:'
+ puts @connection_tally.to_s
+ puts '###############################'
+ end
+ if @s
+ puts "#{@t.target} Sitemap"
+ puts "Page Count: #{@data.size}"
+ elsif @fh
+ puts "Target URL: #{@t.target}"
+ puts "Filetype: #{@file_ext}"
+ puts "File Count: #{@data.size}"
+ elsif @seo
+ puts "#{@t.target} SEO Metrics"
+ puts "Page Count: #{@data.size}"
+ else
+ fail 'ERROR - Cannot dump - Mode Not Found'
+ end
+ puts '###############################'
+ @data.each do |line|
+ puts line
+ end
+ puts '###############################'
+ puts
+ end
+
+ # writes current data collection out to CSV in current directory
+ def write
+ return false unless @output
+ i = 0
+ CSV.open("#{@output}.csv", 'w') do |csv|
+ if (i == 0) && @seo
+ csv << ['URL', 'Page Title', 'Meta Description', 'H1', 'H2']
+ i += 1
+ end
+ @data.each do |entry|
+ csv << entry
+ end
+ end
+ puts '###############################'
+ puts "File Created: #{@output}.csv"
+ puts "Object Count: #{@data.size}"
+ puts '###############################'
+ puts
+ end
+
+ # iterates over the existing @link_stack
+ # running until we reach the @max_pages value.
+ def async_crawl_and_collect
+ while @already_crawled.size < @max_pages
+ if @link_stack.empty?
+ if @prgrss
+ @progressbar.log("Can't find any more links.")
+ else
+ lg("Can't find any more links.")
+ end
+ break
+ end
+ new_links_arr = process_link_stack
+ next if new_links_arr.nil? || new_links_arr.empty?
+ # set operations to see are these in our previous visited pages arr
+ new_links_arr -= @link_stack
+ @link_stack.concat(new_links_arr).uniq!
+ @data.concat(new_links_arr) if @s
+ end
+ # done, make sure progress bar says we are done
+ @progressbar.finish if @prgrss
+ end
+
+ # returns true is resp is ok to continue
+ def good_response?(resp, url)
+ return false unless resp
+ hdr = resp.response_header
+ if hdr.redirection?
+ loc = hdr.location
+ lg("#{url} Redirected to #{loc}")
+ if t.host_re =~ loc
+ @link_stack.push(loc) unless @already_crawled.include?(loc)
+ lg('--Added to linkStack for later')
+ return false
+ end
+ lg("Redirection outside of target host. No - go. #{loc}")
+ return false
+ end
+ # lets not continue if unsuccessful connection
+ unless hdr.successful?
+ lg("UNSUCCESSFUL CONNECTION -- #{url}")
+
+ @connection_tally[:error] += 1
+ @connection_tally[:error_server] += 1 if hdr.server_error?
+ @connection_tally[:error_client] += 1 if hdr.client_error?
+ return false
+ end
+ # let's not continue if not text/html
+ unless hdr['CONTENT_TYPE'].include?('text/html')
+ @already_crawled.insert(url)
+ @link_stack.delete(url)
+ lg("Page Not text/html -- #{url}")
+ return false
+ end
+ @connection_tally[:success] += 1
+ true
+ end
+
+ # send a new wave of GET requests, using current @link_stack
+ def process_link_stack
+ new_stuff = []
+ EM.synchrony do
+ concurrency = 10
+ EM::Synchrony::FiberIterator.new(@link_stack, concurrency).each do |url|
+ next if @already_crawled.size >= @max_pages
+ next if @already_crawled.include?(url)
+
+ resp = EventMachine::HttpRequest.new(url).get
+
+ next unless good_response?(resp, url)
+ lg("Page Fetched: #{url}")
+ @already_crawled.insert(url)
+
+ new_page = Retriever::Page.new(resp.response, @t)
+ if @prgrss
+ @progressbar.increment if @already_crawled.size < @max_pages
+ end
+ if @seo
+ seos = [url]
+ seos.concat(new_page.parse_seo)
+ @data.push(seos)
+ lg('--page SEO scraped')
+ end
+ next if new_page.links.size == 0
+ lg("--#{new_page.links.size} links found")
+ internal_links_arr = new_page.parse_internal_visitable
+ new_stuff.push(internal_links_arr)
+ if @fh
+ filez = new_page.parse_files
+ @data.concat(filez) unless filez.empty?
+ lg("--#{filez.size} files found")
+ end
+ end
+ new_stuff = new_stuff.flatten # all completed requests
+ EventMachine.stop
+ end
+ new_stuff.uniq!
+ end
+ end
+ end
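
The crawl bookkeeping above now reads its options with string keys and builds @already_crawled from bloomfilter-rb. A standalone sketch of how that filter behaves with the same parameters; the URLs are placeholders:

    require 'bloomfilter-rb'

    # Same configuration Fetch#initialize passes to BloomFilter::Native.
    already_crawled = BloomFilter::Native.new(
      :size   => 1_000_000,
      :hashes => 5,
      :seed   => 1,
      :bucket => 8,
      :raise  => false
    )
    already_crawled.insert('http://www.example.com/')
    already_crawled.include?('http://www.example.com/')      # => true
    already_crawled.include?('http://www.example.com/about') # => false (very likely; false positives are possible)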
@@ -1,65 +1,70 @@
  module Retriever
- class FetchFiles < Fetch
- def initialize(url,options) #recieves target url and RR options, returns an array of all unique files (based on given filetype) found on the site
- super
- @data = []
- page_one = Retriever::Page.new(@t.source,@t)
- @linkStack = page_one.parseInternalVisitable
- lg("URL Crawled: #{@t.target}")
- lg("#{@linkStack.size-1} new links found")
+ # recieves target url and RR options
+ # returns an array of all unique files (based on given filetype)
+ # found on the target site
+ class FetchFiles < Fetch
+ def initialize(url, options)
+ super
+ @data = []
+ page_one = Retriever::Page.new(@t.source, @t)
+ @link_stack = page_one.parse_internal_visitable
+ lg("URL Crawled: #{@t.target}")
+ lg("#{@link_stack.size - 1} new links found")

- tempFileCollection = page_one.parseFiles
- @data.concat(tempFileCollection) if tempFileCollection.size>0
- lg("#{@data.size} new files found")
- errlog("Bad URL -- #{@t.target}") if !@linkStack
+ temp_file_collection = page_one.parse_files
+ @data.concat(tempFileCollection) if temp_file_collection.size > 0
+ lg("#{@data.size} new files found")
+ errlog("Bad URL -- #{@t.target}") unless @link_stack
+ @link_stack.delete(@t.target)

- @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
- @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
+ async_crawl_and_collect

- self.async_crawl_and_collect()
+ @data.sort_by! { |x| x.length }
+ @data.uniq!
+ end

- @data.sort_by! {|x| x.length}
- @data.uniq!
- end
- def download_file(path) #given valid url, downloads file to current directory in /rr-downloads/
- arr = path.split('/')
- shortname = arr.pop
- puts "Initiating Download to: #{'/rr-downloads/' + shortname}"
- File.open(shortname, "wb") do |saved_file|
- open(path) do |read_file|
- saved_file.write(read_file.read)
- end
- end
- puts " SUCCESS: Download Complete"
- end
- def autodownload() #when autodownload option is true, this will automatically go through the fetched file URL collection and download each one.
- lenny = @data.count
- puts "###################"
- puts "### Initiating Autodownload..."
- puts "###################"
- puts "#{lenny} - #{@file_ext}'s Located"
- puts "###################"
- if File::directory?("rr-downloads")
- Dir.chdir("rr-downloads")
- else
- puts "creating rr-downloads Directory"
- Dir.mkdir("rr-downloads")
- Dir.chdir("rr-downloads")
- end
- file_counter = 0
- @data.each do |entry|
- begin
- self.download_file(entry)
- file_counter+=1
- lg(" File [#{file_counter} of #{lenny}]")
- puts
- rescue StandardError => e
- puts "ERROR: failed to download - #{entry}"
- puts e.message
- puts
- end
- end
- Dir.chdir("..")
- end
- end
- end
+ def download_file(path)
+ # given valid url, downloads file to current directory in /rr-downloads/
+ arr = path.split('/')
+ shortname = arr.pop
+ puts "Initiating Download to: '/rr-downloads/' + #{shortname}"
+ File.open(shortname, 'wb') do |saved_file|
+ open(path) do |read_file|
+ saved_file.write(read_file.read)
+ end
+ end
+ puts ' SUCCESS: Download Complete'
+ end
+
+ def autodownload
+ # go through the fetched file URL collection and download each one.
+ lenny = @data.count
+ puts '###################'
+ puts '### Initiating Autodownload...'
+ puts '###################'
+ puts "#{lenny} - #{@file_ext}'s Located"
+ puts '###################'
+ if File.directory?('rr-downloads')
+ Dir.chdir('rr-downloads')
+ else
+ puts 'creating rr-downloads Directory'
+ Dir.mkdir('rr-downloads')
+ Dir.chdir('rr-downloads')
+ end
+ file_counter = 0
+ @data.each do |entry|
+ begin
+ download_file(entry)
+ file_counter += 1
+ lg(' File [#{file_counter} of #{lenny}]')
+ puts
+ rescue StandardError => e
+ puts 'ERROR: failed to download - #{entry}'
+ puts e.message
+ puts
+ end
+ end
+ Dir.chdir('..')
+ end
+ end
+ end
@@ -1,23 +1,25 @@
  module Retriever
- class FetchSEO < Fetch
- def initialize(url,options) #recieves target url and RR options, returns an array of onpage SEO related fields on all unique pages found on the site
- super
- @data = []
- page_one = Retriever::Page.new(@t.source,@t)
- @linkStack = page_one.parseInternalVisitable
- lg("URL Crawled: #{@t.target}")
- lg("#{@linkStack.size-1} new links found")
+ #
+ class FetchSEO < Fetch
+ # recieves target url and RR options
+ # returns an array of onpage SEO related fields
+ # on all unique pages found on the site
+ def initialize(url, options)
+ super
+ @data = []
+ page_one = Retriever::Page.new(@t.source, @t)
+ lg("URL Crawled: #{@t.target}")

- @data.push(page_one.parseSEO)
- lg("#{@data.size} pages scraped")
- errlog("Bad URL -- #{@t.target}") if !@linkStack
+ @link_stack = page_one.parse_internal_visitable
+ errlog("Bad URL -- #{@t.target}") unless @link_stack
+ lg("#{@link_stack.size - 1} links found")
+ @link_stack.delete(@t.target)

- @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
- @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
+ @data.push(page_one.parse_seo)

- self.async_crawl_and_collect()
+ async_crawl_and_collect

- @data.sort_by! {|x| x[0].length}
- end
- end
- end
+ @data.sort_by! { |x| x[0].length }
+ end
+ end
+ end
@@ -1,36 +1,41 @@
  module Retriever
- class FetchSitemap < Fetch
- def initialize(url,options) #recieves target URL and RR options, returns an array of all unique pages found on the site
- super
- @data = [@t.target]
- page_one = Retriever::Page.new(@t.source,@t)
- @linkStack = page_one.parseInternalVisitable
- lg("URL Crawled: #{@t.target}")
- lg("#{@linkStack.size-1} new links found")
- errlog("Bad URL -- #{@t.target}") if !@linkStack
+ #
+ class FetchSitemap < Fetch
+ # recieves target URL and RR options
+ # returns an array of all unique pages found on the site
+ def initialize(url, options)
+ super
+ @data = [@t.target]
+ page_one = Retriever::Page.new(@t.source, @t)
+ lg("URL Crawled: #{@t.target}")
+ @link_stack = page_one.parse_internal_visitable
+ errlog("Bad URL -- #{@t.target}") unless @link_stack
+ lg("#{@link_stack.size - 1} links found")

- @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
- @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
- @data.concat(@linkStack)
+ @link_stack.delete(@t.target)
+ @data.concat(@link_stack)

- self.async_crawl_and_collect()
+ async_crawl_and_collect

- @data.sort_by! {|x| x.length} if @data.size>1
- @data.uniq!
- end
- def gen_xml #produces valid XML sitemap based on page collection fetched. Writes to current directory.
- f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
- f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
- @data.each do |url|
- f << "<url><loc>#{url}</loc></url>"
- end
- f << "</urlset>"
- f.close
- puts "###############################"
- puts "File Created: sitemap-#{@t.host.split('.')[1]}.xml"
- puts "Object Count: #{@data.size}"
- puts "###############################"
- puts
- end
- end
- end
+ @data.sort_by! { |x| x.length } if @data.size > 1
+ @data.uniq!
+ end
+
+ # produces valid XML sitemap based on page collection fetched.
+ # Writes to current directory.
+ def gen_xml
+ f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
+ f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
+ @data.each do |url|
+ f << "<url><loc>#{url}</loc></url>"
+ end
+ f << '</urlset>'
+ f.close
+ puts '###############################'
+ puts "File Created: sitemap-#{@t.host.split('.')[1]}.xml"
+ puts "Object Count: #{@data.size}"
+ puts '###############################'
+ puts
+ end
+ end
+ end
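
Per gen_xml above, the sitemap is written to the current directory as a single line of XML. A hedged usage sketch, assuming the constructor and option keys exactly as shown in this diff; the domain is a placeholder:

    require 'retriever'

    # Crawls the placeholder site, then writes sitemap-example.xml, i.e.
    # <?xml version='1.0' encoding='UTF-8'?><urlset ...><url><loc>...</loc></url>...</urlset>
    fetch = Retriever::FetchSitemap.new('http://www.example.com', 'sitemap' => 'xml')
    fetch.gen_xml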
@@ -1,4 +1,5 @@
  module Retriever
+ #
  class Link
  HTTP_RE = Regexp.new(/^http/i).freeze
  SINGLE_SLASH_RE = Regexp.new(/^\/{1}[^\/]/).freeze
@@ -18,12 +19,15 @@ module Retriever

  return "http://#{host}#{link}" if SINGLE_SLASH_RE =~ link

- return "http:#{link}" if DOUBLE_SLASH_RE =~ link #link begins with '//' (maybe a messed up link?)
+ # link begins with '//'
+ return "http:#{link}" if DOUBLE_SLASH_RE =~ link

- return "http://#{host}/#{link}" if NO_SLASH_PAGE_RE =~ link #link uses relative path with no slashes at all, people actually this - imagine that.
+ # link uses relative path with no slashes at all
+ return "http://#{host}/#{link}" if NO_SLASH_PAGE_RE =~ link
  end

  private
+
  attr_reader :host, :link
  end
  end
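
The branches above resolve partial hrefs against the page's host. An illustrative set of inputs and outputs, assuming the regex constants not shown in this hunk behave as their names suggest and that Link is constructed the way Page#links does (Link.new(host, link).path); the host and paths are made up:

    host = 'www.example.com'

    Retriever::Link.new(host, '/about').path
    # => "http://www.example.com/about"          (single leading slash)
    Retriever::Link.new(host, '//cdn.example.com/app.js').path
    # => "http://cdn.example.com/app.js"         (protocol-relative '//')
    Retriever::Link.new(host, 'contact.html').path
    # => "http://www.example.com/contact.html"   (relative path, no slash)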
@@ -1,6 +1,7 @@
  module OpenURI
- def OpenURI.redirectable?(uri1, uri2) #nesc patch otherwise OPENURI blocks redirects to and from https
+ # nesc patch otherwise OPENURI blocks redirects to and from https
+ def OpenURI.redirectable?(uri1, uri2)
  uri1.scheme.downcase == uri2.scheme.downcase ||
  (/\A(?:http|https)\z/i =~ uri1.scheme && /\A(?:http|https)\z/i =~ uri2.scheme)
  end
- end
+ end
@@ -1,7 +1,6 @@
  module Retriever
-
+ #
  class Page
-
  HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
  NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico|xml)/ix).freeze
  HTTP_RE = Regexp.new(/^http/i).freeze
@@ -14,55 +13,55 @@ module Retriever

  attr_reader :links, :source, :t

- def initialize(source,t)
+ def initialize(source, t)
  @t = t
  @source = source.encode('UTF-8', :invalid => :replace, :undef => :replace)
  @links = nil
  end

- #recieves page source as string
- #returns array of unique href links
+ # recieves page source as string
+ # returns array of unique href links
  def links
  return @links if @links
- return false if !@source
- @links = @source.scan(HREF_CONTENTS_RE).map do |match| #filter some malformed URLS that come in, this is meant to be a loose filter to catch all reasonable HREF attributes.
+ return false unless @source
+ @links = @source.scan(HREF_CONTENTS_RE).map do |match|
+ # filter some malformed URLS that come in
+ # meant to be a loose filter to catch all reasonable HREF attributes.
  link = match[0]
  Link.new(@t.host, link).path
  end.uniq
  end

- def parseInternal
- links.select{ |linky| (@t.host_re =~ linky) }
+ def parse_internal
+ links.select { |linky| (@t.host_re =~ linky) }
  end

- def parseInternalVisitable
- parseInternal.select{ |linky| (!(NONPAGE_EXT_RE =~linky)) }
+ def parse_internal_visitable
+ parse_internal.select { |linky| (!(NONPAGE_EXT_RE =~ linky)) }
  end

- def parseFiles
- links.select{ |linky| (@t.file_re =~ linky)}
+ def parse_files
+ links.select { |linky| (@t.file_re =~ linky) }
  end

  def title
- TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : ""
+ TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : ''
  end

  def desc
- DESC_RE =~ @source ? @source.match(DESC_RE)[1] : ""
+ DESC_RE =~ @source ? @source.match(DESC_RE)[1] : ''
  end

  def h1
- H1_RE =~ @source ? @source.match(H1_RE)[1] : ""
+ H1_RE =~ @source ? @source.match(H1_RE)[1] : ''
  end

  def h2
- H2_RE =~ @source ? @source.match(H2_RE)[1] : ""
+ H2_RE =~ @source ? @source.match(H2_RE)[1] : ''
  end

- def parseSEO
- return [title,desc,h1,h2]
+ def parse_seo
+ [title, desc, h1, h2]
  end
-
  end
-
  end
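
A short sketch of how the renamed Page API reads after this change; the HTML string and target URL are placeholders, and the annotations describe intent rather than exact return values:

    require 'retriever'

    html = "<html><head><title>Example</title></head>" \
           "<body> <a href='http://www.example.com/about'>about</a></body></html>"
    t    = Retriever::Target.new('http://www.example.com')
    page = Retriever::Page.new(html, t)

    page.links                    # every href found in the source, resolved via Link#path
    page.parse_internal_visitable # same-host links, minus css/js/image/etc. URLs
    page.parse_seo                # => [title, desc, h1, h2]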
@@ -1,52 +1,44 @@
  require 'open-uri'

  module Retriever
-
+ #
  class Target
-
  HTTP_RE = Regexp.new(/^http/i).freeze
  DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
-
+
  attr_reader :host, :target, :host_re, :source, :file_re

- def initialize(url,file_re=nil)
- url = "http://#{url}" if (!(HTTP_RE =~ url))
- fail "Bad URL" if (!(/\./ =~ url))
+ def initialize(url, file_re = nil)
+ url = "http://#{url}" unless HTTP_RE =~ url
+ fail 'Bad URL' unless /\./ =~ url
  new_uri = URI(url)
  @target = new_uri.to_s
  @host = new_uri.host
- @host_re = Regexp.new(@host.sub('www.',''))
+ @host_re = Regexp.new(@host.sub('www.', ''))
  @file_re ||= file_re
  end

  def source
- resp = false
- begin
- resp = open(@target)
- rescue StandardError => e
- trap("ABRT"){
- puts "#{@target} failed SSL Certification Verification"
- }
- return false
- end
+ resp = open(@target)
  resp_url = resp.base_uri.to_s
- if (@target != resp_url)
- if @host_re =~ resp_url #if redirect URL is same hose, we want to re-sync our target with the right URL
- new_t = Retriever::Target.new(resp_url)
- @target = new_t.target
- @host = new_t.host
- return new_t.source
- end
- fail "Domain redirecting to new host: #{resp.base_uri.to_s}" #if it's not same host, we want to fail
+ if @target != resp_url
+ fail "Domain redirecting: #{resp_url}" unless @host_re =~ resp_url
+ # if redirect URL is same host, we want to re-sync @target
+ return resync_target_and_return_source(resp_url)
  end
  resp = resp.read
- if resp == ""
- fail "Domain is not working. Try the non-WWW version."
- end
- fail "Domain not working. Try HTTPS???" if !resp
- return resp.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace) #consider using scrub from ruby 2.1? this misses some things
+ #
+ fail 'Domain is not working. Try the non-WWW version.' if resp == ''
+ fail 'Domain not working. Try HTTPS???' unless resp
+ # consider using scrub from ruby 2.1? this misses some things
+ resp.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace)
  end

+ def resync_target_and_return_source(url)
+ new_t = Retriever::Target.new(url)
+ @target = new_t.target
+ @host = new_t.host
+ new_t.source
+ end
  end
-
  end
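
How the reworked Target behaves, per the code above; the domain is a placeholder and calling source performs a real open-uri request:

    require 'retriever'

    t = Retriever::Target.new('www.example.com') # scheme is added when missing
    t.target  # => "http://www.example.com"
    t.host    # => "www.example.com"
    t.host_re # => /example.com/
    html = t.source # GET via open-uri; a same-host redirect re-syncs @target and @host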
@@ -1,3 +1,3 @@
  module Retriever
- VERSION = '1.0.3'
+ VERSION = '1.1.0'
  end
data/lib/retriever.rb CHANGED
@@ -8,6 +8,6 @@ require 'retriever/target'
  require 'retriever/page'
  require 'retriever/openuri-redirect-patch'

+ #
  module Retriever
-
- end
+ end
data/readme.md CHANGED
@@ -4,13 +4,14 @@

  By Joe Norton

- RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader, and all around nice buddy to have around.
+ RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader.

- RubyRetriever uses aynchronous HTTP requests, thanks to eventmachine and Synchrony fibers, to crawl webpages *very quickly*.
+ RubyRetriever (RR) uses asynchronous HTTP requests, thanks to [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony), to crawl webpages *very quickly*. Another neat thing about RR, is it uses a ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of page's it has already crawled.

- RubyRetriever does NOT respect robots.txt, and RubyRetriever currently - by default - launches up to 10 parallel GET requests at once. This is a feature, do not abuse it. Use at own risk.
+ **Use at Own Risk**
+ RR does NOT respect robots.txt, and RR currently - by default - launches up to 10 parallel GET requests at once. This is a feature, do not abuse it.

- v1.0 Update 6/07/2014 - Includes major code changes, a lot of bug fixes. Much better in dealing with redirects, and issues with the host changing, etc. Also, added the SEO mode -- which grabs a number of key SEO components from every page on a site. Lastly, this upate was so extensive that I could not ensure backward compatibility -- and thus, this was update 1.0!
+ **v1.0 Update (6/07/2014)** - Includes major code changes, a lot of bug fixes. Much better in dealing with redirects, and issues with the host changing, etc. Also, added the SEO mode -- which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility -- and thus, this was update 1.0!


  getting started
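
The crawl pattern the readme describes (EventMachine plus Synchrony fibers, up to 10 GETs in flight) reduces to the loop below. A self-contained sketch with placeholder URLs, mirroring process_link_stack in this diff rather than reproducing it:

    require 'em-synchrony'
    require 'em-synchrony/em-http'
    require 'em-synchrony/fiber_iterator'

    urls = ['http://www.example.com/', 'http://www.example.com/about']

    EM.synchrony do
      # Up to 10 URLs are fetched at a time, each GET running in its own fiber.
      EM::Synchrony::FiberIterator.new(urls, 10).each do |url|
        resp = EventMachine::HttpRequest.new(url).get
        puts "#{url} -> #{resp.response_header.status}"
      end
      EventMachine.stop
    end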
data/spec/page_spec.rb CHANGED
@@ -20,8 +20,8 @@ SOURCE
  end
  end

- describe "#parseInternal" do
- let (:links){Retriever::Page.new(@source,t).parseInternal}
+ describe "#parse_internal" do
+ let (:links){Retriever::Page.new(@source,t).parse_internal}
  it "filters links by host" do
  @source = (<<SOURCE).strip
  <a href='http://www.cnet.com/'>download</a>
@@ -32,8 +32,8 @@ SOURCE
  end
  end

- describe "#parseInternalVisitable" do
- let (:links){Retriever::Page.new(@source,t).parseInternalVisitable}
+ describe "#parse_internal_visitable" do
+ let (:links){Retriever::Page.new(@source,t).parse_internal_visitable}
  it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
  @source = (<<SOURCE).strip
  <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
@@ -43,7 +43,7 @@ SOURCE
  end

  describe "#parseFiles" do
- let (:links){Retriever::Page.new(@source,t).parseFiles}
+ let (:links){Retriever::Page.new(@source,t).parse_files}
  it "filters links by filetype" do
  @source = (<<SOURCE).strip
  <a href='www.cnet.com/download.exe'>download</a>
@@ -90,5 +90,4 @@ SOURCE
  expect(page.h2).to eq(' test 4 ')
  end
  end
-
- end
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: rubyretriever
  version: !ruby/object:Gem::Version
- version: 1.0.3
+ version: 1.1.0
  platform: ruby
  authors:
  - Joe Norton
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2014-06-08 00:00:00.000000000 Z
+ date: 2014-06-10 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: em-synchrony