rubyretriever 1.0.3 → 1.1.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 0aa827221b6c3034f4463c376b29e47b740580e6
- data.tar.gz: c800c5820d62e45c140dea2d94140a3a9636aeff
+ metadata.gz: 02c2b2530b3b83eb12325443c3c9214d977d8a56
+ data.tar.gz: f1a8b163ae3c3caed750eacb7724c2c38693ccd2
  SHA512:
- metadata.gz: fba8ef21412309bdfe3435caf8fe4ec01d197cce5cb1698fc9536bc8127bcd9c45d51c6a908bb642c5bfb3ff9caca1e73f98c78669036dc94c38704412a0461a
- data.tar.gz: 092058e59d4c591be1d5ceab99dfa4219f86929897d2d1ab25859d03bf918622f4289ab8bca5fff4e87d6b9af228af80ac129deec1122e18d2f33db38544dea6
+ metadata.gz: 68c56e76fff7cee17b2e48251413df199cbe46df84bb8b51f0333510d9507627f59714bd7c4fb4a502796cefa76d362b9dbe912ea1a4d419b356f0de403e606a
+ data.tar.gz: 2f149643ba65999c783bf17bbcbffc92648eec3979f610df2fdd101c55a5a58607c2087f47911d65457d41567f5f574b79ad0a25cfac82e61c7255c1170e3e64
data/bin/rr CHANGED
@@ -4,73 +4,73 @@ require 'retriever'
  require 'optparse'

  options = {}
- optparse = OptionParser.new do|opts|
+ optparse = OptionParser.new do |opts|
  # Set a banner, displayed at the top
  # of the help screen.
- opts.banner = "Usage: rr [MODE FLAG] [options] Target_URL"
- options[:sitemap] = false
- opts.on( '-s', '--sitemap [FORMAT]', 'MODE FLAG: Sitemap mode' ) do |output_type|
- options[:sitemap] = output_type||''
+ opts.banner = 'Usage: rr [MODE FLAG] [options] Target_URL'
+ options['sitemap'] = false
+ opts.on('-s', '--sitemap [FORMAT]', 'MODE FLAG: Sitemap mode') do |output_type|
+ options['sitemap'] = output_type || ''
  end
- options[:fileharvest] = false
- opts.on( '-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode' ) do |file_ext|
- options[:fileharvest] = file_ext
+ options['fileharvest'] = false
+ opts.on('-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode') do |file_ext|
+ options['fileharvest'] = file_ext
  end
- options[:seo] = false
- opts.on( '-e', '--seo', 'MODE FLAG: SEO mode' ) do
- options[:seo] = true
+ options['seo'] = false
+ opts.on('-e', '--seo', 'MODE FLAG: SEO mode') do
+ options['seo'] = true
  end
- options[:filename] = nil
- opts.on( '-o', '--out FILENAME', 'Dump output to selected filename' ) do|filename|
- options[:filename] = filename
+ options['filename'] = nil
+ opts.on('-o', '--out FILENAME', 'Dump output to selected filename') do |filename|
+ options['filename'] = filename
  end
  # Define the options, and what they do
- options[:verbose] = false
- opts.on( '-v', '--verbose', 'Output more information' ) do
- options[:verbose] = true
+ options['verbose'] = false
+ opts.on('-v', '--verbose', 'Output more information') do
+ options['verbose'] = true
  end
- options[:progress] = false
- opts.on( '-p', '--progress', 'Output progress bar' ) do
- options[:progress] = true
+ options['progress'] = false
+ opts.on('-p', '--progress', 'Output progress bar') do
+ options['progress'] = true
  end
- options[:maxpages] = false
- opts.on( '-l', '--limit PAGE_LIMIT_#', 'set a max on the total number of crawled pages' ) do |maxpages|
+ options['maxpages'] = false
+ opts.on('-l', '--limit PAGE_LIMIT_#', 'set a max on the total number of crawled pages') do |maxpages|
  options[:maxpages] = maxpages
  end
- options[:autodown] = false
- opts.on( '-a', '--auto', 'Automatically download all files of filetype located' ) do
+ options['autodown'] = false
+ opts.on('-a', '--auto', 'Automatically download all files of filetype located') do
  options[:autodown] = true
  end
  # This displays the help screen, all programs are
  # assumed to have this option.
- opts.on( '-h', '--help', 'Display this screen' ) do
+ opts.on('-h', '--help', 'Display this screen') do
  puts opts
  exit
  end
  end
-
- optparse.parse!
+
+ optparse.parse!
  if ARGV[0].nil?
- abort("###Missing Required Argument\nUsage: rr [mode] [options] Target_URL")
+ abort('###Missing Required Argument\nUsage: rr [mode] [options] Target_URL')
  end

  ARGV.each do|q|
  if options[:verbose]
- puts "###############################"
- puts "### [RubyRetriever]"
- puts "### Creating Sitemap" if options[:sitemap]
- puts "### Outputting in format: #{options[:sitemap]}" if options[:sitemap]
- puts "### Performing File Harvest" if options[:fileharvest]
- puts "### Searching for file extension: #{options[:fileharvest]} pages" if (options[:fileharvest])
- puts "### Performing SEO Scrape" if options[:seo]
- puts "### Writing output to filename: #{options[:filename]}" if options[:filename]
- puts "### Being verbose"
- puts "### Stopping after #{options[:maxpages]} pages"
+ puts '###############################'
+ puts '### [RubyRetriever]'
+ puts '### Creating Sitemap' if options['sitemap']
+ puts "### Outputting in format: #{options['sitemap']}" if options['sitemap']
+ puts '### Performing File Harvest' if options['fileharvest']
+ puts "### Searching for file extension: #{options['fileharvest']} pages" if options['fileharvest']
+ puts '### Performing SEO Scrape' if options['seo']
+ puts "### Writing output to filename: #{options['filename']}" if options['filename']
+ puts '### Being verbose'
+ puts "### Stopping after #{options['maxpages']} pages"
  end
- puts "###############################"
+ puts '###############################'
  puts "### [RubyRetriever] go fetch #{q}"
  Retriever::CLI.new(q, options)
- puts "### [RubyRetriever] is done."
- puts "###############################"
+ puts '### [RubyRetriever] is done.'
+ puts '###############################'
  puts
  end
data/lib/retriever/cli.rb CHANGED
@@ -1,27 +1,21 @@
  module Retriever
- class CLI
- def initialize(url,options)
-
- #kick off the fetch mode of choice
- if options[:fileharvest]
- @fetch = Retriever::FetchFiles.new(url, options)
- elsif options[:sitemap]
- @fetch = Retriever::FetchSitemap.new(url, options)
- elsif options[:seo]
- @fetch = Retriever::FetchSEO.new(url, options)
- else
- fail "### Error: No Mode Selected"
- end
-
- #all fetch modes
- @fetch.dump
- @fetch.write if options[:filename]
-
- #fileharvest only
- @fetch.autodownload if options[:autodown] && options[:fileharvest]
-
- #sitemap only
- @fetch.gen_xml if /XML/i =~ options[:sitemap].to_s
- end
- end
- end
+ #
+ class CLI
+ def initialize(url, options)
+ # kick off the fetch mode of choice
+ if options['fileharvest']
+ @fetch = Retriever::FetchFiles.new(url, options)
+ elsif options['sitemap']
+ @fetch = Retriever::FetchSitemap.new(url, options)
+ elsif options['seo']
+ @fetch = Retriever::FetchSEO.new(url, options)
+ else
+ fail '### Error: No Mode Selected'
+ end
+ @fetch.dump
+ @fetch.write if options['filename']
+ @fetch.autodownload if options['autodown'] && options['fileharvest']
+ @fetch.gen_xml if /XML/i =~ options['sitemap'].to_s
+ end
+ end
+ end
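
Note on the 1.1.0 option keys: bin/rr and Retriever::CLI now pass string keys ('sitemap', 'fileharvest', 'seo', and so on) instead of symbols. A minimal, hypothetical sketch of driving the dispatcher directly, mirroring the options hash bin/rr builds; the target URL and option values below are illustrative only:

    require 'retriever'

    # Illustrative options hash, assuming the string keys bin/rr sets in 1.1.0.
    options = {
      'sitemap'  => 'xml',   # sitemap mode; the /XML/i match also triggers gen_xml
      'filename' => nil,     # no CSV dump
      'verbose'  => false,
      'maxpages' => '50'     # Fetch#initialize calls .to_i on this value
    }
    Retriever::CLI.new('http://www.example.com', options)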
@@ -7,191 +7,214 @@ require 'csv'
  require 'bloomfilter-rb'

  module Retriever
- class Fetch
- attr_reader :maxPages, :t
+ #
+ class Fetch
+ attr_reader :max_pages, :t
+ # given target URL and RR options, creates a fetch object.
+ # There is no direct output
+ # this is a parent class that the other fetch classes build off of.
+ def initialize(url, options)
+ @connection_tally = {
+ :success => 0,
+ :error => 0,
+ :error_client => 0,
+ :error_server => 0
+ }
+ # OPTIONS
+ @prgrss = options['progress']
+ @max_pages = options['maxpages'] ? options['maxpages'].to_i : 100
+ @v = options['verbose']
+ @output = options['filename']
+ @fh = options['fileharvest']
+ @file_ext = @fh.to_s
+ @s = options['sitemap']
+ @seo = options['seo']
+ @autodown = options['autodown']
+ #
+ if @fh
+ temp_ext_str = '.' + @file_ext + '\z'
+ @file_re = Regexp.new(temp_ext_str).freeze
+ else
+ # when FH is not true, and autodown is true
+ errlog('Cannot AUTODOWNLOAD when not in FILEHARVEST MODE') if @autodown
+ end
+ if @prgrss
+ # verbose & progressbar conflict
+ errlog('CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p') if @v
+ prgress_vars = {
+ :title => 'Pages',
+ :starting_at => 1,
+ :total => @max_pages,
+ :format => '%a |%b>%i| %c/%C %t'
+ }
+ @progressbar = ProgressBar.create(prgress_vars)
+ end
+ @t = Retriever::Target.new(url, @file_re)
+ @output = "rr-#{@t.host.split('.')[1]}" if @fh && !@output
+ @already_crawled = BloomFilter::Native.new(
+ :size => 1_000_000,
+ :hashes => 5,
+ :seed => 1,
+ :bucket => 8,
+ :raise => false
+ )
+ @already_crawled.insert(@t.target)
+ end

- def initialize(url,options) #given target URL and RR options, creates a fetch object. There is no direct output, this is a parent class that the other fetch classes build off of.
- @connection_tally = {
- :success => 0,
- :error => 0,
- :error_client => 0,
- :error_server => 0
- }
- #OPTIONS
- @prgrss = options[:progress] ? options[:progress] : false
- @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
- @v= options[:verbose] ? true : false
- @output=options[:filename] ? options[:filename] : false
- @fh = options[:fileharvest] ? options[:fileharvest] : false
- @file_ext = @fh.to_s
- @s = options[:sitemap] ? options[:sitemap] : false
- @seo = options[:seo] ? true : false
- @autodown = options[:autodown] ? true : false
- #
- if @fh
- tempExtStr = "."+@file_ext+'\z'
- @file_re = Regexp.new(tempExtStr).freeze
- else
- errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
- end
- if @prgrss
- errlog("CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p") if @v #verbose & progressbar conflict
- prgressVars = {
- :title => "Pages Crawled",
- :starting_at => 1,
- :total => @maxPages,
- :format => '%a |%b>%i| %c/%C %t',
- }
- @progressbar = ProgressBar.create(prgressVars)
- end
- @t = Retriever::Target.new(url,@file_re)
- @already_crawled = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
- @already_crawled.insert(@t.target)
- if (@fh && !@output)
- @output = "rr-#{@t.host.split('.')[1]}"
- end
- fail "bad page source on target -- try HTTPS?" if !@t.source
- end
- def errlog(msg)
- raise "ERROR: #{msg}"
- end
- def lg(msg)
- puts "### #{msg}" if @v
- end
- def dump #prints current data collection to STDOUT, meant for CLI use.
- puts "###############################"
- if @v
- puts "Connection Tally:"
- puts @connection_tally.to_s
- puts "###############################"
- end
- if @s
- puts "#{@t.target} Sitemap"
- puts "Page Count: #{@data.size}"
- elsif @fh
- puts "Target URL: #{@t.target}"
- puts "Filetype: #{@file_ext}"
- puts "File Count: #{@data.size}"
- elsif @seo
- puts "#{@t.target} SEO Metrics"
- puts "Page Count: #{@data.size}"
- else
- fail "ERROR - Cannot dump - Mode Not Found"
- end
- puts "###############################"
- @data.each do |line|
- puts line
- end
- puts "###############################"
- puts
- end
- def write #writes current data collection out to CSV in current directory
- if @output
- i = 0
- CSV.open("#{@output}.csv", "w") do |csv|
- if ((i == 0) && @seo)
- csv << ['URL','Page Title','Meta Description','H1','H2']
- i +=1
- end
- @data.each do |entry|
- csv << entry
- end
- end
- puts "###############################"
- puts "File Created: #{@output}.csv"
- puts "Object Count: #{@data.size}"
- puts "###############################"
- puts
- end
- end
- def async_crawl_and_collect() #iterates over the excisting @linkStack, running asyncGetWave on it until we reach the @maxPages value.
- while (@already_crawled.size < @maxPages)
- if @linkStack.empty?
- if @prgrss
- @progressbar.log("Can't find any more links. Site might be completely mapped.")
- else
- lg("Can't find any more links. Site might be completely mapped.")
- end
- break;
- end
- new_links_arr = self.asyncGetWave()
- next if (new_links_arr.nil? || new_links_arr.empty?)
- new_link_arr = new_links_arr-@linkStack #set operations to see are these in our previous visited pages arr?
- @linkStack.concat(new_links_arr).uniq!
- @data.concat(new_links_arr) if @s
- end
- @progressbar.finish if @prgrss #if we are done, let's make sure progress bar says we are done
- end
- def good_response?(resp, url) #returns true is resp is ok to continue process, false is we need to 'next' it
- return false if !resp
- if resp.response_header.redirection? #we got redirected
- loc = resp.response_header.location
- lg("#{url} Redirected to #{loc}")
- if t.host_re =~ loc #if being redirected to same host, let's add to linkstack
- @linkStack.push(loc) if !@already_crawled.include?(loc) #but only if we haven't already crawled it
- lg("--Added to linkStack for later")
- return false
- end
- lg("Redirection outside of target host. No - go. #{loc}")
- return false
- end
- if (!resp.response_header.successful?) #if webpage is not text/html, let's not continue and lets also make sure we dont re-queue it
- lg("UNSUCCESSFUL CONNECTION -- #{url}")
- @connection_tally[:error] += 1
- @connection_tally[:error_server] += 1 if resp.response_header.server_error?
- @connection_tally[:error_client] += 1 if resp.response_header.client_error?
- return false
- end
- if (!(resp.response_header['CONTENT_TYPE'].include?("text/html"))) #if webpage is not text/html, let's not continue and lets also make sure we dont re-queue it
- @already_crawled.insert(url)
- @linkStack.delete(url)
- lg("Page Not text/html -- #{url}")
- return false
- end
- @connection_tally[:success] += 1
- return true
- end
+ def errlog(msg)
+ fail "ERROR: #{msg}"
+ end

- def asyncGetWave() #send a new wave of GET requests, using current @linkStack
- new_stuff = []
- EM.synchrony do
- lenny = 0
- concurrency = 10
- EM::Synchrony::FiberIterator.new(@linkStack, concurrency).each do |url|
- next if (@already_crawled.size >= @maxPages)
- if @already_crawled.include?(url)
- @linkStack.delete(url)
- next
- end
- resp = EventMachine::HttpRequest.new(url).get
- next if !good_response?(resp,url)
- new_page = Retriever::Page.new(resp.response,@t)
- lg("Page Fetched: #{url}")
- @already_crawled.insert(url)
- if @prgrss
- @progressbar.increment if @already_crawled.size < @maxPages
- end
- if @seo
- seos = [url]
- seos.concat(new_page.parseSEO)
- @data.push(seos)
- lg("--page SEO scraped")
- end
- if new_page.links
- lg("--#{new_page.links.size} links found")
- internal_links_arr = new_page.parseInternalVisitable
- new_stuff.push(internal_links_arr)
- if @fh
- filez = new_page.parseFiles
- @data.concat(filez) if !filez.empty?
- lg("--#{filez.size} files found")
- end
- end
- end
- new_stuff = new_stuff.flatten # all completed requests
- EventMachine.stop
- end
- new_stuff.uniq!
- end
- end
- end
+ def lg(msg)
+ puts "### #{msg}" if @v
+ end
+
+ # prints current data collection to STDOUT
+ def dump
+ puts '###############################'
+ if @v
+ puts 'Connection Tally:'
+ puts @connection_tally.to_s
+ puts '###############################'
+ end
+ if @s
+ puts "#{@t.target} Sitemap"
+ puts "Page Count: #{@data.size}"
+ elsif @fh
+ puts "Target URL: #{@t.target}"
+ puts "Filetype: #{@file_ext}"
+ puts "File Count: #{@data.size}"
+ elsif @seo
+ puts "#{@t.target} SEO Metrics"
+ puts "Page Count: #{@data.size}"
+ else
+ fail 'ERROR - Cannot dump - Mode Not Found'
+ end
+ puts '###############################'
+ @data.each do |line|
+ puts line
+ end
+ puts '###############################'
+ puts
+ end
+
+ # writes current data collection out to CSV in current directory
+ def write
+ return false unless @output
+ i = 0
+ CSV.open("#{@output}.csv", 'w') do |csv|
+ if (i == 0) && @seo
+ csv << ['URL', 'Page Title', 'Meta Description', 'H1', 'H2']
+ i += 1
+ end
+ @data.each do |entry|
+ csv << entry
+ end
+ end
+ puts '###############################'
+ puts "File Created: #{@output}.csv"
+ puts "Object Count: #{@data.size}"
+ puts '###############################'
+ puts
+ end
+
+ # iterates over the existing @link_stack
+ # running until we reach the @max_pages value.
+ def async_crawl_and_collect
+ while @already_crawled.size < @max_pages
+ if @link_stack.empty?
+ if @prgrss
+ @progressbar.log("Can't find any more links.")
+ else
+ lg("Can't find any more links.")
+ end
+ break
+ end
+ new_links_arr = process_link_stack
+ next if new_links_arr.nil? || new_links_arr.empty?
+ # set operations to see are these in our previous visited pages arr
+ new_links_arr -= @link_stack
+ @link_stack.concat(new_links_arr).uniq!
+ @data.concat(new_links_arr) if @s
+ end
+ # done, make sure progress bar says we are done
+ @progressbar.finish if @prgrss
+ end
+
+ # returns true is resp is ok to continue
+ def good_response?(resp, url)
+ return false unless resp
+ hdr = resp.response_header
+ if hdr.redirection?
+ loc = hdr.location
+ lg("#{url} Redirected to #{loc}")
+ if t.host_re =~ loc
+ @link_stack.push(loc) unless @already_crawled.include?(loc)
+ lg('--Added to linkStack for later')
+ return false
+ end
+ lg("Redirection outside of target host. No - go. #{loc}")
+ return false
+ end
+ # lets not continue if unsuccessful connection
+ unless hdr.successful?
+ lg("UNSUCCESSFUL CONNECTION -- #{url}")
+
+ @connection_tally[:error] += 1
+ @connection_tally[:error_server] += 1 if hdr.server_error?
+ @connection_tally[:error_client] += 1 if hdr.client_error?
+ return false
+ end
+ # let's not continue if not text/html
+ unless hdr['CONTENT_TYPE'].include?('text/html')
+ @already_crawled.insert(url)
+ @link_stack.delete(url)
+ lg("Page Not text/html -- #{url}")
+ return false
+ end
+ @connection_tally[:success] += 1
+ true
+ end
+
+ # send a new wave of GET requests, using current @link_stack
+ def process_link_stack
+ new_stuff = []
+ EM.synchrony do
+ concurrency = 10
+ EM::Synchrony::FiberIterator.new(@link_stack, concurrency).each do |url|
+ next if @already_crawled.size >= @max_pages
+ next if @already_crawled.include?(url)
+
+ resp = EventMachine::HttpRequest.new(url).get
+
+ next unless good_response?(resp, url)
+ lg("Page Fetched: #{url}")
+ @already_crawled.insert(url)
+
+ new_page = Retriever::Page.new(resp.response, @t)
+ if @prgrss
+ @progressbar.increment if @already_crawled.size < @max_pages
+ end
+ if @seo
+ seos = [url]
+ seos.concat(new_page.parse_seo)
+ @data.push(seos)
+ lg('--page SEO scraped')
+ end
+ next if new_page.links.size == 0
+ lg("--#{new_page.links.size} links found")
+ internal_links_arr = new_page.parse_internal_visitable
+ new_stuff.push(internal_links_arr)
+ if @fh
+ filez = new_page.parse_files
+ @data.concat(filez) unless filez.empty?
+ lg("--#{filez.size} files found")
+ end
+ end
+ new_stuff = new_stuff.flatten # all completed requests
+ EventMachine.stop
+ end
+ new_stuff.uniq!
+ end
+ end
+ end
@@ -1,65 +1,70 @@
  module Retriever
- class FetchFiles < Fetch
- def initialize(url,options) #recieves target url and RR options, returns an array of all unique files (based on given filetype) found on the site
- super
- @data = []
- page_one = Retriever::Page.new(@t.source,@t)
- @linkStack = page_one.parseInternalVisitable
- lg("URL Crawled: #{@t.target}")
- lg("#{@linkStack.size-1} new links found")
+ # recieves target url and RR options
+ # returns an array of all unique files (based on given filetype)
+ # found on the target site
+ class FetchFiles < Fetch
+ def initialize(url, options)
+ super
+ @data = []
+ page_one = Retriever::Page.new(@t.source, @t)
+ @link_stack = page_one.parse_internal_visitable
+ lg("URL Crawled: #{@t.target}")
+ lg("#{@link_stack.size - 1} new links found")

- tempFileCollection = page_one.parseFiles
- @data.concat(tempFileCollection) if tempFileCollection.size>0
- lg("#{@data.size} new files found")
- errlog("Bad URL -- #{@t.target}") if !@linkStack
+ temp_file_collection = page_one.parse_files
+ @data.concat(tempFileCollection) if temp_file_collection.size > 0
+ lg("#{@data.size} new files found")
+ errlog("Bad URL -- #{@t.target}") unless @link_stack
+ @link_stack.delete(@t.target)

- @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
- @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
+ async_crawl_and_collect

- self.async_crawl_and_collect()
+ @data.sort_by! { |x| x.length }
+ @data.uniq!
+ end

- @data.sort_by! {|x| x.length}
- @data.uniq!
- end
- def download_file(path) #given valid url, downloads file to current directory in /rr-downloads/
- arr = path.split('/')
- shortname = arr.pop
- puts "Initiating Download to: #{'/rr-downloads/' + shortname}"
- File.open(shortname, "wb") do |saved_file|
- open(path) do |read_file|
- saved_file.write(read_file.read)
- end
- end
- puts " SUCCESS: Download Complete"
- end
- def autodownload() #when autodownload option is true, this will automatically go through the fetched file URL collection and download each one.
- lenny = @data.count
- puts "###################"
- puts "### Initiating Autodownload..."
- puts "###################"
- puts "#{lenny} - #{@file_ext}'s Located"
- puts "###################"
- if File::directory?("rr-downloads")
- Dir.chdir("rr-downloads")
- else
- puts "creating rr-downloads Directory"
- Dir.mkdir("rr-downloads")
- Dir.chdir("rr-downloads")
- end
- file_counter = 0
- @data.each do |entry|
- begin
- self.download_file(entry)
- file_counter+=1
- lg(" File [#{file_counter} of #{lenny}]")
- puts
- rescue StandardError => e
- puts "ERROR: failed to download - #{entry}"
- puts e.message
- puts
- end
- end
- Dir.chdir("..")
- end
- end
- end
+ def download_file(path)
+ # given valid url, downloads file to current directory in /rr-downloads/
+ arr = path.split('/')
+ shortname = arr.pop
+ puts "Initiating Download to: '/rr-downloads/' + #{shortname}"
+ File.open(shortname, 'wb') do |saved_file|
+ open(path) do |read_file|
+ saved_file.write(read_file.read)
+ end
+ end
+ puts ' SUCCESS: Download Complete'
+ end
+
+ def autodownload
+ # go through the fetched file URL collection and download each one.
+ lenny = @data.count
+ puts '###################'
+ puts '### Initiating Autodownload...'
+ puts '###################'
+ puts "#{lenny} - #{@file_ext}'s Located"
+ puts '###################'
+ if File.directory?('rr-downloads')
+ Dir.chdir('rr-downloads')
+ else
+ puts 'creating rr-downloads Directory'
+ Dir.mkdir('rr-downloads')
+ Dir.chdir('rr-downloads')
+ end
+ file_counter = 0
+ @data.each do |entry|
+ begin
+ download_file(entry)
+ file_counter += 1
+ lg(' File [#{file_counter} of #{lenny}]')
+ puts
+ rescue StandardError => e
+ puts 'ERROR: failed to download - #{entry}'
+ puts e.message
+ puts
+ end
+ end
+ Dir.chdir('..')
+ end
+ end
+ end
@@ -1,23 +1,25 @@
  module Retriever
- class FetchSEO < Fetch
- def initialize(url,options) #recieves target url and RR options, returns an array of onpage SEO related fields on all unique pages found on the site
- super
- @data = []
- page_one = Retriever::Page.new(@t.source,@t)
- @linkStack = page_one.parseInternalVisitable
- lg("URL Crawled: #{@t.target}")
- lg("#{@linkStack.size-1} new links found")
+ #
+ class FetchSEO < Fetch
+ # recieves target url and RR options
+ # returns an array of onpage SEO related fields
+ # on all unique pages found on the site
+ def initialize(url, options)
+ super
+ @data = []
+ page_one = Retriever::Page.new(@t.source, @t)
+ lg("URL Crawled: #{@t.target}")

- @data.push(page_one.parseSEO)
- lg("#{@data.size} pages scraped")
- errlog("Bad URL -- #{@t.target}") if !@linkStack
+ @link_stack = page_one.parse_internal_visitable
+ errlog("Bad URL -- #{@t.target}") unless @link_stack
+ lg("#{@link_stack.size - 1} links found")
+ @link_stack.delete(@t.target)

- @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
- @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
+ @data.push(page_one.parse_seo)

- self.async_crawl_and_collect()
+ async_crawl_and_collect

- @data.sort_by! {|x| x[0].length}
- end
- end
- end
+ @data.sort_by! { |x| x[0].length }
+ end
+ end
+ end
@@ -1,36 +1,41 @@
  module Retriever
- class FetchSitemap < Fetch
- def initialize(url,options) #recieves target URL and RR options, returns an array of all unique pages found on the site
- super
- @data = [@t.target]
- page_one = Retriever::Page.new(@t.source,@t)
- @linkStack = page_one.parseInternalVisitable
- lg("URL Crawled: #{@t.target}")
- lg("#{@linkStack.size-1} new links found")
- errlog("Bad URL -- #{@t.target}") if !@linkStack
+ #
+ class FetchSitemap < Fetch
+ # recieves target URL and RR options
+ # returns an array of all unique pages found on the site
+ def initialize(url, options)
+ super
+ @data = [@t.target]
+ page_one = Retriever::Page.new(@t.source, @t)
+ lg("URL Crawled: #{@t.target}")
+ @link_stack = page_one.parse_internal_visitable
+ errlog("Bad URL -- #{@t.target}") unless @link_stack
+ lg("#{@link_stack.size - 1} links found")

- @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
- @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
- @data.concat(@linkStack)
+ @link_stack.delete(@t.target)
+ @data.concat(@link_stack)

- self.async_crawl_and_collect()
+ async_crawl_and_collect

- @data.sort_by! {|x| x.length} if @data.size>1
- @data.uniq!
- end
- def gen_xml #produces valid XML sitemap based on page collection fetched. Writes to current directory.
- f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
- f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
- @data.each do |url|
- f << "<url><loc>#{url}</loc></url>"
- end
- f << "</urlset>"
- f.close
- puts "###############################"
- puts "File Created: sitemap-#{@t.host.split('.')[1]}.xml"
- puts "Object Count: #{@data.size}"
- puts "###############################"
- puts
- end
- end
- end
+ @data.sort_by! { |x| x.length } if @data.size > 1
+ @data.uniq!
+ end
+
+ # produces valid XML sitemap based on page collection fetched.
+ # Writes to current directory.
+ def gen_xml
+ f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
+ f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
+ @data.each do |url|
+ f << "<url><loc>#{url}</loc></url>"
+ end
+ f << '</urlset>'
+ f.close
+ puts '###############################'
+ puts "File Created: sitemap-#{@t.host.split('.')[1]}.xml"
+ puts "Object Count: #{@data.size}"
+ puts '###############################'
+ puts
+ end
+ end
+ end
@@ -1,4 +1,5 @@
  module Retriever
+ #
  class Link
  HTTP_RE = Regexp.new(/^http/i).freeze
  SINGLE_SLASH_RE = Regexp.new(/^\/{1}[^\/]/).freeze
@@ -18,12 +19,15 @@ module Retriever

  return "http://#{host}#{link}" if SINGLE_SLASH_RE =~ link

- return "http:#{link}" if DOUBLE_SLASH_RE =~ link #link begins with '//' (maybe a messed up link?)
+ # link begins with '//'
+ return "http:#{link}" if DOUBLE_SLASH_RE =~ link

- return "http://#{host}/#{link}" if NO_SLASH_PAGE_RE =~ link #link uses relative path with no slashes at all, people actually this - imagine that.
+ # link uses relative path with no slashes at all
+ return "http://#{host}/#{link}" if NO_SLASH_PAGE_RE =~ link
  end

  private
+
  attr_reader :host, :link
  end
  end
@@ -1,6 +1,7 @@
  module OpenURI
- def OpenURI.redirectable?(uri1, uri2) #nesc patch otherwise OPENURI blocks redirects to and from https
+ # nesc patch otherwise OPENURI blocks redirects to and from https
+ def OpenURI.redirectable?(uri1, uri2)
  uri1.scheme.downcase == uri2.scheme.downcase ||
  (/\A(?:http|https)\z/i =~ uri1.scheme && /\A(?:http|https)\z/i =~ uri2.scheme)
  end
- end
+ end
@@ -1,7 +1,6 @@
  module Retriever
-
+ #
  class Page
-
  HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
  NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico|xml)/ix).freeze
  HTTP_RE = Regexp.new(/^http/i).freeze
@@ -14,55 +13,55 @@ module Retriever

  attr_reader :links, :source, :t

- def initialize(source,t)
+ def initialize(source, t)
  @t = t
  @source = source.encode('UTF-8', :invalid => :replace, :undef => :replace)
  @links = nil
  end

- #recieves page source as string
- #returns array of unique href links
+ # recieves page source as string
+ # returns array of unique href links
  def links
  return @links if @links
- return false if !@source
- @links = @source.scan(HREF_CONTENTS_RE).map do |match| #filter some malformed URLS that come in, this is meant to be a loose filter to catch all reasonable HREF attributes.
+ return false unless @source
+ @links = @source.scan(HREF_CONTENTS_RE).map do |match|
+ # filter some malformed URLS that come in
+ # meant to be a loose filter to catch all reasonable HREF attributes.
  link = match[0]
  Link.new(@t.host, link).path
  end.uniq
  end

- def parseInternal
- links.select{ |linky| (@t.host_re =~ linky) }
+ def parse_internal
+ links.select { |linky| (@t.host_re =~ linky) }
  end

- def parseInternalVisitable
- parseInternal.select{ |linky| (!(NONPAGE_EXT_RE =~linky)) }
+ def parse_internal_visitable
+ parse_internal.select { |linky| (!(NONPAGE_EXT_RE =~ linky)) }
  end

- def parseFiles
- links.select{ |linky| (@t.file_re =~ linky)}
+ def parse_files
+ links.select { |linky| (@t.file_re =~ linky) }
  end

  def title
- TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : ""
+ TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : ''
  end

  def desc
- DESC_RE =~ @source ? @source.match(DESC_RE)[1] : ""
+ DESC_RE =~ @source ? @source.match(DESC_RE)[1] : ''
  end

  def h1
- H1_RE =~ @source ? @source.match(H1_RE)[1] : ""
+ H1_RE =~ @source ? @source.match(H1_RE)[1] : ''
  end

  def h2
- H2_RE =~ @source ? @source.match(H2_RE)[1] : ""
+ H2_RE =~ @source ? @source.match(H2_RE)[1] : ''
  end

- def parseSEO
- return [title,desc,h1,h2]
+ def parse_seo
+ [title, desc, h1, h2]
  end
-
  end
-
  end
@@ -1,52 +1,44 @@
  require 'open-uri'

  module Retriever
-
+ #
  class Target
-
  HTTP_RE = Regexp.new(/^http/i).freeze
  DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
-
+
  attr_reader :host, :target, :host_re, :source, :file_re

- def initialize(url,file_re=nil)
- url = "http://#{url}" if (!(HTTP_RE =~ url))
- fail "Bad URL" if (!(/\./ =~ url))
+ def initialize(url, file_re = nil)
+ url = "http://#{url}" unless HTTP_RE =~ url
+ fail 'Bad URL' unless /\./ =~ url
  new_uri = URI(url)
  @target = new_uri.to_s
  @host = new_uri.host
- @host_re = Regexp.new(@host.sub('www.',''))
+ @host_re = Regexp.new(@host.sub('www.', ''))
  @file_re ||= file_re
  end

  def source
- resp = false
- begin
- resp = open(@target)
- rescue StandardError => e
- trap("ABRT"){
- puts "#{@target} failed SSL Certification Verification"
- }
- return false
- end
+ resp = open(@target)
  resp_url = resp.base_uri.to_s
- if (@target != resp_url)
- if @host_re =~ resp_url #if redirect URL is same hose, we want to re-sync our target with the right URL
- new_t = Retriever::Target.new(resp_url)
- @target = new_t.target
- @host = new_t.host
- return new_t.source
- end
- fail "Domain redirecting to new host: #{resp.base_uri.to_s}" #if it's not same host, we want to fail
+ if @target != resp_url
+ fail "Domain redirecting: #{resp_url}" unless @host_re =~ resp_url
+ # if redirect URL is same host, we want to re-sync @target
+ return resync_target_and_return_source(resp_url)
  end
  resp = resp.read
- if resp == ""
- fail "Domain is not working. Try the non-WWW version."
- end
- fail "Domain not working. Try HTTPS???" if !resp
- return resp.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace) #consider using scrub from ruby 2.1? this misses some things
+ #
+ fail 'Domain is not working. Try the non-WWW version.' if resp == ''
+ fail 'Domain not working. Try HTTPS???' unless resp
+ # consider using scrub from ruby 2.1? this misses some things
+ resp.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace)
  end

+ def resync_target_and_return_source(url)
+ new_t = Retriever::Target.new(url)
+ @target = new_t.target
+ @host = new_t.host
+ new_t.source
+ end
  end
-
  end
@@ -1,3 +1,3 @@
  module Retriever
- VERSION = '1.0.3'
+ VERSION = '1.1.0'
  end
data/lib/retriever.rb CHANGED
@@ -8,6 +8,6 @@ require 'retriever/target'
  require 'retriever/page'
  require 'retriever/openuri-redirect-patch'

+ #
  module Retriever
-
- end
+ end
data/readme.md CHANGED
@@ -4,13 +4,14 @@

  By Joe Norton

- RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader, and all around nice buddy to have around.
+ RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader.

- RubyRetriever uses aynchronous HTTP requests, thanks to eventmachine and Synchrony fibers, to crawl webpages *very quickly*.
+ RubyRetriever (RR) uses asynchronous HTTP requests, thanks to [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony), to crawl webpages *very quickly*. Another neat thing about RR, is it uses a ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of page's it has already crawled.

- RubyRetriever does NOT respect robots.txt, and RubyRetriever currently - by default - launches up to 10 parallel GET requests at once. This is a feature, do not abuse it. Use at own risk.
+ **Use at Own Risk**
+ RR does NOT respect robots.txt, and RR currently - by default - launches up to 10 parallel GET requests at once. This is a feature, do not abuse it.

- v1.0 Update 6/07/2014 - Includes major code changes, a lot of bug fixes. Much better in dealing with redirects, and issues with the host changing, etc. Also, added the SEO mode -- which grabs a number of key SEO components from every page on a site. Lastly, this upate was so extensive that I could not ensure backward compatibility -- and thus, this was update 1.0!
+ **v1.0 Update (6/07/2014)** - Includes major code changes, a lot of bug fixes. Much better in dealing with redirects, and issues with the host changing, etc. Also, added the SEO mode -- which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility -- and thus, this was update 1.0!


  getting started
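
As the updated readme notes, RR pairs EventMachine/Synchrony concurrency with a Bloom filter so pages are not fetched twice. A minimal sketch of that dedup pattern, reusing the same bloomfilter-rb constructor options that appear in the Fetch diff above (the URLs are placeholders):

    require 'bloomfilter-rb'

    # Same options as Fetch#initialize in 1.1.0; membership checks can
    # (rarely) report false positives, trading a little accuracy for memory.
    already_crawled = BloomFilter::Native.new(
      :size   => 1_000_000,
      :hashes => 5,
      :seed   => 1,
      :bucket => 8,
      :raise  => false
    )

    already_crawled.insert('http://www.example.com/')         # seed URL marked as seen
    already_crawled.include?('http://www.example.com/')       # => true, skip refetching
    already_crawled.include?('http://www.example.com/about')  # => false, safe to queue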
data/spec/page_spec.rb CHANGED
@@ -20,8 +20,8 @@ SOURCE
  end
  end

- describe "#parseInternal" do
- let (:links){Retriever::Page.new(@source,t).parseInternal}
+ describe "#parse_internal" do
+ let (:links){Retriever::Page.new(@source,t).parse_internal}
  it "filters links by host" do
  @source = (<<SOURCE).strip
  <a href='http://www.cnet.com/'>download</a>
@@ -32,8 +32,8 @@ SOURCE
  end
  end

- describe "#parseInternalVisitable" do
- let (:links){Retriever::Page.new(@source,t).parseInternalVisitable}
+ describe "#parse_internal_visitable" do
+ let (:links){Retriever::Page.new(@source,t).parse_internal_visitable}
  it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
  @source = (<<SOURCE).strip
  <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
@@ -43,7 +43,7 @@ SOURCE
  end

  describe "#parseFiles" do
- let (:links){Retriever::Page.new(@source,t).parseFiles}
+ let (:links){Retriever::Page.new(@source,t).parse_files}
  it "filters links by filetype" do
  @source = (<<SOURCE).strip
  <a href='www.cnet.com/download.exe'>download</a>
@@ -90,5 +90,4 @@ SOURCE
  expect(page.h2).to eq(' test 4 ')
  end
  end
-
- end
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: rubyretriever
  version: !ruby/object:Gem::Version
- version: 1.0.3
+ version: 1.1.0
  platform: ruby
  authors:
  - Joe Norton
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2014-06-08 00:00:00.000000000 Z
+ date: 2014-06-10 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: em-synchrony