rubyretriever 1.1.0 → 1.2.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 02c2b2530b3b83eb12325443c3c9214d977d8a56
4
- data.tar.gz: f1a8b163ae3c3caed750eacb7724c2c38693ccd2
3
+ metadata.gz: 79f0b251e367f085f7b84dd83a10f6a1dfcddd3c
4
+ data.tar.gz: 0e9b6bc8f66b9efd14921d8f0fc5fddc45042b5f
5
5
  SHA512:
6
- metadata.gz: 68c56e76fff7cee17b2e48251413df199cbe46df84bb8b51f0333510d9507627f59714bd7c4fb4a502796cefa76d362b9dbe912ea1a4d419b356f0de403e606a
7
- data.tar.gz: 2f149643ba65999c783bf17bbcbffc92648eec3979f610df2fdd101c55a5a58607c2087f47911d65457d41567f5f574b79ad0a25cfac82e61c7255c1170e3e64
6
+ metadata.gz: fe1a6c8e118378513c4a4e72adeccc94e212fd5e0d4244f56240830155e42f7b7bde80acdfabbc9fe9ee5b46687bc33d089d73c57690beadee3d04804a9435ac
7
+ data.tar.gz: 4e6bec31d3416293f2fb72b39cfab5602692a2b9d94cbfb6fba9a653afc6bf718e5eebf84b4a5745d01d2e55959df4e489342b0deca2060e2653bb5b31f4731e
data/bin/rr CHANGED
@@ -1,57 +1,58 @@
1
1
  #! /usr/bin/env ruby
2
-
3
2
  require 'retriever'
4
3
  require 'optparse'
5
4
 
6
5
  options = {}
7
- optparse = OptionParser.new do |opts|
8
- # Set a banner, displayed at the top
9
- # of the help screen.
10
- opts.banner = 'Usage: rr [MODE FLAG] [options] Target_URL'
11
- options['sitemap'] = false
12
- opts.on('-s', '--sitemap [FORMAT]', 'MODE FLAG: Sitemap mode') do |output_type|
13
- options['sitemap'] = output_type || ''
14
- end
15
- options['fileharvest'] = false
16
- opts.on('-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode') do |file_ext|
17
- options['fileharvest'] = file_ext
18
- end
19
- options['seo'] = false
20
- opts.on('-e', '--seo', 'MODE FLAG: SEO mode') do
21
- options['seo'] = true
22
- end
23
- options['filename'] = nil
24
- opts.on('-o', '--out FILENAME', 'Dump output to selected filename') do |filename|
25
- options['filename'] = filename
26
- end
27
- # Define the options, and what they do
28
- options['verbose'] = false
29
- opts.on('-v', '--verbose', 'Output more information') do
30
- options['verbose'] = true
31
- end
32
- options['progress'] = false
33
- opts.on('-p', '--progress', 'Output progress bar') do
34
- options['progress'] = true
35
- end
36
- options['maxpages'] = false
37
- opts.on('-l', '--limit PAGE_LIMIT_#', 'set a max on the total number of crawled pages') do |maxpages|
38
- options[:maxpages] = maxpages
39
- end
40
- options['autodown'] = false
41
- opts.on('-a', '--auto', 'Automatically download all files of filetype located') do
42
- options[:autodown] = true
43
- end
44
- # This displays the help screen, all programs are
45
- # assumed to have this option.
46
- opts.on('-h', '--help', 'Display this screen') do
47
- puts opts
48
- exit
49
- end
50
- end
6
+ optparse = OptionParser.new do |opts|
7
+ # Set a banner, displayed at the top
8
+ # of the help screen.
9
+ opts.banner = 'Usage: rr [MODE FLAG] [options] Target_URL'
10
+ options['sitemap'] = false
11
+ opts.on('-s', '--sitemap [FORMAT]', 'MODE FLAG: Sitemap mode') do |type|
12
+ options['sitemap'] = type || ''
13
+ end
14
+ options['fileharvest'] = false
15
+ opts.on('-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode') do |file_e|
16
+ options['fileharvest'] = file_e
17
+ end
18
+ options['seo'] = false
19
+ opts.on('-e', '--seo', 'MODE FLAG: SEO mode') do
20
+ options['seo'] = true
21
+ end
22
+ options['filename'] = nil
23
+ opts.on('-o', '--out FILENAME', 'Dump output to file') do |file|
24
+ options['filename'] = file
25
+ end
26
+ # Define the options, and what they do
27
+ options['verbose'] = false
28
+ opts.on('-v', '--verbose', 'Output more information') do
29
+ options['verbose'] = true
30
+ end
31
+ options['progress'] = false
32
+ opts.on('-p', '--progress', 'Output progress bar') do
33
+ options['progress'] = true
34
+ end
35
+ options['maxpages'] = false
36
+ opts.on('-l',
37
+ '--limit PAGE_LIMIT_#',
38
+ 'set a max on the total number of crawled pages') do |maxp|
39
+ options['maxpages'] = maxp
40
+ end
41
+ options['autodown'] = false
42
+ opts.on('-a', '--auto', 'Automatically download all files located') do
43
+ options['autodown'] = true
44
+ end
45
+ # This displays the help screen, all programs are
46
+ # assumed to have this option.
47
+ opts.on('-h', '--help', 'Display this screen') do
48
+ puts opts
49
+ exit
50
+ end
51
+ end
51
52
 
52
53
  optparse.parse!
53
54
  if ARGV[0].nil?
54
- abort('###Missing Required Argument\nUsage: rr [mode] [options] Target_URL')
55
+ abort("###Missing Required Argument\nUsage: rr [mode] [options] Target_URL")
55
56
  end
56
57
 
57
58
  ARGV.each do|q|
@@ -61,9 +62,11 @@ ARGV.each do|q|
61
62
  puts '### Creating Sitemap' if options['sitemap']
62
63
  puts "### Outputting in format: #{options['sitemap']}" if options['sitemap']
63
64
  puts '### Performing File Harvest' if options['fileharvest']
64
- puts "### Searching for file extension: #{options['fileharvest']} pages" if options['fileharvest']
65
+ if options['fileharvest']
66
+ puts "### Searching for filetype: #{options['fileharvest']}"
67
+ end
65
68
  puts '### Performing SEO Scrape' if options['seo']
66
- puts "### Writing output to filename: #{options['filename']}" if options['filename']
69
+ puts "### Writing to file: #{options['filename']}" if options['filename']
67
70
  puts '### Being verbose'
68
71
  puts "### Stopping after #{options['maxpages']} pages"
69
72
  end
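
For reference, a minimal sketch of the options hash the parser above builds for an invocation like `rr --sitemap xml --progress --limit 500 http://www.example.com` (the URL is a placeholder, and the hand-off to Retriever::CLI is assumed from cli.rb below):

    # Hypothetical parse result; keys are the string keys defined in bin/rr above.
    options = {
      'sitemap'     => 'xml',   # -s/--sitemap FORMAT
      'fileharvest' => false,   # -f/--files not given
      'seo'         => false,   # -e/--seo not given
      'filename'    => nil,     # -o/--out not given
      'verbose'     => false,
      'progress'    => true,    # -p/--progress
      'maxpages'    => '500',   # -l/--limit (OptionParser yields the value as a String)
      'autodown'    => false
    }
    Retriever::CLI.new('http://www.example.com', options) # kicks off a live sitemap crawl
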
data/lib/retriever/cli.rb CHANGED
@@ -3,19 +3,23 @@ module Retriever
3
3
  class CLI
4
4
  def initialize(url, options)
5
5
  # kick off the fetch mode of choice
6
+ @fetch = choose_fetch_mode(url, options)
7
+ @fetch.dump
8
+ @fetch.write if options['filename']
9
+ @fetch.autodownload if options['autodown'] && options['fileharvest']
10
+ @fetch.gen_xml if /XML/i =~ options['sitemap'].to_s
11
+ end
12
+
13
+ def choose_fetch_mode(url, options)
6
14
  if options['fileharvest']
7
- @fetch = Retriever::FetchFiles.new(url, options)
15
+ Retriever::FetchFiles.new(url, options)
8
16
  elsif options['sitemap']
9
- @fetch = Retriever::FetchSitemap.new(url, options)
17
+ Retriever::FetchSitemap.new(url, options)
10
18
  elsif options['seo']
11
- @fetch = Retriever::FetchSEO.new(url, options)
19
+ Retriever::FetchSEO.new(url, options)
12
20
  else
13
21
  fail '### Error: No Mode Selected'
14
22
  end
15
- @fetch.dump
16
- @fetch.write if options['filename']
17
- @fetch.autodownload if options['autodown'] && options['fileharvest']
18
- @fetch.gen_xml if /XML/i =~ options['sitemap'].to_s
19
23
  end
20
24
  end
21
25
  end
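
A rough illustration of the new choose_fetch_mode dispatch, one call per mode flag; the URL and option values are hypothetical, and each of the first three calls starts a live crawl of the given target:

    require 'retriever'

    url = 'http://www.example.com'
    Retriever::CLI.new(url, 'fileharvest' => 'pdf') # delegates to Retriever::FetchFiles
    Retriever::CLI.new(url, 'sitemap' => 'xml')     # delegates to Retriever::FetchSitemap
    Retriever::CLI.new(url, 'seo' => true)          # delegates to Retriever::FetchSEO
    Retriever::CLI.new(url, {})                     # raises '### Error: No Mode Selected'
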
@@ -9,56 +9,27 @@ require 'bloomfilter-rb'
9
9
  module Retriever
10
10
  #
11
11
  class Fetch
12
+ HR = '###############################'
12
13
  attr_reader :max_pages, :t
13
14
  # given target URL and RR options, creates a fetch object.
14
15
  # There is no direct output
15
16
  # this is a parent class that the other fetch classes build off of.
16
17
  def initialize(url, options)
18
+ @data = []
17
19
  @connection_tally = {
18
- :success => 0,
19
- :error => 0,
20
- :error_client => 0,
21
- :error_server => 0
20
+ success: 0,
21
+ error: 0,
22
+ error_client: 0,
23
+ error_server: 0
22
24
  }
23
- # OPTIONS
24
- @prgrss = options['progress']
25
- @max_pages = options['maxpages'] ? options['maxpages'].to_i : 100
26
- @v = options['verbose']
27
- @output = options['filename']
28
- @fh = options['fileharvest']
29
- @file_ext = @fh.to_s
30
- @s = options['sitemap']
31
- @seo = options['seo']
32
- @autodown = options['autodown']
33
- #
34
- if @fh
35
- temp_ext_str = '.' + @file_ext + '\z'
36
- @file_re = Regexp.new(temp_ext_str).freeze
37
- else
38
- # when FH is not true, and autodown is true
39
- errlog('Cannot AUTODOWNLOAD when not in FILEHARVEST MODE') if @autodown
40
- end
41
- if @prgrss
42
- # verbose & progressbar conflict
43
- errlog('CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p') if @v
44
- prgress_vars = {
45
- :title => 'Pages',
46
- :starting_at => 1,
47
- :total => @max_pages,
48
- :format => '%a |%b>%i| %c/%C %t'
49
- }
50
- @progressbar = ProgressBar.create(prgress_vars)
51
- end
25
+ setup_options(options)
26
+ setup_progress_bar if @progress
52
27
  @t = Retriever::Target.new(url, @file_re)
53
- @output = "rr-#{@t.host.split('.')[1]}" if @fh && !@output
54
- @already_crawled = BloomFilter::Native.new(
55
- :size => 1_000_000,
56
- :hashes => 5,
57
- :seed => 1,
58
- :bucket => 8,
59
- :raise => false
60
- )
61
- @already_crawled.insert(@t.target)
28
+ @output = "rr-#{@t.host.split('.')[1]}" if @fileharvest && !@output
29
+ @already_crawled = setup_bloom_filter
30
+ @page_one = crawl_page_one
31
+ @link_stack = create_link_stack
32
+ @temp_link_stack = []
62
33
  end
63
34
 
64
35
  def errlog(msg)
@@ -66,35 +37,26 @@ module Retriever
66
37
  end
67
38
 
68
39
  def lg(msg)
69
- puts "### #{msg}" if @v
40
+ puts "### #{msg}" if @verbose
70
41
  end
71
42
 
72
43
  # prints current data collection to STDOUT
73
44
  def dump
74
- puts '###############################'
75
- if @v
76
- puts 'Connection Tally:'
77
- puts @connection_tally.to_s
78
- puts '###############################'
79
- end
80
- if @s
81
- puts "#{@t.target} Sitemap"
82
- puts "Page Count: #{@data.size}"
83
- elsif @fh
84
- puts "Target URL: #{@t.target}"
85
- puts "Filetype: #{@file_ext}"
86
- puts "File Count: #{@data.size}"
45
+ puts HR
46
+ puts "Connection Tally:\n#{@connection_tally}\n#{HR}" if @verbose
47
+ puts "Target URL: #{@t.target}"
48
+ if @sitemap
49
+ puts 'Sitemap'
50
+ elsif @fileharvest
51
+ puts "File harvest by type: #{@fileharvest}"
87
52
  elsif @seo
88
- puts "#{@t.target} SEO Metrics"
89
- puts "Page Count: #{@data.size}"
90
- else
91
- fail 'ERROR - Cannot dump - Mode Not Found'
53
+ puts 'SEO Metrics'
92
54
  end
93
- puts '###############################'
55
+ puts "Data Dump -- Object Count: #{@data.size}"
56
+ puts HR
94
57
  @data.each do |line|
95
58
  puts line
96
59
  end
97
- puts '###############################'
98
60
  puts
99
61
  end
100
62
 
@@ -111,34 +73,90 @@ module Retriever
111
73
  csv << entry
112
74
  end
113
75
  end
114
- puts '###############################'
76
+ puts HR
115
77
  puts "File Created: #{@output}.csv"
116
78
  puts "Object Count: #{@data.size}"
117
- puts '###############################'
79
+ puts HR
118
80
  puts
119
81
  end
120
82
 
83
+ private
84
+
85
+ def setup_options(options)
86
+ @progress = options['progress']
87
+ @max_pages = options['maxpages'] ? options['maxpages'].to_i : 100
88
+ @verbose = options['verbose']
89
+ @output = options['filename']
90
+ @fileharvest = options['fileharvest']
91
+ @sitemap = options['sitemap']
92
+ @seo = options['seo']
93
+ @autodown = options['autodown']
94
+ @file_re = Regexp.new(".#{@fileharvest}\z").freeze if @fileharvest
95
+ end
96
+
97
+ def setup_bloom_filter
98
+ already_crawled = BloomFilter::Native.new(
99
+ size: 1_000_000,
100
+ hashes: 5,
101
+ seed: 1,
102
+ bucket: 8,
103
+ raise: false
104
+ )
105
+ already_crawled.insert(@t.target)
106
+ already_crawled
107
+ end
108
+
109
+ def setup_progress_bar
110
+ # verbose & progressbar conflict
111
+ errlog('CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME') if @verbose
112
+ prgress_vars = {
113
+ title: 'Pages',
114
+ starting_at: 1,
115
+ total: @max_pages,
116
+ format: '%a |%b>%i| %c/%C %t'
117
+ }
118
+ @progressbar = ProgressBar.create(prgress_vars)
119
+ end
120
+
121
+ def crawl_page_one
122
+ page_one = Retriever::Page.new(@t.source, @t)
123
+ lg("URL Crawled: #{@t.target}")
124
+ page_one
125
+ end
126
+
127
+ def create_link_stack
128
+ link_stack = @page_one.parse_internal_visitable
129
+ errlog("Bad URL -- #{@t.target}") unless link_stack
130
+ lg("#{link_stack.size - 1} links found")
131
+ link_stack.delete(@t.target)
132
+ link_stack.take(@max_pages) if (link_stack.size + 1) > @max_pages
133
+ link_stack
134
+ end
135
+
136
+ def end_crawl_notice
137
+ notice = "#{HR}\nENDING CRAWL\nCan't find any more links."
138
+ @progressbar.log(notice) if @progress
139
+ lg(notice)
140
+ end
141
+
121
142
  # iterates over the existing @link_stack
122
143
  # running until we reach the @max_pages value.
123
144
  def async_crawl_and_collect
124
145
  while @already_crawled.size < @max_pages
125
146
  if @link_stack.empty?
126
- if @prgrss
127
- @progressbar.log("Can't find any more links.")
128
- else
129
- lg("Can't find any more links.")
130
- end
147
+ end_crawl_notice
131
148
  break
132
149
  end
133
150
  new_links_arr = process_link_stack
151
+ @temp_link_stack = []
134
152
  next if new_links_arr.nil? || new_links_arr.empty?
135
153
  # set operations to see are these in our previous visited pages arr
136
- new_links_arr -= @link_stack
137
- @link_stack.concat(new_links_arr).uniq!
138
- @data.concat(new_links_arr) if @s
154
+ next if new_links_arr.empty?
155
+ @link_stack.concat(new_links_arr)
156
+ next unless @sitemap
157
+ @data.concat(new_links_arr)
139
158
  end
140
- # done, make sure progress bar says we are done
141
- @progressbar.finish if @prgrss
159
+ @data.uniq!
142
160
  end
143
161
 
144
162
  # returns true is resp is ok to continue
@@ -149,8 +167,8 @@ module Retriever
149
167
  loc = hdr.location
150
168
  lg("#{url} Redirected to #{loc}")
151
169
  if t.host_re =~ loc
152
- @link_stack.push(loc) unless @already_crawled.include?(loc)
153
- lg('--Added to linkStack for later')
170
+ @temp_link_stack.push(loc) unless @already_crawled.include?(loc)
171
+ lg('--Added to stack for later')
154
172
  return false
155
173
  end
156
174
  lg("Redirection outside of target host. No - go. #{loc}")
@@ -159,7 +177,6 @@ module Retriever
159
177
  # lets not continue if unsuccessful connection
160
178
  unless hdr.successful?
161
179
  lg("UNSUCCESSFUL CONNECTION -- #{url}")
162
-
163
180
  @connection_tally[:error] += 1
164
181
  @connection_tally[:error_server] += 1 if hdr.server_error?
165
182
  @connection_tally[:error_client] += 1 if hdr.client_error?
@@ -168,7 +185,6 @@ module Retriever
168
185
  # let's not continue if not text/html
169
186
  unless hdr['CONTENT_TYPE'].include?('text/html')
170
187
  @already_crawled.insert(url)
171
- @link_stack.delete(url)
172
188
  lg("Page Not text/html -- #{url}")
173
189
  return false
174
190
  end
@@ -176,45 +192,58 @@ module Retriever
176
192
  true
177
193
  end
178
194
 
195
+ def push_seo_to_data(url, new_page)
196
+ seos = [url]
197
+ seos.concat(new_page.parse_seo)
198
+ @data.push(seos)
199
+ lg('--page SEO scraped')
200
+ end
201
+
202
+ def push_files_to_data(new_page)
203
+ filez = new_page.parse_files(new_page.parse_internal)
204
+ @data.concat(filez) unless filez.empty?
205
+ lg("--#{filez.size} files found")
206
+ end
207
+
208
+ def page_from_response(url, response)
209
+ lg("Page Fetched: #{url}")
210
+ @already_crawled.insert(url)
211
+ if @progress && (@already_crawled.size < @max_pages)
212
+ @progressbar.increment
213
+ end
214
+ Retriever::Page.new(response, @t)
215
+ end
216
+
217
+ def new_visitable_links(current_page)
218
+ lg("--#{current_page.links.size} links found")
219
+ current_page.parse_internal_visitable
220
+ end
221
+
179
222
  # send a new wave of GET requests, using current @link_stack
223
+ # at end of the loop it empties link_stack
224
+ # puts new links into temporary stack
180
225
  def process_link_stack
181
- new_stuff = []
182
226
  EM.synchrony do
183
227
  concurrency = 10
184
228
  EM::Synchrony::FiberIterator.new(@link_stack, concurrency).each do |url|
185
229
  next if @already_crawled.size >= @max_pages
186
230
  next if @already_crawled.include?(url)
187
-
188
231
  resp = EventMachine::HttpRequest.new(url).get
189
-
190
232
  next unless good_response?(resp, url)
191
- lg("Page Fetched: #{url}")
192
- @already_crawled.insert(url)
193
-
194
- new_page = Retriever::Page.new(resp.response, @t)
195
- if @prgrss
196
- @progressbar.increment if @already_crawled.size < @max_pages
197
- end
198
- if @seo
199
- seos = [url]
200
- seos.concat(new_page.parse_seo)
201
- @data.push(seos)
202
- lg('--page SEO scraped')
203
- end
204
- next if new_page.links.size == 0
205
- lg("--#{new_page.links.size} links found")
206
- internal_links_arr = new_page.parse_internal_visitable
207
- new_stuff.push(internal_links_arr)
208
- if @fh
209
- filez = new_page.parse_files
210
- @data.concat(filez) unless filez.empty?
211
- lg("--#{filez.size} files found")
212
- end
233
+ current_page = page_from_response(url, resp.response)
234
+ # non-link dependent modes
235
+ push_seo_to_data(url, current_page) if @seo
236
+ next unless current_page.links.size > 0
237
+ @temp_link_stack.push(new_visitable_links(current_page))
238
+ # link dependent modes
239
+ next unless @fileharvest
240
+ push_files_to_data(current_page)
213
241
  end
214
- new_stuff = new_stuff.flatten # all completed requests
215
242
  EventMachine.stop
216
243
  end
217
- new_stuff.uniq!
244
+ # empty the stack; cleanest way to reset it between crawl waves
245
+ @link_stack = []
246
+ @temp_link_stack.flatten.uniq!
218
247
  end
219
248
  end
220
249
  end
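
The crawl loop above avoids re-fetching pages by recording every visited URL in a Bloom filter. A standalone sketch of that check, using the same BloomFilter::Native parameters as setup_bloom_filter (the URLs are placeholders):

    require 'bloomfilter-rb'

    already_crawled = BloomFilter::Native.new(
      size: 1_000_000, hashes: 5, seed: 1, bucket: 8, raise: false
    )
    already_crawled.insert('http://www.example.com/')

    ['http://www.example.com/', 'http://www.example.com/about'].each do |url|
      next if already_crawled.include?(url) # already fetched -- skip it
      already_crawled.insert(url)           # record the URL, then fetch the page
    end
    already_crawled.size # approximate crawled-page count, compared against @max_pages
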
@@ -5,29 +5,21 @@ module Retriever
5
5
  class FetchFiles < Fetch
6
6
  def initialize(url, options)
7
7
  super
8
- @data = []
9
- page_one = Retriever::Page.new(@t.source, @t)
10
- @link_stack = page_one.parse_internal_visitable
11
- lg("URL Crawled: #{@t.target}")
12
- lg("#{@link_stack.size - 1} new links found")
13
-
14
- temp_file_collection = page_one.parse_files
8
+ temp_file_collection = @page_one.parse_files(@page_one.parse_internal)
15
9
  @data.concat(temp_file_collection) if temp_file_collection.size > 0
16
10
  lg("#{@data.size} new files found")
17
- errlog("Bad URL -- #{@t.target}") unless @link_stack
18
- @link_stack.delete(@t.target)
19
11
 
20
12
  async_crawl_and_collect
21
-
13
+ # done, make sure progress bar says we are done
14
+ @progressbar.finish if @progress
22
15
  @data.sort_by! { |x| x.length }
23
- @data.uniq!
24
16
  end
25
17
 
26
18
  def download_file(path)
27
19
  # given valid url, downloads file to current directory in /rr-downloads/
28
20
  arr = path.split('/')
29
21
  shortname = arr.pop
30
- puts "Initiating Download to: '/rr-downloads/' + #{shortname}"
22
+ puts "Initiating Download of: #{shortname}"
31
23
  File.open(shortname, 'wb') do |saved_file|
32
24
  open(path) do |read_file|
33
25
  saved_file.write(read_file.read)
@@ -38,33 +30,39 @@ module Retriever
38
30
 
39
31
  def autodownload
40
32
  # go through the fetched file URL collection and download each one.
41
- lenny = @data.count
42
- puts '###################'
33
+ puts HR
43
34
  puts '### Initiating Autodownload...'
44
- puts '###################'
45
- puts "#{lenny} - #{@file_ext}'s Located"
46
- puts '###################'
47
- if File.directory?('rr-downloads')
48
- Dir.chdir('rr-downloads')
49
- else
50
- puts 'creating rr-downloads Directory'
51
- Dir.mkdir('rr-downloads')
52
- Dir.chdir('rr-downloads')
53
- end
54
- file_counter = 0
55
- @data.each do |entry|
35
+ puts HR
36
+ puts "#{@data.count} - #{@file_ext}'s Located"
37
+ puts HR
38
+ move_to_download_dir
39
+ iterate_thru_collection_and_download
40
+ Dir.chdir('..')
41
+ end
42
+
43
+ private
44
+
45
+ def iterate_thru_collection_and_download
46
+ lenn = @data.count
47
+ @data.each_with_index do |entry, i|
56
48
  begin
57
49
  download_file(entry)
58
- file_counter += 1
59
- lg(' File [#{file_counter} of #{lenny}]')
60
- puts
61
- rescue StandardError => e
62
- puts 'ERROR: failed to download - #{entry}'
63
- puts e.message
64
- puts
50
+ rescue StandardError
51
+ puts "ERROR: failed to download - #{entry}"
65
52
  end
53
+ lg(" File [#{i + 1} of #{lenn}]\n")
66
54
  end
67
- Dir.chdir('..')
55
+ end
56
+
57
+ def move_to_download_dir(dir_name = 'rr-downloads')
58
+ if File.directory?(dir_name)
59
+ Dir.chdir(dir_name)
60
+ else
61
+ puts "creating #{dir_name} Directory"
62
+ Dir.mkdir(dir_name)
63
+ Dir.chdir(dir_name)
64
+ end
65
+ puts "Downloading files to local directory: '/#{dir_name}/'"
68
66
  end
69
67
  end
70
68
  end
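
FetchFiles#download_file saves each harvested URL into the rr-downloads directory using open-uri. A minimal standalone sketch of that write pattern (the file URL is a placeholder, and running it performs a real download):

    require 'open-uri'

    path      = 'http://www.example.com/files/report.pdf' # hypothetical harvested file URL
    shortname = path.split('/').pop
    File.open(shortname, 'wb') do |saved_file|
      open(path) { |read_file| saved_file.write(read_file.read) } # Kernel#open via open-uri
    end
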
@@ -6,19 +6,11 @@ module Retriever
6
6
  # on all unique pages found on the site
7
7
  def initialize(url, options)
8
8
  super
9
- @data = []
10
- page_one = Retriever::Page.new(@t.source, @t)
11
- lg("URL Crawled: #{@t.target}")
12
-
13
- @link_stack = page_one.parse_internal_visitable
14
- errlog("Bad URL -- #{@t.target}") unless @link_stack
15
- lg("#{@link_stack.size - 1} links found")
16
- @link_stack.delete(@t.target)
17
-
18
- @data.push(page_one.parse_seo)
9
+ @data.push(@page_one.parse_seo)
19
10
 
20
11
  async_crawl_and_collect
21
-
12
+ # done, make sure progress bar says we are done
13
+ @progressbar.finish if @progress
22
14
  @data.sort_by! { |x| x[0].length }
23
15
  end
24
16
  end
@@ -5,37 +5,38 @@ module Retriever
5
5
  # returns an array of all unique pages found on the site
6
6
  def initialize(url, options)
7
7
  super
8
- @data = [@t.target]
9
- page_one = Retriever::Page.new(@t.source, @t)
10
- lg("URL Crawled: #{@t.target}")
11
- @link_stack = page_one.parse_internal_visitable
12
- errlog("Bad URL -- #{@t.target}") unless @link_stack
13
- lg("#{@link_stack.size - 1} links found")
14
-
15
- @link_stack.delete(@t.target)
8
+ @data.push(@t.target)
16
9
  @data.concat(@link_stack)
17
10
 
18
11
  async_crawl_and_collect
19
-
12
+ # done, make sure progress bar says we are done
13
+ @progressbar.finish if @progress
20
14
  @data.sort_by! { |x| x.length } if @data.size > 1
21
15
  @data.uniq!
22
16
  end
23
17
 
18
+ private
19
+
24
20
  # produces valid XML sitemap based on page collection fetched.
25
21
  # Writes to current directory.
26
22
  def gen_xml
27
- f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
28
- f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
29
- @data.each do |url|
30
- f << "<url><loc>#{url}</loc></url>"
31
- end
23
+ filename = @t.host.split('.')[1]
24
+ f = File.open("sitemap-#{filename}.xml", 'w+')
25
+ f << "<?xml version='1.0' encoding='UTF-8'?>"
26
+ f << "<urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
27
+ @data.each do |url|
28
+ f << "<url><loc>#{url}</loc></url>"
29
+ end
32
30
  f << '</urlset>'
33
31
  f.close
34
- puts '###############################'
35
- puts "File Created: sitemap-#{@t.host.split('.')[1]}.xml"
32
+ print_file_info(filename)
33
+ end
34
+
35
+ def print_file_info(filename)
36
+ puts HR
37
+ puts "File Created: sitemap-#{filename}.xml"
36
38
  puts "Object Count: #{@data.size}"
37
- puts '###############################'
38
- puts
39
+ puts HR + "\n"
39
40
  end
40
41
  end
41
42
  end
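
For reference, the kind of file gen_xml produces for a two-page crawl of a hypothetical www.example.com (it would be written as sitemap-example.xml; line breaks are added here for readability, the method emits the markup as one continuous string):

    <?xml version='1.0' encoding='UTF-8'?>
    <urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>
      <url><loc>http://www.example.com/</loc></url>
      <url><loc>http://www.example.com/about</loc></url>
    </urlset>
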
@@ -1,33 +1,35 @@
1
+ require 'addressable/uri'
1
2
  module Retriever
2
3
  #
3
4
  class Link
4
- HTTP_RE = Regexp.new(/^http/i).freeze
5
- SINGLE_SLASH_RE = Regexp.new(/^\/{1}[^\/]/).freeze
6
- DOUBLE_SLASH_RE = Regexp.new(/^\/{2}[^\/]/).freeze
7
- NO_SLASH_PAGE_RE = Regexp.new(/^[a-z0-9\-\_\=\?\.]+\z/ix).freeze
8
- DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
9
-
10
- def initialize(host, link)
11
- @host = host
12
- @link = link
5
+ # HTTP_RE = Regexp.new(/^http/i).freeze
6
+ SLASH_RE = Regexp.new(%r(^/{1}[^/])).freeze
7
+ DOUBLE_SLASH_RE = Regexp.new(%r(^/{2}[^/])).freeze
8
+ WWW_DOT_RE = Regexp.new(/^www\./i).freeze
9
+
10
+ def initialize(target_scheme, target_host, this_link)
11
+ @link_uri = Addressable::URI.parse(this_link)
12
+ @scheme = target_scheme
13
+ @host = target_host
14
+ @this_link = @link_uri.to_s
13
15
  end
14
16
 
15
17
  def path
16
- return link if HTTP_RE =~ link
18
+ return this_link if link_uri.absolute?
17
19
 
18
- return "http://#{link}" if DUB_DUB_DUB_DOT_RE =~ link
20
+ return "#{@scheme}://#{this_link}" if WWW_DOT_RE =~ this_link
19
21
 
20
- return "http://#{host}#{link}" if SINGLE_SLASH_RE =~ link
22
+ return "#{@scheme}://#{host}#{this_link}" if SLASH_RE =~ this_link
21
23
 
22
24
  # link begins with '//'
23
- return "http:#{link}" if DOUBLE_SLASH_RE =~ link
25
+ return "#{@scheme}:#{this_link}" if DOUBLE_SLASH_RE =~ this_link
24
26
 
25
27
  # link uses relative path with no slashes at all
26
- return "http://#{host}/#{link}" if NO_SLASH_PAGE_RE =~ link
28
+ return "#{@scheme}://#{host}/#{this_link}" if link_uri.relative?
27
29
  end
28
30
 
29
31
  private
30
32
 
31
- attr_reader :host, :link
33
+ attr_reader :this_link, :host, :link_uri
32
34
  end
33
35
  end
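
Link#path now normalizes each href against the target's scheme and host via Addressable. A few hedged examples, one per branch above; the host and most expected results mirror the specs, while the protocol-relative CDN URL is made up:

    require 'retriever'

    Retriever::Link.new('http', 'www.cnet.com', 'http://www.cnet.com/download.exe').path
    # => "http://www.cnet.com/download.exe"  (already absolute)
    Retriever::Link.new('http', 'www.cnet.com', 'www.cnet.com/download.exe').path
    # => "http://www.cnet.com/download.exe"  (WWW_DOT_RE adds the scheme)
    Retriever::Link.new('http', 'www.cnet.com', '/test.html').path
    # => "http://www.cnet.com/test.html"     (SLASH_RE prepends scheme and host)
    Retriever::Link.new('http', 'www.cnet.com', '//cdn.cnet.com/lib.js').path
    # => "http://cdn.cnet.com/lib.js"        (DOUBLE_SLASH_RE prepends the scheme only)
    Retriever::Link.new('http', 'www.cnet.com', 'cpage_18').path
    # => "http://www.cnet.com/cpage_18"      (bare relative path)
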
@@ -1,6 +1,7 @@
1
+ #
1
2
  module OpenURI
2
3
  # necessary patch; otherwise OpenURI blocks redirects to and from https
3
- def OpenURI.redirectable?(uri1, uri2)
4
+ def self.redirectable?(uri1, uri2)
4
5
  uri1.scheme.downcase == uri2.scheme.downcase ||
5
6
  (/\A(?:http|https)\z/i =~ uri1.scheme && /\A(?:http|https)\z/i =~ uri2.scheme)
6
7
  end
@@ -1,21 +1,40 @@
1
+ require 'addressable/uri'
2
+
1
3
  module Retriever
2
4
  #
3
5
  class Page
4
- HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
5
- NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico|xml)/ix).freeze
6
- HTTP_RE = Regexp.new(/^http/i).freeze
7
- DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
8
-
9
- TITLE_RE = Regexp.new(/<title>(.*)<\/title>/i).freeze
10
- DESC_RE = Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\'][^>]*content=[\"]([^\"]*)[\"][^>]*>/i).freeze
11
- H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
12
- H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze
6
+ HTTP_RE = Regexp.new(/^http/i).freeze
7
+ H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
8
+ H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze
9
+ TITLE_RE = Regexp.new(/<title>(.*)<\/title>/i).freeze
10
+ DESC_RE = Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\']
11
+ [^>]*content=[\"]
12
+ (
13
+ [^\"]*
14
+ )
15
+ [\"]
16
+ [^>]
17
+ *>
18
+ /ix).freeze
19
+ HREF_CONTENTS_RE = Regexp.new(/\shref=
20
+ ['|"]
21
+ (
22
+ [^\s]
23
+ [a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+
24
+ )
25
+ ['|"]
26
+ [\s|\W]
27
+ /ix).freeze
28
+ NONPAGE_EXT_RE = Regexp.new(/\.
29
+ (?:css|js|png|gif|jpg|mp4|
30
+ wmv|flv|mp3|wav|doc|txt|ico|xml)
31
+ /ix).freeze
13
32
 
14
33
  attr_reader :links, :source, :t
15
34
 
16
35
  def initialize(source, t)
17
36
  @t = t
18
- @source = source.encode('UTF-8', :invalid => :replace, :undef => :replace)
37
+ @source = source.encode('UTF-8', invalid: :replace, undef: :replace)
19
38
  @links = nil
20
39
  end
21
40
 
@@ -28,20 +47,20 @@ module Retriever
28
47
  # filter some malformed URLS that come in
29
48
  # meant to be a loose filter to catch all reasonable HREF attributes.
30
49
  link = match[0]
31
- Link.new(@t.host, link).path
32
- end.uniq
50
+ Link.new(@t.scheme, @t.host, link).path
51
+ end.compact.uniq
33
52
  end
34
53
 
35
54
  def parse_internal
36
- links.select { |linky| (@t.host_re =~ linky) }
55
+ links.select { |x| @t.host == Addressable::URI.parse(x).host }
37
56
  end
38
57
 
39
58
  def parse_internal_visitable
40
- parse_internal.select { |linky| (!(NONPAGE_EXT_RE =~ linky)) }
59
+ parse_internal.select { |x| !(NONPAGE_EXT_RE =~ x) }
41
60
  end
42
61
 
43
- def parse_files
44
- links.select { |linky| (@t.file_re =~ linky) }
62
+ def parse_files(arr)
63
+ arr.select { |x| @t.file_re =~ x }
45
64
  end
46
65
 
47
66
  def title
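
Putting the Page parsers together, a short sketch that mirrors the specs further down; the HTML snippet is made up, and the target and file regex match the ones used in the specs:

    require 'retriever'

    t      = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)
    source = "<title>Reviews</title> " \
             "<a href='/test.html'>test</a> " \
             "<a href='www.cnet.com/download.exe'>download</a> "
    page   = Retriever::Page.new(source, t)

    page.links                            # every href, normalized through Link#path
    page.parse_internal                   # only links whose host matches the target
    page.parse_internal_visitable         # internal links minus css/js/images, etc.
    page.parse_files(page.parse_internal) # internal links matching file_re (.exe here)
    page.title                            # => "Reviews"
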
@@ -1,21 +1,22 @@
1
1
  require 'open-uri'
2
+ require 'addressable/uri'
2
3
 
3
4
  module Retriever
4
5
  #
5
6
  class Target
6
- HTTP_RE = Regexp.new(/^http/i).freeze
7
- DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
7
+ HTTP_RE = Regexp.new(/^http/i).freeze
8
8
 
9
- attr_reader :host, :target, :host_re, :source, :file_re
9
+ attr_reader :host, :target, :host_re, :source, :file_re, :scheme
10
10
 
11
11
  def initialize(url, file_re = nil)
12
- url = "http://#{url}" unless HTTP_RE =~ url
13
- fail 'Bad URL' unless /\./ =~ url
14
- new_uri = URI(url)
15
- @target = new_uri.to_s
16
- @host = new_uri.host
17
- @host_re = Regexp.new(@host.sub('www.', ''))
18
- @file_re ||= file_re
12
+ fail 'Bad URL' unless url.include?('.')
13
+ url = "http://#{url}" unless HTTP_RE =~ url
14
+ target_uri = Addressable::URI.parse(url)
15
+ @target = target_uri.to_s
16
+ @host = target_uri.host
17
+ @host_re = Regexp.new(@host.sub('www.', ''))
18
+ @file_re ||= file_re
19
+ @scheme = target_uri.scheme
19
20
  end
20
21
 
21
22
  def source
@@ -31,13 +32,14 @@ module Retriever
31
32
  fail 'Domain is not working. Try the non-WWW version.' if resp == ''
32
33
  fail 'Domain not working. Try HTTPS???' unless resp
33
34
  # consider using scrub from ruby 2.1? this misses some things
34
- resp.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace)
35
+ resp.encode('UTF-8', 'binary', invalid: :replace, undef: :replace)
35
36
  end
36
37
 
37
38
  def resync_target_and_return_source(url)
38
- new_t = Retriever::Target.new(url)
39
+ new_t = Retriever::Target.new(url)
39
40
  @target = new_t.target
40
- @host = new_t.host
41
+ @host = new_t.host
42
+ @scheme = new_t.scheme
41
43
  new_t.source
42
44
  end
43
45
  end
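
What a Target exposes after the Addressable refactor; the values follow the specs further down, and the missing-protocol case shows the scheme default:

    require 'retriever'

    t = Retriever::Target.new('cnet.com', /\.exe\z/)
    t.target  # => "http://cnet.com"  (protocol added when missing)
    t.host    # => "cnet.com"
    t.scheme  # => "http"             (newly exposed in this release)
    t.host_re # => /cnet.com/
    t.file_re # => /\.exe\z/
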
@@ -1,3 +1,4 @@
1
+ #
1
2
  module Retriever
2
- VERSION = '1.1.0'
3
- end
3
+ VERSION = '1.2.0'
4
+ end
data/lib/retriever.rb CHANGED
@@ -6,7 +6,7 @@ require 'retriever/cli'
6
6
  require 'retriever/link'
7
7
  require 'retriever/target'
8
8
  require 'retriever/page'
9
- require 'retriever/openuri-redirect-patch'
9
+ require 'retriever/openuri_redirect_patch'
10
10
 
11
11
  #
12
12
  module Retriever
data/readme.md CHANGED
@@ -4,15 +4,29 @@
4
4
 
5
5
  By Joe Norton
6
6
 
7
- RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader.
7
+ RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader.
8
8
 
9
- RubyRetriever (RR) uses asynchronous HTTP requests, thanks to [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony), to crawl webpages *very quickly*. Another neat thing about RR, is it uses a ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of page's it has already crawled.
10
-
11
- **Use at Own Risk**
12
- RR does NOT respect robots.txt, and RR currently - by default - launches up to 10 parallel GET requests at once. This is a feature, do not abuse it.
9
+ RubyRetriever (RR) uses asynchronous HTTP requests, thanks to [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony), to crawl webpages *very quickly*. Another neat thing about RR is that it uses a Ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) to keep track of pages it has already crawled.
13
10
 
14
11
  **v1.0 Update (6/07/2014)** - Includes major code changes, a lot of bug fixes. Much better in dealing with redirects, and issues with the host changing, etc. Also, added the SEO mode -- which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility -- and thus, this was update 1.0!
12
+ mission
13
+ -------
14
+ RubyRetriever aims to be the best command-line crawling and scraping package written in Ruby.
15
+
16
+ features
17
+ --------
18
+ * Asynchronous HTTP Requests thru EM & Synchrony
19
+ * Bloom filter for tracking pages visited.
20
+ * 3 CLI modes: 1) Sitemap, 2) File Harvest, 3) SEO
21
+
22
+ use-cases
23
+ ---------
24
+ RubyRetriever can do multiple things for you. With a single command at the terminal, RR can:
25
+ 1. Crawl your website and output a *valid XML sitemap* based on what it found.
26
+ 2. Crawl a target website and *download all files of a given filetype*.
27
+ 3. Crawl a target website and *collect important SEO information* such as page titles, meta descriptions, h1 tags, etc. and write it to CSV.
15
28
 
29
+ Help & Forks Welcome!
16
30
 
17
31
  getting started
18
32
  -----------
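
The three use-cases above map onto the mode flags defined in bin/rr. A few example invocations, with www.example.com as a placeholder target:

    rr --sitemap xml --progress http://www.example.com            # 1) crawl and write an XML sitemap
    rr --files pdf --auto --progress http://www.example.com       # 2) harvest and auto-download PDF files
    rr --seo --out example_seo --progress http://www.example.com  # 3) SEO scrape written to example_seo.csv
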
data/spec/link_spec.rb CHANGED
@@ -1,66 +1,68 @@
1
1
  require 'retriever'
2
2
 
3
- describe "Link" do
3
+ describe 'Link' do
4
4
 
5
- t = Retriever::Target.new("http://www.cnet.com/reviews/")
6
- let(:links) { Retriever::Page.new(@source,t).links }
5
+ t = Retriever::Target.new('http://www.cnet.com/reviews/')
6
+ let(:links) { Retriever::Page.new(@source, t).links }
7
7
 
8
- it "collects links in anchor tags" do
9
- @source = (<<SOURCE).strip
10
- <a href='http://www.cnet.com/download.exe'>download</a>
8
+ it 'collects links in anchor tags' do
9
+ @source = (<<SOURCE).strip
10
+ <a href='http://www.cnet.com/download.exe'>download</a>
11
11
  SOURCE
12
12
 
13
- expect(links).to include('http://www.cnet.com/download.exe')
14
- end
13
+ expect(links).to include('http://www.cnet.com/download.exe')
14
+ end
15
15
 
16
- it "collects links in link tags" do
17
- @source = (<<SOURCE).strip
18
- <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
16
+ it 'collects links in link tags' do
17
+ @source = (<<SOURCE).strip
18
+ <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
19
19
  SOURCE
20
20
 
21
- expect(links).to include('http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12')
22
- end
21
+ expect(links[0]).to include('formreset.css?ver=1.7.12')
22
+ end
23
23
 
24
- it "does not collect bare links (ones not in an href)" do
25
- @source = (<<SOURCE).strip
24
+ it 'does not collect bare links (ones not in an href)' do
25
+ @source = (<<SOURCE).strip
26
26
  http://www.google.com
27
27
  SOURCE
28
28
 
29
- expect(links).to_not include('http://www.google.com')
30
- end
29
+ expect(links).to_not include('http://www.google.com')
30
+ end
31
31
 
32
- it "collects only unique href links on the page" do
33
- @source = (<<SOURCE).strip
32
+ it 'collects only unique href links on the page' do
33
+ @source = (<<SOURCE).strip
34
34
  <a href='http://www.cnet.com/products/gadgets'>gadgets</a>
35
35
  <a href='http://www.cnet.com/products/gadgets'>gadgets2</a>
36
36
  SOURCE
37
37
 
38
- expect(links.size).to eq(1)
39
- end
38
+ expect(links.size).to eq(1)
39
+ end
40
40
 
41
- it "adds a protocol to urls missing them (www.)" do
42
- @source = (<<SOURCE).strip
41
+ it 'adds a protocol to urls missing them (www.)' do
42
+ @source = (<<SOURCE).strip
43
43
  <a href='www.cnet.com/download.exe'>download</a>
44
44
  SOURCE
45
45
 
46
- expect(links).to include('http://www.cnet.com/download.exe')
47
- end
46
+ expect(links).to include('http://www.cnet.com/download.exe')
47
+ end
48
48
 
49
- it "doesn't care about any extra attributes on the anchor tag" do
50
- @source = (<<SOURCE).strip
49
+ it "doesn't care about any extra attributes on the anchor tag" do
50
+ @source = (<<SOURCE).strip
51
51
  <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
52
- <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'></a>
52
+ <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
53
+ </a>
53
54
  SOURCE
54
55
 
55
- expect(links.size).to eq(1)
56
- end
56
+ expect(links.size).to eq(1)
57
+ end
57
58
 
58
- it "returns relative urls with full path based on hostname" do
59
- @source = (<<SOURCE).strip
59
+ it 'returns relative urls with full path based on hostname' do
60
+ @source = (<<SOURCE).strip
60
61
  <a href='/test.html'>test</a>
61
62
  <a href='cpage_18'>about</a>
62
63
  SOURCE
63
64
 
64
- expect(links).to include("http://www.cnet.com/test.html","http://www.cnet.com/cpage_18")
65
- end
66
- end
65
+ expect(links).to include('http://www.cnet.com/test.html',
66
+ 'http://www.cnet.com/cpage_18')
67
+ end
68
+ end
data/spec/page_spec.rb CHANGED
@@ -1,93 +1,97 @@
1
1
  require 'retriever/page'
2
2
  require 'retriever/fetch'
3
3
 
4
- t = Retriever::Target.new("http://www.cnet.com/reviews/",/\.exe\z/)
4
+ t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)
5
5
 
6
- describe "Page" do
6
+ describe 'Page' do
7
7
 
8
- describe "#links" do
9
- let (:links){Retriever::Page.new(@source,t).links}
10
- it "collects all unique href links on the page" do
11
- @source = (<<SOURCE).strip
8
+ describe '#links' do
9
+ let(:links) { Retriever::Page.new(@source, t).links }
10
+ it 'collects all unique href links on the page' do
11
+ @source = (<<SOURCE).strip
12
12
  <a href='www.cnet.com/download.exe'>download</a>
13
13
  <a href='/test.html'>test</a>
14
- <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'></a>
14
+ <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
15
+ </a>
15
16
  <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
16
- <a href='http://www.yahoo.com/test/'>yahoo</a>
17
+ <a href='http://www.yahoo.com/test/'>yahoo</a>
17
18
  SOURCE
18
19
 
19
20
  expect(links.size).to eq(4)
20
21
  end
21
22
  end
22
23
 
23
- describe "#parse_internal" do
24
- let (:links){Retriever::Page.new(@source,t).parse_internal}
25
- it "filters links by host" do
26
- @source = (<<SOURCE).strip
24
+ describe '#parse_internal' do
25
+ let(:page) { Retriever::Page.new(@source, t) }
26
+ let(:links) { page.parse_internal }
27
+ it 'filters links by host' do
28
+ @source = (<<SOURCE).strip
27
29
  <a href='http://www.cnet.com/'>download</a>
28
- <a href='http://www.yahoo.com/test/'>yahoo</a>
30
+ <a href='http://www.yahoo.com/test/'>yahoo</a>
29
31
  SOURCE
30
32
 
31
- expect(links.size).to eq(1)
33
+ expect(links.size).to eq(1)
32
34
  end
33
35
  end
34
36
 
35
- describe "#parse_internal_visitable" do
36
- let (:links){Retriever::Page.new(@source,t).parse_internal_visitable}
37
+ describe '#parse_internal_visitable' do
38
+ let(:page) { Retriever::Page.new(@source, t) }
39
+ let(:links) { page.parse_internal_visitable }
37
40
  it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
38
- @source = (<<SOURCE).strip
41
+ @source = (<<SOURCE).strip
39
42
  <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
40
43
  SOURCE
41
- expect(links.size).to eq(0)
44
+ expect(links.size).to eq(0)
42
45
  end
43
46
  end
44
47
 
45
- describe "#parseFiles" do
46
- let (:links){Retriever::Page.new(@source,t).parse_files}
47
- it "filters links by filetype" do
48
- @source = (<<SOURCE).strip
48
+ describe '#parse_files' do
49
+ let(:page) { Retriever::Page.new(@source, t) }
50
+ let(:files) { page.parse_files(page.parse_internal) }
51
+ it 'filters links by filetype' do
52
+ @source = (<<SOURCE).strip
49
53
  <a href='www.cnet.com/download.exe'>download</a>
50
- http://www.google.com
54
+ http://www.google.com
51
55
  <a href='/test.html'>test</a>
52
56
  SOURCE
53
- expect(links.size).to eq(1)
57
+ expect(files.size).to eq(1)
54
58
  end
55
59
  end
56
60
 
57
- describe "#title" do
58
- let (:page){Retriever::Page.new(@source,t)}
59
- it "returns page title" do
60
- @source = (<<SOURCE).strip
61
+ describe '#title' do
62
+ let(:page) { Retriever::Page.new(@source, t) }
63
+ it 'returns page title' do
64
+ @source = (<<SOURCE).strip
61
65
  <title>test</title>
62
66
  SOURCE
63
- expect(page.title).to eq('test')
67
+ expect(page.title).to eq('test')
64
68
  end
65
69
  end
66
- describe "#desc" do
67
- let (:page){Retriever::Page.new(@source,t)}
68
- it "returns meta description" do
69
- @source = (<<SOURCE).strip
70
+ describe '#desc' do
71
+ let(:page) { Retriever::Page.new(@source, t) }
72
+ it 'returns meta description' do
73
+ @source = (<<SOURCE).strip
70
74
  <meta name='description' content="test2 ">
71
75
  SOURCE
72
- expect(page.desc).to eq('test2 ')
76
+ expect(page.desc).to eq('test2 ')
73
77
  end
74
78
  end
75
- describe "#h1" do
76
- let (:page){Retriever::Page.new(@source,t)}
77
- it "returns h1 text" do
78
- @source = (<<SOURCE).strip
79
+ describe '#h1' do
80
+ let(:page) { Retriever::Page.new(@source, t) }
81
+ it 'returns h1 text' do
82
+ @source = (<<SOURCE).strip
79
83
  <h1>test 3</h1>
80
84
  SOURCE
81
- expect(page.h1).to eq('test 3')
85
+ expect(page.h1).to eq('test 3')
82
86
  end
83
87
  end
84
- describe "#h2" do
85
- let (:page){Retriever::Page.new(@source,t)}
86
- it "returns h2 text" do
87
- @source = (<<SOURCE).strip
88
+ describe '#h2' do
89
+ let(:page) { Retriever::Page.new(@source, t) }
90
+ it 'returns h2 text' do
91
+ @source = (<<SOURCE).strip
88
92
  <h2> test 4 </h2>
89
93
  SOURCE
90
- expect(page.h2).to eq(' test 4 ')
94
+ expect(page.h2).to eq(' test 4 ')
91
95
  end
92
96
  end
93
97
  end
@@ -1,5 +1,4 @@
1
1
  require 'retriever'
2
2
 
3
- describe "Fetch" do
4
-
5
- end
3
+ describe 'Fetch' do
4
+ end
data/spec/target_spec.rb CHANGED
@@ -1,44 +1,44 @@
1
1
  require 'retriever'
2
2
  require 'open-uri'
3
3
 
4
- t = Retriever::Target.new("http://www.cnet.com/reviews/",/\.exe\z/)
4
+ t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)
5
5
 
6
- describe "Target" do
6
+ describe 'Target' do
7
7
 
8
- it "creates target var" do
9
- expect(t.target).to eq("http://www.cnet.com/reviews/")
10
- end
8
+ it 'creates target var' do
9
+ expect(t.target).to eq('http://www.cnet.com/reviews/')
10
+ end
11
11
 
12
- it "creates host var" do
13
- expect(t.host).to eq("www.cnet.com")
14
- end
12
+ it 'creates host var' do
13
+ expect(t.host).to eq('www.cnet.com')
14
+ end
15
15
 
16
- it "creates host_re var" do
17
- expect(t.host_re).to eq(/cnet.com/)
18
- end
16
+ it 'creates host_re var' do
17
+ expect(t.host_re).to eq(/cnet.com/)
18
+ end
19
19
 
20
- it "creates file_re var (when provided)" do
21
- expect(t.file_re).to eq(/\.exe\z/)
22
- end
20
+ it 'creates file_re var (when provided)' do
21
+ expect(t.file_re).to eq(/\.exe\z/)
22
+ end
23
23
 
24
- it "adds protocol to Target URL if none given" do
25
- expect(Retriever::Target.new("cnet.com").target).to eq("http://cnet.com")
26
- end
24
+ it 'adds protocol to Target URL if none given' do
25
+ expect(Retriever::Target.new('cnet.com').target).to eq('http://cnet.com')
26
+ end
27
27
 
28
- it "fails if given URL has no dot in it" do
29
- expect{Retriever::Target.new("cnetcom")}.to raise_error
30
- end
28
+ it 'fails if given URL has no dot in it' do
29
+ expect { Retriever::Target.new('cnetcom') }.to raise_error
30
+ end
31
31
 
32
- describe "#source" do
32
+ describe '#source' do
33
33
 
34
- it "opens URL and returns source as String" do
35
- expect(Retriever::Target.new("http://techcrunch.com/").source.class).to eq(String)
34
+ it 'opens URL and returns source as String' do
35
+ expect(Retriever::Target.new('http://techcrunch.com/').source.class)
36
+ .to eq(String)
36
37
  end
37
38
 
38
- it "fails if target redirects to new host" do
39
- expect{Retriever::Target.new("http://tinyurl.com/nkfkypa").source}.to raise_error
39
+ it 'fails if target redirects to new host' do
40
+ expect { Retriever::Target.new('http://tinyurl.com/nkfkypa').source }
41
+ .to raise_error
40
42
  end
41
-
42
43
  end
43
-
44
- end
44
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rubyretriever
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Joe Norton
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - '>='
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: addressable
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: bundler
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -125,7 +139,7 @@ files:
125
139
  - lib/retriever/fetchseo.rb
126
140
  - lib/retriever/fetchsitemap.rb
127
141
  - lib/retriever/link.rb
128
- - lib/retriever/openuri-redirect-patch.rb
142
+ - lib/retriever/openuri_redirect_patch.rb
129
143
  - lib/retriever/page.rb
130
144
  - lib/retriever/target.rb
131
145
  - lib/retriever/version.rb