rubyretriever 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 02c2b2530b3b83eb12325443c3c9214d977d8a56
4
- data.tar.gz: f1a8b163ae3c3caed750eacb7724c2c38693ccd2
3
+ metadata.gz: 79f0b251e367f085f7b84dd83a10f6a1dfcddd3c
4
+ data.tar.gz: 0e9b6bc8f66b9efd14921d8f0fc5fddc45042b5f
5
5
  SHA512:
6
- metadata.gz: 68c56e76fff7cee17b2e48251413df199cbe46df84bb8b51f0333510d9507627f59714bd7c4fb4a502796cefa76d362b9dbe912ea1a4d419b356f0de403e606a
7
- data.tar.gz: 2f149643ba65999c783bf17bbcbffc92648eec3979f610df2fdd101c55a5a58607c2087f47911d65457d41567f5f574b79ad0a25cfac82e61c7255c1170e3e64
6
+ metadata.gz: fe1a6c8e118378513c4a4e72adeccc94e212fd5e0d4244f56240830155e42f7b7bde80acdfabbc9fe9ee5b46687bc33d089d73c57690beadee3d04804a9435ac
7
+ data.tar.gz: 4e6bec31d3416293f2fb72b39cfab5602692a2b9d94cbfb6fba9a653afc6bf718e5eebf84b4a5745d01d2e55959df4e489342b0deca2060e2653bb5b31f4731e
data/bin/rr CHANGED
@@ -1,57 +1,58 @@
1
1
  #! /usr/bin/env ruby
2
-
3
2
  require 'retriever'
4
3
  require 'optparse'
5
4
 
6
5
  options = {}
7
- optparse = OptionParser.new do |opts|
8
- # Set a banner, displayed at the top
9
- # of the help screen.
10
- opts.banner = 'Usage: rr [MODE FLAG] [options] Target_URL'
11
- options['sitemap'] = false
12
- opts.on('-s', '--sitemap [FORMAT]', 'MODE FLAG: Sitemap mode') do |output_type|
13
- options['sitemap'] = output_type || ''
14
- end
15
- options['fileharvest'] = false
16
- opts.on('-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode') do |file_ext|
17
- options['fileharvest'] = file_ext
18
- end
19
- options['seo'] = false
20
- opts.on('-e', '--seo', 'MODE FLAG: SEO mode') do
21
- options['seo'] = true
22
- end
23
- options['filename'] = nil
24
- opts.on('-o', '--out FILENAME', 'Dump output to selected filename') do |filename|
25
- options['filename'] = filename
26
- end
27
- # Define the options, and what they do
28
- options['verbose'] = false
29
- opts.on('-v', '--verbose', 'Output more information') do
30
- options['verbose'] = true
31
- end
32
- options['progress'] = false
33
- opts.on('-p', '--progress', 'Output progress bar') do
34
- options['progress'] = true
35
- end
36
- options['maxpages'] = false
37
- opts.on('-l', '--limit PAGE_LIMIT_#', 'set a max on the total number of crawled pages') do |maxpages|
38
- options[:maxpages] = maxpages
39
- end
40
- options['autodown'] = false
41
- opts.on('-a', '--auto', 'Automatically download all files of filetype located') do
42
- options[:autodown] = true
43
- end
44
- # This displays the help screen, all programs are
45
- # assumed to have this option.
46
- opts.on('-h', '--help', 'Display this screen') do
47
- puts opts
48
- exit
49
- end
50
- end
6
+ optparse = OptionParser.new do |opts|
7
+ # Set a banner, displayed at the top
8
+ # of the help screen.
9
+ opts.banner = 'Usage: rr [MODE FLAG] [options] Target_URL'
10
+ options['sitemap'] = false
11
+ opts.on('-s', '--sitemap [FORMAT]', 'MODE FLAG: Sitemap mode') do |type|
12
+ options['sitemap'] = type || ''
13
+ end
14
+ options['fileharvest'] = false
15
+ opts.on('-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode') do |file_e|
16
+ options['fileharvest'] = file_e
17
+ end
18
+ options['seo'] = false
19
+ opts.on('-e', '--seo', 'MODE FLAG: SEO mode') do
20
+ options['seo'] = true
21
+ end
22
+ options['filename'] = nil
23
+ opts.on('-o', '--out FILENAME', 'Dump output to file') do |file|
24
+ options['filename'] = file
25
+ end
26
+ # Define the options, and what they do
27
+ options['verbose'] = false
28
+ opts.on('-v', '--verbose', 'Output more information') do
29
+ options['verbose'] = true
30
+ end
31
+ options['progress'] = false
32
+ opts.on('-p', '--progress', 'Output progress bar') do
33
+ options['progress'] = true
34
+ end
35
+ options['maxpages'] = false
36
+ opts.on('-l',
37
+ '--limit PAGE_LIMIT_#',
38
+ 'set a max on the total number of crawled pages') do |maxp|
39
+ options['maxpages'] = maxp
40
+ end
41
+ options['autodown'] = false
42
+ opts.on('-a', '--auto', 'Automatically download all files located') do
43
+ options['autodown'] = true
44
+ end
45
+ # This displays the help screen, all programs are
46
+ # assumed to have this option.
47
+ opts.on('-h', '--help', 'Display this screen') do
48
+ puts opts
49
+ exit
50
+ end
51
+ end
51
52
 
52
53
  optparse.parse!
53
54
  if ARGV[0].nil?
54
- abort('###Missing Required Argument\nUsage: rr [mode] [options] Target_URL')
55
+ abort("###Missing Required Argument\nUsage: rr [mode] [options] Target_URL")
55
56
  end
56
57
 
57
58
  ARGV.each do|q|
@@ -61,9 +62,11 @@ ARGV.each do|q|
61
62
  puts '### Creating Sitemap' if options['sitemap']
62
63
  puts "### Outputting in format: #{options['sitemap']}" if options['sitemap']
63
64
  puts '### Performing File Harvest' if options['fileharvest']
64
- puts "### Searching for file extension: #{options['fileharvest']} pages" if options['fileharvest']
65
+ if options['fileharvest']
66
+ puts "### Searching for filetype: #{options['fileharvest']}"
67
+ end
65
68
  puts '### Performing SEO Scrape' if options['seo']
66
- puts "### Writing output to filename: #{options['filename']}" if options['filename']
69
+ puts "### Writing to file: #{options['filename']}" if options['filename']
67
70
  puts '### Being verbose'
68
71
  puts "### Stopping after #{options['maxpages']} pages"
69
72
  end
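For reference, a minimal sketch of what the parsed `options` hash looks like for one illustrative invocation (the URL and flag values below are made up, but the keys come straight from the OptionParser block above):

```ruby
# Hypothetical invocation: rr --sitemap xml --progress --limit 100 http://www.cnet.com
# After optparse.parse!, the remaining ARGV holds the target URL and options is roughly:
options = {
  'sitemap'     => 'xml',   # -s/--sitemap [FORMAT]
  'fileharvest' => false,
  'seo'         => false,
  'filename'    => nil,
  'verbose'     => false,
  'progress'    => true,    # -p/--progress
  'maxpages'    => '100',   # -l/--limit (kept as a String; Fetch calls .to_i on it)
  'autodown'    => false
}
ARGV # => ["http://www.cnet.com"]
```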
data/lib/retriever/cli.rb CHANGED
@@ -3,19 +3,23 @@ module Retriever
3
3
  class CLI
4
4
  def initialize(url, options)
5
5
  # kick off the fetch mode of choice
6
+ @fetch = choose_fetch_mode(url, options)
7
+ @fetch.dump
8
+ @fetch.write if options['filename']
9
+ @fetch.autodownload if options['autodown'] && options['fileharvest']
10
+ @fetch.gen_xml if /XML/i =~ options['sitemap'].to_s
11
+ end
12
+
13
+ def choose_fetch_mode(url, options)
6
14
  if options['fileharvest']
7
- @fetch = Retriever::FetchFiles.new(url, options)
15
+ Retriever::FetchFiles.new(url, options)
8
16
  elsif options['sitemap']
9
- @fetch = Retriever::FetchSitemap.new(url, options)
17
+ Retriever::FetchSitemap.new(url, options)
10
18
  elsif options['seo']
11
- @fetch = Retriever::FetchSEO.new(url, options)
19
+ Retriever::FetchSEO.new(url, options)
12
20
  else
13
21
  fail '### Error: No Mode Selected'
14
22
  end
15
- @fetch.dump
16
- @fetch.write if options['filename']
17
- @fetch.autodownload if options['autodown'] && options['fileharvest']
18
- @fetch.gen_xml if /XML/i =~ options['sitemap'].to_s
19
23
  end
20
24
  end
21
25
  end
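A minimal sketch of driving the refactored CLI class directly from Ruby rather than through `bin/rr` (the URL and option values are illustrative):

```ruby
require 'retriever'

opts = { 'seo' => true, 'filename' => 'cnet-seo', 'maxpages' => '50' }
Retriever::CLI.new('http://www.cnet.com', opts)
# choose_fetch_mode returns a Retriever::FetchSEO instance; the constructor then
# calls #dump to print the collected data and #write (because 'filename' is set),
# which saves the results to cnet-seo.csv
```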
data/lib/retriever/fetch.rb CHANGED
@@ -9,56 +9,27 @@ require 'bloomfilter-rb'
9
9
  module Retriever
10
10
  #
11
11
  class Fetch
12
+ HR = '###############################'
12
13
  attr_reader :max_pages, :t
13
14
  # given target URL and RR options, creates a fetch object.
14
15
  # There is no direct output
15
16
  # this is a parent class that the other fetch classes build off of.
16
17
  def initialize(url, options)
18
+ @data = []
17
19
  @connection_tally = {
18
- :success => 0,
19
- :error => 0,
20
- :error_client => 0,
21
- :error_server => 0
20
+ success: 0,
21
+ error: 0,
22
+ error_client: 0,
23
+ error_server: 0
22
24
  }
23
- # OPTIONS
24
- @prgrss = options['progress']
25
- @max_pages = options['maxpages'] ? options['maxpages'].to_i : 100
26
- @v = options['verbose']
27
- @output = options['filename']
28
- @fh = options['fileharvest']
29
- @file_ext = @fh.to_s
30
- @s = options['sitemap']
31
- @seo = options['seo']
32
- @autodown = options['autodown']
33
- #
34
- if @fh
35
- temp_ext_str = '.' + @file_ext + '\z'
36
- @file_re = Regexp.new(temp_ext_str).freeze
37
- else
38
- # when FH is not true, and autodown is true
39
- errlog('Cannot AUTODOWNLOAD when not in FILEHARVEST MODE') if @autodown
40
- end
41
- if @prgrss
42
- # verbose & progressbar conflict
43
- errlog('CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p') if @v
44
- prgress_vars = {
45
- :title => 'Pages',
46
- :starting_at => 1,
47
- :total => @max_pages,
48
- :format => '%a |%b>%i| %c/%C %t'
49
- }
50
- @progressbar = ProgressBar.create(prgress_vars)
51
- end
25
+ setup_options(options)
26
+ setup_progress_bar if @progress
52
27
  @t = Retriever::Target.new(url, @file_re)
53
- @output = "rr-#{@t.host.split('.')[1]}" if @fh && !@output
54
- @already_crawled = BloomFilter::Native.new(
55
- :size => 1_000_000,
56
- :hashes => 5,
57
- :seed => 1,
58
- :bucket => 8,
59
- :raise => false
60
- )
61
- @already_crawled.insert(@t.target)
28
+ @output = "rr-#{@t.host.split('.')[1]}" if @fileharvest && !@output
29
+ @already_crawled = setup_bloom_filter
30
+ @page_one = crawl_page_one
31
+ @link_stack = create_link_stack
32
+ @temp_link_stack = []
62
33
  end
63
34
 
64
35
  def errlog(msg)
@@ -66,35 +37,26 @@ module Retriever
66
37
  end
67
38
 
68
39
  def lg(msg)
69
- puts "### #{msg}" if @v
40
+ puts "### #{msg}" if @verbose
70
41
  end
71
42
 
72
43
  # prints current data collection to STDOUT
73
44
  def dump
74
- puts '###############################'
75
- if @v
76
- puts 'Connection Tally:'
77
- puts @connection_tally.to_s
78
- puts '###############################'
79
- end
80
- if @s
81
- puts "#{@t.target} Sitemap"
82
- puts "Page Count: #{@data.size}"
83
- elsif @fh
84
- puts "Target URL: #{@t.target}"
85
- puts "Filetype: #{@file_ext}"
86
- puts "File Count: #{@data.size}"
45
+ puts HR
46
+ puts "Connection Tally:\n#{@connection_tally}\n#{HR}" if @verbose
47
+ puts "Target URL: #{@t.target}"
48
+ if @sitemap
49
+ puts 'Sitemap'
50
+ elsif @fileharvest
51
+ puts "File harvest by type: #{@fileharvest}"
87
52
  elsif @seo
88
- puts "#{@t.target} SEO Metrics"
89
- puts "Page Count: #{@data.size}"
90
- else
91
- fail 'ERROR - Cannot dump - Mode Not Found'
53
+ puts 'SEO Metrics'
92
54
  end
93
- puts '###############################'
55
+ puts "Data Dump -- Object Count: #{@data.size}"
56
+ puts HR
94
57
  @data.each do |line|
95
58
  puts line
96
59
  end
97
- puts '###############################'
98
60
  puts
99
61
  end
100
62
 
@@ -111,34 +73,90 @@ module Retriever
111
73
  csv << entry
112
74
  end
113
75
  end
114
- puts '###############################'
76
+ puts HR
115
77
  puts "File Created: #{@output}.csv"
116
78
  puts "Object Count: #{@data.size}"
117
- puts '###############################'
79
+ puts HR
118
80
  puts
119
81
  end
120
82
 
83
+ private
84
+
85
+ def setup_options(options)
86
+ @progress = options['progress']
87
+ @max_pages = options['maxpages'] ? options['maxpages'].to_i : 100
88
+ @verbose = options['verbose']
89
+ @output = options['filename']
90
+ @fileharvest = options['fileharvest']
91
+ @sitemap = options['sitemap']
92
+ @seo = options['seo']
93
+ @autodown = options['autodown']
94
+ @file_re = Regexp.new(".#{@fileharvest}\z").freeze if @fileharvest
95
+ end
96
+
97
+ def setup_bloom_filter
98
+ already_crawled = BloomFilter::Native.new(
99
+ size: 1_000_000,
100
+ hashes: 5,
101
+ seed: 1,
102
+ bucket: 8,
103
+ raise: false
104
+ )
105
+ already_crawled.insert(@t.target)
106
+ already_crawled
107
+ end
108
+
109
+ def setup_progress_bar
110
+ # verbose & progressbar conflict
111
+ errlog('CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME') if @verbose
112
+ prgress_vars = {
113
+ title: 'Pages',
114
+ starting_at: 1,
115
+ total: @max_pages,
116
+ format: '%a |%b>%i| %c/%C %t'
117
+ }
118
+ @progressbar = ProgressBar.create(prgress_vars)
119
+ end
120
+
121
+ def crawl_page_one
122
+ page_one = Retriever::Page.new(@t.source, @t)
123
+ lg("URL Crawled: #{@t.target}")
124
+ page_one
125
+ end
126
+
127
+ def create_link_stack
128
+ link_stack = @page_one.parse_internal_visitable
129
+ errlog("Bad URL -- #{@t.target}") unless link_stack
130
+ lg("#{link_stack.size - 1} links found")
131
+ link_stack.delete(@t.target)
132
+ link_stack.take(@max_pages) if (link_stack.size + 1) > @max_pages
133
+ link_stack
134
+ end
135
+
136
+ def end_crawl_notice
137
+ notice = "#{HR}\nENDING CRAWL\nCan't find any more links."
138
+ @progressbar.log(notice) if @progress
139
+ lg(notice)
140
+ end
141
+
121
142
  # iterates over the existing @link_stack
122
143
  # running until we reach the @max_pages value.
123
144
  def async_crawl_and_collect
124
145
  while @already_crawled.size < @max_pages
125
146
  if @link_stack.empty?
126
- if @prgrss
127
- @progressbar.log("Can't find any more links.")
128
- else
129
- lg("Can't find any more links.")
130
- end
147
+ end_crawl_notice
131
148
  break
132
149
  end
133
150
  new_links_arr = process_link_stack
151
+ @temp_link_stack = []
134
152
  next if new_links_arr.nil? || new_links_arr.empty?
135
153
  # set operations to see are these in our previous visited pages arr
136
- new_links_arr -= @link_stack
137
- @link_stack.concat(new_links_arr).uniq!
138
- @data.concat(new_links_arr) if @s
154
+ next if new_links_arr.empty?
155
+ @link_stack.concat(new_links_arr)
156
+ next unless @sitemap
157
+ @data.concat(new_links_arr)
139
158
  end
140
- # done, make sure progress bar says we are done
141
- @progressbar.finish if @prgrss
159
+ @data.uniq!
142
160
  end
143
161
 
144
162
  # returns true is resp is ok to continue
@@ -149,8 +167,8 @@ module Retriever
149
167
  loc = hdr.location
150
168
  lg("#{url} Redirected to #{loc}")
151
169
  if t.host_re =~ loc
152
- @link_stack.push(loc) unless @already_crawled.include?(loc)
153
- lg('--Added to linkStack for later')
170
+ @temp_link_stack.push(loc) unless @already_crawled.include?(loc)
171
+ lg('--Added to stack for later')
154
172
  return false
155
173
  end
156
174
  lg("Redirection outside of target host. No - go. #{loc}")
@@ -159,7 +177,6 @@ module Retriever
159
177
  # lets not continue if unsuccessful connection
160
178
  unless hdr.successful?
161
179
  lg("UNSUCCESSFUL CONNECTION -- #{url}")
162
-
163
180
  @connection_tally[:error] += 1
164
181
  @connection_tally[:error_server] += 1 if hdr.server_error?
165
182
  @connection_tally[:error_client] += 1 if hdr.client_error?
@@ -168,7 +185,6 @@ module Retriever
168
185
  # let's not continue if not text/html
169
186
  unless hdr['CONTENT_TYPE'].include?('text/html')
170
187
  @already_crawled.insert(url)
171
- @link_stack.delete(url)
172
188
  lg("Page Not text/html -- #{url}")
173
189
  return false
174
190
  end
@@ -176,45 +192,58 @@ module Retriever
176
192
  true
177
193
  end
178
194
 
195
+ def push_seo_to_data(url, new_page)
196
+ seos = [url]
197
+ seos.concat(new_page.parse_seo)
198
+ @data.push(seos)
199
+ lg('--page SEO scraped')
200
+ end
201
+
202
+ def push_files_to_data(new_page)
203
+ filez = new_page.parse_files(new_page.parse_internal)
204
+ @data.concat(filez) unless filez.empty?
205
+ lg("--#{filez.size} files found")
206
+ end
207
+
208
+ def page_from_response(url, response)
209
+ lg("Page Fetched: #{url}")
210
+ @already_crawled.insert(url)
211
+ if @progress && (@already_crawled.size < @max_pages)
212
+ @progressbar.increment
213
+ end
214
+ Retriever::Page.new(response, @t)
215
+ end
216
+
217
+ def new_visitable_links(current_page)
218
+ lg("--#{current_page.links.size} links found")
219
+ current_page.parse_internal_visitable
220
+ end
221
+
179
222
  # send a new wave of GET requests, using current @link_stack
223
+ # at end of the loop it empties link_stack
224
+ # puts new links into temporary stack
180
225
  def process_link_stack
181
- new_stuff = []
182
226
  EM.synchrony do
183
227
  concurrency = 10
184
228
  EM::Synchrony::FiberIterator.new(@link_stack, concurrency).each do |url|
185
229
  next if @already_crawled.size >= @max_pages
186
230
  next if @already_crawled.include?(url)
187
-
188
231
  resp = EventMachine::HttpRequest.new(url).get
189
-
190
232
  next unless good_response?(resp, url)
191
- lg("Page Fetched: #{url}")
192
- @already_crawled.insert(url)
193
-
194
- new_page = Retriever::Page.new(resp.response, @t)
195
- if @prgrss
196
- @progressbar.increment if @already_crawled.size < @max_pages
197
- end
198
- if @seo
199
- seos = [url]
200
- seos.concat(new_page.parse_seo)
201
- @data.push(seos)
202
- lg('--page SEO scraped')
203
- end
204
- next if new_page.links.size == 0
205
- lg("--#{new_page.links.size} links found")
206
- internal_links_arr = new_page.parse_internal_visitable
207
- new_stuff.push(internal_links_arr)
208
- if @fh
209
- filez = new_page.parse_files
210
- @data.concat(filez) unless filez.empty?
211
- lg("--#{filez.size} files found")
212
- end
233
+ current_page = page_from_response(url, resp.response)
234
+ # non-link dependent modes
235
+ push_seo_to_data(url, current_page) if @seo
236
+ next unless current_page.links.size > 0
237
+ @temp_link_stack.push(new_visitable_links(current_page))
238
+ # link dependent modes
239
+ next unless @fileharvest
240
+ push_files_to_data(current_page)
213
241
  end
214
- new_stuff = new_stuff.flatten # all completed requests
215
242
  EventMachine.stop
216
243
  end
217
- new_stuff.uniq!
244
+ # empty the stack. most clean way
245
+ @link_stack = []
246
+ @temp_link_stack.flatten.uniq!
218
247
  end
219
248
  end
220
249
  end
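The crawl loop above leans on `@already_crawled` to answer "have we fetched this URL yet?" without storing every URL. A minimal sketch of the bloomfilter-rb calls it uses, with the same constructor parameters as `setup_bloom_filter` (the URLs are illustrative):

```ruby
require 'bloomfilter-rb'

already_crawled = BloomFilter::Native.new(
  size: 1_000_000, hashes: 5, seed: 1, bucket: 8, raise: false
)
already_crawled.insert('http://www.cnet.com/')
already_crawled.include?('http://www.cnet.com/')         # => true
already_crawled.include?('http://www.cnet.com/reviews/') # => false (with high probability)
already_crawled.size  # used above as the count of crawled pages vs. @max_pages
```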
data/lib/retriever/fetchfiles.rb CHANGED
@@ -5,29 +5,21 @@ module Retriever
5
5
  class FetchFiles < Fetch
6
6
  def initialize(url, options)
7
7
  super
8
- @data = []
9
- page_one = Retriever::Page.new(@t.source, @t)
10
- @link_stack = page_one.parse_internal_visitable
11
- lg("URL Crawled: #{@t.target}")
12
- lg("#{@link_stack.size - 1} new links found")
13
-
14
- temp_file_collection = page_one.parse_files
8
+ temp_file_collection = @page_one.parse_files(@page_one.parse_internal)
15
9
  @data.concat(temp_file_collection) if temp_file_collection.size > 0
16
10
  lg("#{@data.size} new files found")
17
- errlog("Bad URL -- #{@t.target}") unless @link_stack
18
- @link_stack.delete(@t.target)
19
11
 
20
12
  async_crawl_and_collect
21
-
13
+ # done, make sure progress bar says we are done
14
+ @progressbar.finish if @progress
22
15
  @data.sort_by! { |x| x.length }
23
- @data.uniq!
24
16
  end
25
17
 
26
18
  def download_file(path)
27
19
  # given valid url, downloads file to current directory in /rr-downloads/
28
20
  arr = path.split('/')
29
21
  shortname = arr.pop
30
- puts "Initiating Download to: '/rr-downloads/' + #{shortname}"
22
+ puts "Initiating Download of: #{shortname}"
31
23
  File.open(shortname, 'wb') do |saved_file|
32
24
  open(path) do |read_file|
33
25
  saved_file.write(read_file.read)
@@ -38,33 +30,39 @@ module Retriever
38
30
 
39
31
  def autodownload
40
32
  # go through the fetched file URL collection and download each one.
41
- lenny = @data.count
42
- puts '###################'
33
+ puts HR
43
34
  puts '### Initiating Autodownload...'
44
- puts '###################'
45
- puts "#{lenny} - #{@file_ext}'s Located"
46
- puts '###################'
47
- if File.directory?('rr-downloads')
48
- Dir.chdir('rr-downloads')
49
- else
50
- puts 'creating rr-downloads Directory'
51
- Dir.mkdir('rr-downloads')
52
- Dir.chdir('rr-downloads')
53
- end
54
- file_counter = 0
55
- @data.each do |entry|
35
+ puts HR
36
+ puts "#{@data.count} - #{@file_ext}'s Located"
37
+ puts HR
38
+ move_to_download_dir
39
+ iterate_thru_collection_and_download
40
+ Dir.chdir('..')
41
+ end
42
+
43
+ private
44
+
45
+ def iterate_thru_collection_and_download
46
+ lenn = @data.count
47
+ @data.each_with_index do |entry, i|
56
48
  begin
57
49
  download_file(entry)
58
- file_counter += 1
59
- lg(' File [#{file_counter} of #{lenny}]')
60
- puts
61
- rescue StandardError => e
62
- puts 'ERROR: failed to download - #{entry}'
63
- puts e.message
64
- puts
50
+ rescue StandardError
51
+ puts "ERROR: failed to download - #{entry}"
65
52
  end
53
+ lg(" File [#{i + 1} of #{lenn}]\n")
66
54
  end
67
- Dir.chdir('..')
55
+ end
56
+
57
+ def move_to_download_dir(dir_name = 'rr-downloads')
58
+ if File.directory?(dir_name)
59
+ Dir.chdir(dir_name)
60
+ else
61
+ puts "creating #{dir_name} Directory"
62
+ Dir.mkdir(dir_name)
63
+ Dir.chdir(dir_name)
64
+ end
65
+ puts "Downloading files to local directory: '/#{dir_name}/'"
68
66
  end
69
67
  end
70
68
  end
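A sketch of a file-harvest run end to end, assuming the gem is installed (target URL and filetype are illustrative):

```ruby
require 'retriever'

opts = { 'fileharvest' => 'pdf', 'autodown' => true }
Retriever::CLI.new('http://www.cnet.com', opts)
# FetchFiles gathers the internal links that match the requested filetype, and
# because both 'autodown' and 'fileharvest' are set the CLI calls #autodownload,
# which saves each located file into ./rr-downloads/
```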
data/lib/retriever/fetchseo.rb CHANGED
@@ -6,19 +6,11 @@ module Retriever
6
6
  # on all unique pages found on the site
7
7
  def initialize(url, options)
8
8
  super
9
- @data = []
10
- page_one = Retriever::Page.new(@t.source, @t)
11
- lg("URL Crawled: #{@t.target}")
12
-
13
- @link_stack = page_one.parse_internal_visitable
14
- errlog("Bad URL -- #{@t.target}") unless @link_stack
15
- lg("#{@link_stack.size - 1} links found")
16
- @link_stack.delete(@t.target)
17
-
18
- @data.push(page_one.parse_seo)
9
+ @data.push(@page_one.parse_seo)
19
10
 
20
11
  async_crawl_and_collect
21
-
12
+ # done, make sure progress bar says we are done
13
+ @progressbar.finish if @progress
22
14
  @data.sort_by! { |x| x[0].length }
23
15
  end
24
16
  end
data/lib/retriever/fetchsitemap.rb CHANGED
@@ -5,37 +5,38 @@ module Retriever
5
5
  # returns an array of all unique pages found on the site
6
6
  def initialize(url, options)
7
7
  super
8
- @data = [@t.target]
9
- page_one = Retriever::Page.new(@t.source, @t)
10
- lg("URL Crawled: #{@t.target}")
11
- @link_stack = page_one.parse_internal_visitable
12
- errlog("Bad URL -- #{@t.target}") unless @link_stack
13
- lg("#{@link_stack.size - 1} links found")
14
-
15
- @link_stack.delete(@t.target)
8
+ @data.push(@t.target)
16
9
  @data.concat(@link_stack)
17
10
 
18
11
  async_crawl_and_collect
19
-
12
+ # done, make sure progress bar says we are done
13
+ @progressbar.finish if @progress
20
14
  @data.sort_by! { |x| x.length } if @data.size > 1
21
15
  @data.uniq!
22
16
  end
23
17
 
18
+ private
19
+
24
20
  # produces valid XML sitemap based on page collection fetched.
25
21
  # Writes to current directory.
26
22
  def gen_xml
27
- f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
28
- f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
29
- @data.each do |url|
30
- f << "<url><loc>#{url}</loc></url>"
31
- end
23
+ filename = @t.host.split('.')[1]
24
+ f = File.open("sitemap-#{filename}.xml", 'w+')
25
+ f << "<?xml version='1.0' encoding='UTF-8'?>"
26
+ f << "<urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
27
+ @data.each do |url|
28
+ f << "<url><loc>#{url}</loc></url>"
29
+ end
32
30
  f << '</urlset>'
33
31
  f.close
34
- puts '###############################'
35
- puts "File Created: sitemap-#{@t.host.split('.')[1]}.xml"
32
+ print_file_info(filename)
33
+ end
34
+
35
+ def print_file_info(filename)
36
+ puts HR
37
+ puts "File Created: sitemap-#{filename}.xml"
36
38
  puts "Object Count: #{@data.size}"
37
- puts '###############################'
38
- puts
39
+ puts HR + "\n"
39
40
  end
40
41
  end
41
42
  end
data/lib/retriever/link.rb CHANGED
@@ -1,33 +1,35 @@
1
+ require 'addressable/uri'
1
2
  module Retriever
2
3
  #
3
4
  class Link
4
- HTTP_RE = Regexp.new(/^http/i).freeze
5
- SINGLE_SLASH_RE = Regexp.new(/^\/{1}[^\/]/).freeze
6
- DOUBLE_SLASH_RE = Regexp.new(/^\/{2}[^\/]/).freeze
7
- NO_SLASH_PAGE_RE = Regexp.new(/^[a-z0-9\-\_\=\?\.]+\z/ix).freeze
8
- DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
9
-
10
- def initialize(host, link)
11
- @host = host
12
- @link = link
5
+ # HTTP_RE = Regexp.new(/^http/i).freeze
6
+ SLASH_RE = Regexp.new(%r(^/{1}[^/])).freeze
7
+ DOUBLE_SLASH_RE = Regexp.new(%r(^/{2}[^/])).freeze
8
+ WWW_DOT_RE = Regexp.new(/^www\./i).freeze
9
+
10
+ def initialize(target_scheme, target_host, this_link)
11
+ @link_uri = Addressable::URI.parse(this_link)
12
+ @scheme = target_scheme
13
+ @host = target_host
14
+ @this_link = @link_uri.to_s
13
15
  end
14
16
 
15
17
  def path
16
- return link if HTTP_RE =~ link
18
+ return this_link if link_uri.absolute?
17
19
 
18
- return "http://#{link}" if DUB_DUB_DUB_DOT_RE =~ link
20
+ return "#{@scheme}://#{this_link}" if WWW_DOT_RE =~ this_link
19
21
 
20
- return "http://#{host}#{link}" if SINGLE_SLASH_RE =~ link
22
+ return "#{@scheme}://#{host}#{this_link}" if SLASH_RE =~ this_link
21
23
 
22
24
  # link begins with '//'
23
- return "http:#{link}" if DOUBLE_SLASH_RE =~ link
25
+ return "#{@scheme}:#{this_link}" if DOUBLE_SLASH_RE =~ this_link
24
26
 
25
27
  # link uses relative path with no slashes at all
26
- return "http://#{host}/#{link}" if NO_SLASH_PAGE_RE =~ link
28
+ return "#{@scheme}://#{host}/#{this_link}" if link_uri.relative?
27
29
  end
28
30
 
29
31
  private
30
32
 
31
- attr_reader :host, :link
33
+ attr_reader :this_link, :host, :link_uri
32
34
  end
33
35
  end
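A sketch of how the new scheme-aware `Link#path` resolves the different href shapes, assuming a target of `https://www.cnet.com` (the example links are illustrative):

```ruby
require 'retriever'

Retriever::Link.new('https', 'www.cnet.com', 'http://www.cnet.com/reviews/').path
# => "http://www.cnet.com/reviews/"   (absolute URI passes through untouched)
Retriever::Link.new('https', 'www.cnet.com', 'www.cnet.com/download.exe').path
# => "https://www.cnet.com/download.exe"
Retriever::Link.new('https', 'www.cnet.com', '/reviews/').path
# => "https://www.cnet.com/reviews/"
Retriever::Link.new('https', 'www.cnet.com', '//cdn.cnet.com/a.js').path
# => "https://cdn.cnet.com/a.js"
Retriever::Link.new('https', 'www.cnet.com', 'cpage_18').path
# => "https://www.cnet.com/cpage_18"
```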
data/lib/retriever/openuri_redirect_patch.rb CHANGED
@@ -1,6 +1,7 @@
1
+ #
1
2
  module OpenURI
2
3
  # nesc patch otherwise OPENURI blocks redirects to and from https
3
- def OpenURI.redirectable?(uri1, uri2)
4
+ def self.redirectable?(uri1, uri2)
4
5
  uri1.scheme.downcase == uri2.scheme.downcase ||
5
6
  (/\A(?:http|https)\z/i =~ uri1.scheme && /\A(?:http|https)\z/i =~ uri2.scheme)
6
7
  end
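A sketch of what the patch buys: with it in place, OpenURI will follow a redirect that hops between http and https (URLs illustrative):

```ruby
require 'open-uri'
require 'retriever' # loads the patched predicate above

OpenURI.redirectable?(URI('http://cnet.com/'), URI('https://www.cnet.com/'))
# truthy with the patch, so an http -> https redirect is followed rather than
# raising "redirection forbidden"
```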
data/lib/retriever/page.rb CHANGED
@@ -1,21 +1,40 @@
1
+ require 'addressable/uri'
2
+
1
3
  module Retriever
2
4
  #
3
5
  class Page
4
- HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
5
- NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico|xml)/ix).freeze
6
- HTTP_RE = Regexp.new(/^http/i).freeze
7
- DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
8
-
9
- TITLE_RE = Regexp.new(/<title>(.*)<\/title>/i).freeze
10
- DESC_RE = Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\'][^>]*content=[\"]([^\"]*)[\"][^>]*>/i).freeze
11
- H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
12
- H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze
6
+ HTTP_RE = Regexp.new(/^http/i).freeze
7
+ H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
8
+ H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze
9
+ TITLE_RE = Regexp.new(/<title>(.*)<\/title>/i).freeze
10
+ DESC_RE = Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\']
11
+ [^>]*content=[\"]
12
+ (
13
+ [^\"]*
14
+ )
15
+ [\"]
16
+ [^>]
17
+ *>
18
+ /ix).freeze
19
+ HREF_CONTENTS_RE = Regexp.new(/\shref=
20
+ ['|"]
21
+ (
22
+ [^\s]
23
+ [a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+
24
+ )
25
+ ['|"]
26
+ [\s|\W]
27
+ /ix).freeze
28
+ NONPAGE_EXT_RE = Regexp.new(/\.
29
+ (?:css|js|png|gif|jpg|mp4|
30
+ wmv|flv|mp3|wav|doc|txt|ico|xml)
31
+ /ix).freeze
13
32
 
14
33
  attr_reader :links, :source, :t
15
34
 
16
35
  def initialize(source, t)
17
36
  @t = t
18
- @source = source.encode('UTF-8', :invalid => :replace, :undef => :replace)
37
+ @source = source.encode('UTF-8', invalid: :replace, undef: :replace)
19
38
  @links = nil
20
39
  end
21
40
 
@@ -28,20 +47,20 @@ module Retriever
28
47
  # filter some malformed URLS that come in
29
48
  # meant to be a loose filter to catch all reasonable HREF attributes.
30
49
  link = match[0]
31
- Link.new(@t.host, link).path
32
- end.uniq
50
+ Link.new(@t.scheme, @t.host, link).path
51
+ end.compact.uniq
33
52
  end
34
53
 
35
54
  def parse_internal
36
- links.select { |linky| (@t.host_re =~ linky) }
55
+ links.select { |x| @t.host == Addressable::URI.parse(x).host }
37
56
  end
38
57
 
39
58
  def parse_internal_visitable
40
- parse_internal.select { |linky| (!(NONPAGE_EXT_RE =~ linky)) }
59
+ parse_internal.select { |x| !(NONPAGE_EXT_RE =~ x) }
41
60
  end
42
61
 
43
- def parse_files
44
- links.select { |linky| (@t.file_re =~ linky) }
62
+ def parse_files(arr)
63
+ arr.select { |x| @t.file_re =~ x }
45
64
  end
46
65
 
47
66
  def title
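A short offline sketch of the Page parsing helpers with the new signatures (the HTML snippet and target are made up):

```ruby
require 'retriever'

t      = Retriever::Target.new('http://www.cnet.com/', /\.exe\z/)
source = "<a href='/reviews/'>reviews</a> <a href='http://www.cnet.com/dl.exe'>dl</a>"
page   = Retriever::Page.new(source, t)

page.links
# => roughly ["http://www.cnet.com/reviews/", "http://www.cnet.com/dl.exe"]
page.parse_internal            # keeps links whose host matches the target host
page.parse_internal_visitable  # same, minus non-page extensions (css, js, images, ...)
page.parse_files(page.parse_internal)
# => ["http://www.cnet.com/dl.exe"]  (matches the /\.exe\z/ file regex)
```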
data/lib/retriever/target.rb CHANGED
@@ -1,21 +1,22 @@
1
1
  require 'open-uri'
2
+ require 'addressable/uri'
2
3
 
3
4
  module Retriever
4
5
  #
5
6
  class Target
6
- HTTP_RE = Regexp.new(/^http/i).freeze
7
- DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
7
+ HTTP_RE = Regexp.new(/^http/i).freeze
8
8
 
9
- attr_reader :host, :target, :host_re, :source, :file_re
9
+ attr_reader :host, :target, :host_re, :source, :file_re, :scheme
10
10
 
11
11
  def initialize(url, file_re = nil)
12
- url = "http://#{url}" unless HTTP_RE =~ url
13
- fail 'Bad URL' unless /\./ =~ url
14
- new_uri = URI(url)
15
- @target = new_uri.to_s
16
- @host = new_uri.host
17
- @host_re = Regexp.new(@host.sub('www.', ''))
18
- @file_re ||= file_re
12
+ fail 'Bad URL' unless url.include?('.')
13
+ url = "http://#{url}" unless HTTP_RE =~ url
14
+ target_uri = Addressable::URI.parse(url)
15
+ @target = target_uri.to_s
16
+ @host = target_uri.host
17
+ @host_re = Regexp.new(@host.sub('www.', ''))
18
+ @file_re ||= file_re
19
+ @scheme = target_uri.scheme
19
20
  end
20
21
 
21
22
  def source
@@ -31,13 +32,14 @@ module Retriever
31
32
  fail 'Domain is not working. Try the non-WWW version.' if resp == ''
32
33
  fail 'Domain not working. Try HTTPS???' unless resp
33
34
  # consider using scrub from ruby 2.1? this misses some things
34
- resp.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace)
35
+ resp.encode('UTF-8', 'binary', invalid: :replace, undef: :replace)
35
36
  end
36
37
 
37
38
  def resync_target_and_return_source(url)
38
- new_t = Retriever::Target.new(url)
39
+ new_t = Retriever::Target.new(url)
39
40
  @target = new_t.target
40
- @host = new_t.host
41
+ @host = new_t.host
42
+ @scheme = new_t.scheme
41
43
  new_t.source
42
44
  end
43
45
  end
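A sketch of the Addressable-backed Target with the new `scheme` reader (the URL is illustrative; no request is made until `#source` is called):

```ruby
require 'retriever'

t = Retriever::Target.new('cnet.com/reviews/', /\.exe\z/)
t.target   # => "http://cnet.com/reviews/"  (scheme added because none was given)
t.host     # => "cnet.com"
t.scheme   # => "http"
t.host_re  # => /cnet.com/
t.file_re  # => /\.exe\z/
```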
data/lib/retriever/version.rb CHANGED
@@ -1,3 +1,4 @@
1
+ #
1
2
  module Retriever
2
- VERSION = '1.1.0'
3
- end
3
+ VERSION = '1.2.0'
4
+ end
data/lib/retriever.rb CHANGED
@@ -6,7 +6,7 @@ require 'retriever/cli'
6
6
  require 'retriever/link'
7
7
  require 'retriever/target'
8
8
  require 'retriever/page'
9
- require 'retriever/openuri-redirect-patch'
9
+ require 'retriever/openuri_redirect_patch'
10
10
 
11
11
  #
12
12
  module Retriever
data/readme.md CHANGED
@@ -4,15 +4,29 @@
4
4
 
5
5
  By Joe Norton
6
6
 
7
- RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader.
7
+ RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader.
8
8
 
9
- RubyRetriever (RR) uses asynchronous HTTP requests, thanks to [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony), to crawl webpages *very quickly*. Another neat thing about RR, is it uses a ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of page's it has already crawled.
10
-
11
- **Use at Own Risk**
12
- RR does NOT respect robots.txt, and RR currently - by default - launches up to 10 parallel GET requests at once. This is a feature, do not abuse it.
9
+ RubyRetriever (RR) uses asynchronous HTTP requests, thanks to [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony), to crawl webpages *very quickly*. Another neat thing about RR is that it uses a Ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) to keep track of pages it has already crawled.
13
10
 
14
11
  **v1.0 Update (6/07/2014)** - Includes major code changes, a lot of bug fixes. Much better in dealing with redirects, and issues with the host changing, etc. Also, added the SEO mode -- which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility -- and thus, this was update 1.0!
12
+ mission
13
+ -------
14
+ RubyRetriever aims to be the best command-line crawling and scraping package written in Ruby.
15
+
16
+ features
17
+ --------
18
+ * Asynchronous HTTP Requests thru EM & Synchrony
19
+ * Bloom filter for tracking pages visited.
20
+ * 3 CLI modes: 1) Sitemap, 2) File Harvest, 3) SEO
21
+
22
+ use-cases
23
+ ---------
24
+ RubyRetriever can do multiple things for you. With a single command at the terminal, RR can:
25
+ 1. Crawl your website and output a *valid XML sitemap* based on what it found.
26
+ 2. Crawl a target website and *download all files of a given filetype*.
27
+ 3. Crawl a target website and *collect important SEO information* such as page titles, meta descriptions, h1 tags, etc. and write it to CSV.
15
28
 
29
+ Help & Forks Welcome!
16
30
 
17
31
  getting started
18
32
  -----------
data/spec/link_spec.rb CHANGED
@@ -1,66 +1,68 @@
1
1
  require 'retriever'
2
2
 
3
- describe "Link" do
3
+ describe 'Link' do
4
4
 
5
- t = Retriever::Target.new("http://www.cnet.com/reviews/")
6
- let(:links) { Retriever::Page.new(@source,t).links }
5
+ t = Retriever::Target.new('http://www.cnet.com/reviews/')
6
+ let(:links) { Retriever::Page.new(@source, t).links }
7
7
 
8
- it "collects links in anchor tags" do
9
- @source = (<<SOURCE).strip
10
- <a href='http://www.cnet.com/download.exe'>download</a>
8
+ it 'collects links in anchor tags' do
9
+ @source = (<<SOURCE).strip
10
+ <a href='http://www.cnet.com/download.exe'>download</a>
11
11
  SOURCE
12
12
 
13
- expect(links).to include('http://www.cnet.com/download.exe')
14
- end
13
+ expect(links).to include('http://www.cnet.com/download.exe')
14
+ end
15
15
 
16
- it "collects links in link tags" do
17
- @source = (<<SOURCE).strip
18
- <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
16
+ it 'collects links in link tags' do
17
+ @source = (<<SOURCE).strip
18
+ <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
19
19
  SOURCE
20
20
 
21
- expect(links).to include('http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12')
22
- end
21
+ expect(links[0]).to include('formreset.css?ver=1.7.12')
22
+ end
23
23
 
24
- it "does not collect bare links (ones not in an href)" do
25
- @source = (<<SOURCE).strip
24
+ it 'does not collect bare links (ones not in an href)' do
25
+ @source = (<<SOURCE).strip
26
26
  http://www.google.com
27
27
  SOURCE
28
28
 
29
- expect(links).to_not include('http://www.google.com')
30
- end
29
+ expect(links).to_not include('http://www.google.com')
30
+ end
31
31
 
32
- it "collects only unique href links on the page" do
33
- @source = (<<SOURCE).strip
32
+ it 'collects only unique href links on the page' do
33
+ @source = (<<SOURCE).strip
34
34
  <a href='http://www.cnet.com/products/gadgets'>gadgets</a>
35
35
  <a href='http://www.cnet.com/products/gadgets'>gadgets2</a>
36
36
  SOURCE
37
37
 
38
- expect(links.size).to eq(1)
39
- end
38
+ expect(links.size).to eq(1)
39
+ end
40
40
 
41
- it "adds a protocol to urls missing them (www.)" do
42
- @source = (<<SOURCE).strip
41
+ it 'adds a protocol to urls missing them (www.)' do
42
+ @source = (<<SOURCE).strip
43
43
  <a href='www.cnet.com/download.exe'>download</a>
44
44
  SOURCE
45
45
 
46
- expect(links).to include('http://www.cnet.com/download.exe')
47
- end
46
+ expect(links).to include('http://www.cnet.com/download.exe')
47
+ end
48
48
 
49
- it "doesn't care about any extra attributes on the anchor tag" do
50
- @source = (<<SOURCE).strip
49
+ it "doesn't care about any extra attributes on the anchor tag" do
50
+ @source = (<<SOURCE).strip
51
51
  <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
52
- <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'></a>
52
+ <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
53
+ </a>
53
54
  SOURCE
54
55
 
55
- expect(links.size).to eq(1)
56
- end
56
+ expect(links.size).to eq(1)
57
+ end
57
58
 
58
- it "returns relative urls with full path based on hostname" do
59
- @source = (<<SOURCE).strip
59
+ it 'returns relative urls with full path based on hostname' do
60
+ @source = (<<SOURCE).strip
60
61
  <a href='/test.html'>test</a>
61
62
  <a href='cpage_18'>about</a>
62
63
  SOURCE
63
64
 
64
- expect(links).to include("http://www.cnet.com/test.html","http://www.cnet.com/cpage_18")
65
- end
66
- end
65
+ expect(links).to include('http://www.cnet.com/test.html',
66
+ 'http://www.cnet.com/cpage_18')
67
+ end
68
+ end
data/spec/page_spec.rb CHANGED
@@ -1,93 +1,97 @@
1
1
  require 'retriever/page'
2
2
  require 'retriever/fetch'
3
3
 
4
- t = Retriever::Target.new("http://www.cnet.com/reviews/",/\.exe\z/)
4
+ t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)
5
5
 
6
- describe "Page" do
6
+ describe 'Page' do
7
7
 
8
- describe "#links" do
9
- let (:links){Retriever::Page.new(@source,t).links}
10
- it "collects all unique href links on the page" do
11
- @source = (<<SOURCE).strip
8
+ describe '#links' do
9
+ let(:links) { Retriever::Page.new(@source, t).links }
10
+ it 'collects all unique href links on the page' do
11
+ @source = (<<SOURCE).strip
12
12
  <a href='www.cnet.com/download.exe'>download</a>
13
13
  <a href='/test.html'>test</a>
14
- <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'></a>
14
+ <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
15
+ </a>
15
16
  <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
16
- <a href='http://www.yahoo.com/test/'>yahoo</a>
17
+ <a href='http://www.yahoo.com/test/'>yahoo</a>
17
18
  SOURCE
18
19
 
19
20
  expect(links.size).to eq(4)
20
21
  end
21
22
  end
22
23
 
23
- describe "#parse_internal" do
24
- let (:links){Retriever::Page.new(@source,t).parse_internal}
25
- it "filters links by host" do
26
- @source = (<<SOURCE).strip
24
+ describe '#parse_internal' do
25
+ let(:page) { Retriever::Page.new(@source, t) }
26
+ let(:links) { page.parse_internal }
27
+ it 'filters links by host' do
28
+ @source = (<<SOURCE).strip
27
29
  <a href='http://www.cnet.com/'>download</a>
28
- <a href='http://www.yahoo.com/test/'>yahoo</a>
30
+ <a href='http://www.yahoo.com/test/'>yahoo</a>
29
31
  SOURCE
30
32
 
31
- expect(links.size).to eq(1)
33
+ expect(links.size).to eq(1)
32
34
  end
33
35
  end
34
36
 
35
- describe "#parse_internal_visitable" do
36
- let (:links){Retriever::Page.new(@source,t).parse_internal_visitable}
37
+ describe '#parse_internal_visitable' do
38
+ let(:page) { Retriever::Page.new(@source, t) }
39
+ let(:links) { page.parse_internal_visitable }
37
40
  it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
38
- @source = (<<SOURCE).strip
41
+ @source = (<<SOURCE).strip
39
42
  <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
40
43
  SOURCE
41
- expect(links.size).to eq(0)
44
+ expect(links.size).to eq(0)
42
45
  end
43
46
  end
44
47
 
45
- describe "#parseFiles" do
46
- let (:links){Retriever::Page.new(@source,t).parse_files}
47
- it "filters links by filetype" do
48
- @source = (<<SOURCE).strip
48
+ describe '#parse_files' do
49
+ let(:page) { Retriever::Page.new(@source, t) }
50
+ let(:files) { page.parse_files(page.parse_internal) }
51
+ it 'filters links by filetype' do
52
+ @source = (<<SOURCE).strip
49
53
  <a href='www.cnet.com/download.exe'>download</a>
50
- http://www.google.com
54
+ http://www.google.com
51
55
  <a href='/test.html'>test</a>
52
56
  SOURCE
53
- expect(links.size).to eq(1)
57
+ expect(files.size).to eq(1)
54
58
  end
55
59
  end
56
60
 
57
- describe "#title" do
58
- let (:page){Retriever::Page.new(@source,t)}
59
- it "returns page title" do
60
- @source = (<<SOURCE).strip
61
+ describe '#title' do
62
+ let(:page) { Retriever::Page.new(@source, t) }
63
+ it 'returns page title' do
64
+ @source = (<<SOURCE).strip
61
65
  <title>test</title>
62
66
  SOURCE
63
- expect(page.title).to eq('test')
67
+ expect(page.title).to eq('test')
64
68
  end
65
69
  end
66
- describe "#desc" do
67
- let (:page){Retriever::Page.new(@source,t)}
68
- it "returns meta description" do
69
- @source = (<<SOURCE).strip
70
+ describe '#desc' do
71
+ let(:page) { Retriever::Page.new(@source, t) }
72
+ it 'returns meta description' do
73
+ @source = (<<SOURCE).strip
70
74
  <meta name='description' content="test2 ">
71
75
  SOURCE
72
- expect(page.desc).to eq('test2 ')
76
+ expect(page.desc).to eq('test2 ')
73
77
  end
74
78
  end
75
- describe "#h1" do
76
- let (:page){Retriever::Page.new(@source,t)}
77
- it "returns h1 text" do
78
- @source = (<<SOURCE).strip
79
+ describe '#h1' do
80
+ let(:page) { Retriever::Page.new(@source, t) }
81
+ it 'returns h1 text' do
82
+ @source = (<<SOURCE).strip
79
83
  <h1>test 3</h1>
80
84
  SOURCE
81
- expect(page.h1).to eq('test 3')
85
+ expect(page.h1).to eq('test 3')
82
86
  end
83
87
  end
84
- describe "#h2" do
85
- let (:page){Retriever::Page.new(@source,t)}
86
- it "returns h2 text" do
87
- @source = (<<SOURCE).strip
88
+ describe '#h2' do
89
+ let(:page) { Retriever::Page.new(@source, t) }
90
+ it 'returns h2 text' do
91
+ @source = (<<SOURCE).strip
88
92
  <h2> test 4 </h2>
89
93
  SOURCE
90
- expect(page.h2).to eq(' test 4 ')
94
+ expect(page.h2).to eq(' test 4 ')
91
95
  end
92
96
  end
93
97
  end
@@ -1,5 +1,4 @@
1
1
  require 'retriever'
2
2
 
3
- describe "Fetch" do
4
-
5
- end
3
+ describe 'Fetch' do
4
+ end
data/spec/target_spec.rb CHANGED
@@ -1,44 +1,44 @@
1
1
  require 'retriever'
2
2
  require 'open-uri'
3
3
 
4
- t = Retriever::Target.new("http://www.cnet.com/reviews/",/\.exe\z/)
4
+ t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)
5
5
 
6
- describe "Target" do
6
+ describe 'Target' do
7
7
 
8
- it "creates target var" do
9
- expect(t.target).to eq("http://www.cnet.com/reviews/")
10
- end
8
+ it 'creates target var' do
9
+ expect(t.target).to eq('http://www.cnet.com/reviews/')
10
+ end
11
11
 
12
- it "creates host var" do
13
- expect(t.host).to eq("www.cnet.com")
14
- end
12
+ it 'creates host var' do
13
+ expect(t.host).to eq('www.cnet.com')
14
+ end
15
15
 
16
- it "creates host_re var" do
17
- expect(t.host_re).to eq(/cnet.com/)
18
- end
16
+ it 'creates host_re var' do
17
+ expect(t.host_re).to eq(/cnet.com/)
18
+ end
19
19
 
20
- it "creates file_re var (when provided)" do
21
- expect(t.file_re).to eq(/\.exe\z/)
22
- end
20
+ it 'creates file_re var (when provided)' do
21
+ expect(t.file_re).to eq(/\.exe\z/)
22
+ end
23
23
 
24
- it "adds protocol to Target URL if none given" do
25
- expect(Retriever::Target.new("cnet.com").target).to eq("http://cnet.com")
26
- end
24
+ it 'adds protocol to Target URL if none given' do
25
+ expect(Retriever::Target.new('cnet.com').target).to eq('http://cnet.com')
26
+ end
27
27
 
28
- it "fails if given URL has no dot in it" do
29
- expect{Retriever::Target.new("cnetcom")}.to raise_error
30
- end
28
+ it 'fails if given URL has no dot in it' do
29
+ expect { Retriever::Target.new('cnetcom') }.to raise_error
30
+ end
31
31
 
32
- describe "#source" do
32
+ describe '#source' do
33
33
 
34
- it "opens URL and returns source as String" do
35
- expect(Retriever::Target.new("http://techcrunch.com/").source.class).to eq(String)
34
+ it 'opens URL and returns source as String' do
35
+ expect(Retriever::Target.new('http://techcrunch.com/').source.class)
36
+ .to eq(String)
36
37
  end
37
38
 
38
- it "fails if target redirects to new host" do
39
- expect{Retriever::Target.new("http://tinyurl.com/nkfkypa").source}.to raise_error
39
+ it 'fails if target redirects to new host' do
40
+ expect { Retriever::Target.new('http://tinyurl.com/nkfkypa').source }
41
+ .to raise_error
40
42
  end
41
-
42
43
  end
43
-
44
- end
44
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rubyretriever
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Joe Norton
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - '>='
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: addressable
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: bundler
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -125,7 +139,7 @@ files:
125
139
  - lib/retriever/fetchseo.rb
126
140
  - lib/retriever/fetchsitemap.rb
127
141
  - lib/retriever/link.rb
128
- - lib/retriever/openuri-redirect-patch.rb
142
+ - lib/retriever/openuri_redirect_patch.rb
129
143
  - lib/retriever/page.rb
130
144
  - lib/retriever/target.rb
131
145
  - lib/retriever/version.rb