wgit 0.8.0 → 0.9.0

data/bin/wgit CHANGED
@@ -2,18 +2,22 @@

  require 'wgit'

- # Eval .wgit.rb file (if it exists).
- def eval_wgit
-   puts 'Searching for .wgit.rb in local and home directories...'
+ # Eval .wgit.rb file (if it exists somewhere).
+ def eval_wgit(filepath = nil)
+   puts 'Searching for .wgit.rb file in local and home directories...'

-   ['.', Dir.home].each do |dir|
+   [filepath, Dir.pwd, Dir.home].each do |dir|
      path = "#{dir}/.wgit.rb"
      next unless File.exist?(path)

-     puts "Eval'ing #{path} (call `eval_wgit` after changes)"
+     puts "Eval'ing #{path}"
+     puts 'Call `eval_wgit` after changes to re-eval the file'

      eval(File.read(path))
+
      break
    end
+
+   nil
  end

  eval_wgit
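
A hypothetical `.wgit.rb` that this start-up hook might eval could look like the
following (plain Ruby; `wgit` is already required by the executable):

    # .wgit.rb (in the given filepath, the working directory or $HOME).
    # Anything defined here becomes available in the wgit session.
    def fetch_title(url)
      crawler = Wgit::Crawler.new(timeout: 10)
      doc     = crawler.crawl(Wgit::Url.new(url))
      doc&.title
    end
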
@@ -6,9 +6,11 @@ require_relative 'wgit/assertable'
  require_relative 'wgit/utils'
  require_relative 'wgit/url'
  require_relative 'wgit/document'
- require_relative 'wgit/document_extensions'
+ require_relative 'wgit/document_extractors'
  require_relative 'wgit/crawler'
  require_relative 'wgit/database/model'
  require_relative 'wgit/database/database'
  require_relative 'wgit/indexer'
+ require_relative 'wgit/dsl'
+ require_relative 'wgit/base'
  # require_relative 'wgit/core_ext' - Must be explicitly required.
@@ -6,7 +6,7 @@ module Wgit
  # Default type fail message.
  DEFAULT_TYPE_FAIL_MSG = 'Expected: %s, Actual: %s'
  # Wrong method message.
- WRONG_METHOD_MSG = 'arr must be Enumerable, use a different method'
+ NON_ENUMERABLE_MSG = 'Expected an Enumerable responding to #each, not: %s'
  # Default duck fail message.
  DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
  # Default required keys message.
@@ -42,7 +42,7 @@ present: %s"
  # @raise [StandardError] If the assertion fails.
  # @return [Object] The given arr on successful assertion.
  def assert_arr_types(arr, type_or_types, msg = nil)
-   raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
+   raise format(NON_ENUMERABLE_MSG, arr.class) unless arr.respond_to?(:each)

    arr.each { |obj| assert_types(obj, type_or_types, msg) }
  end
@@ -56,7 +56,7 @@ present: %s"
  # @raise [StandardError] If the assertion fails.
  # @return [Object] The given obj_or_objs on successful assertion.
  def assert_respond_to(obj_or_objs, methods, msg = nil)
-   methods = [methods] unless methods.respond_to?(:all?)
+   methods = *methods

    if obj_or_objs.respond_to?(:each)
      obj_or_objs.each { |obj| _assert_respond_to(obj, methods, msg) }
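
The `methods = *methods` change above leans on Ruby's splat normalisation, which
wraps a single value in an Array and leaves an Array untouched (plain Ruby
semantics, not wgit-specific):

    methods = *:upcase            # => [:upcase]
    methods = *[:strip, :chomp]   # => [:strip, :chomp]
    methods = *nil                # => [] (the old code would have produced [nil])
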
@@ -0,0 +1,30 @@
+ module Wgit
+   # Class to inherit from, as an alternative form of using the `Wgit::DSL`.
+   # All subclasses must define a `#parse(doc, &block)` method.
+   class Base
+     extend Wgit::DSL
+
+     # Runs the crawl/index passing each crawled `Wgit::Document` and the given
+     # block to the subclass's `#parse` method.
+     def self.run(&block)
+       obj = new
+       unless obj.respond_to?(:parse)
+         raise "#{obj.class} must respond_to? #parse(doc, &block)"
+       end
+
+       crawl_method = @method || :crawl
+       send(crawl_method) { |doc| obj.parse(doc, &block) }
+
+       obj
+     end
+
+     # Sets the crawl/index method to call when `Base.run` is called.
+     # The mode method must match one defined in the `Wgit::Crawler` or
+     # `Wgit::Indexer` class.
+     #
+     # @param method [Symbol] The crawl/index method to call.
+     def self.mode(method)
+       @method = method
+     end
+   end
+ end
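
A hedged sketch of how the new `Wgit::Base` class might be used; the seed URL is
illustrative and the `start` call is assumed to come from `Wgit::DSL`, which is
not shown in this diff:

    require 'wgit'

    class QuoteCrawler < Wgit::Base
      mode  :crawl_site                   # call Crawler#crawl_site when run
      start 'http://quotes.toscrape.com'  # hypothetical seed URL (via Wgit::DSL)

      # Called once per crawled Wgit::Document.
      def parse(doc, &_block)
        puts doc.title
      end
    end

    QuoteCrawler.run
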
@@ -6,12 +6,14 @@ require_relative 'utils'
  require_relative 'assertable'
  require_relative 'response'
  require 'set'
+ require 'benchmark'
  require 'typhoeus'
+ require 'ferrum'

  module Wgit
-   # The Crawler class provides a means of crawling web based HTTP Wgit::Url's,
-   # serialising their HTML into Wgit::Document instances. This is the only Wgit
-   # class which contains network logic e.g. HTTP request/response handling.
+   # The Crawler class provides a means of crawling web based HTTP `Wgit::Url`s,
+   # and serialising their HTML into `Wgit::Document` instances. This is the
+   # only Wgit class containing network logic (HTTP request/response handling).
    class Crawler
      include Assertable

@@ -38,12 +40,21 @@ module Wgit

  # The maximum amount of time (in seconds) a crawl request has to complete
  # before raising an error. Set to 0 to disable time outs completely.
- attr_accessor :time_out
+ attr_accessor :timeout

  # Whether or not to UTF-8 encode the response body once crawled. Set to
  # false if crawling more than just HTML e.g. images.
  attr_accessor :encode

+ # Whether or not to parse the Javascript of the crawled document.
+ # Parsing requires Chrome/Chromium to be installed and in $PATH.
+ attr_accessor :parse_javascript
+
+ # The delay between checks in a page's HTML size. When the page has stopped
+ # "growing", the Javascript has finished dynamically updating the DOM.
+ # The value should balance between a good UX and enough JS parse time.
+ attr_accessor :parse_javascript_delay
+
  # The Wgit::Response of the most recently crawled URL.
  attr_reader :last_response

@@ -51,20 +62,27 @@ module Wgit
  #
  # @param redirect_limit [Integer] The amount of allowed redirects before
  #   raising an error. Set to 0 to disable redirects completely.
- # @param time_out [Integer, Float] The maximum amount of time (in seconds)
+ # @param timeout [Integer, Float] The maximum amount of time (in seconds)
  #   a crawl request has to complete before raising an error. Set to 0 to
  #   disable time outs completely.
  # @param encode [Boolean] Whether or not to UTF-8 encode the response body
  #   once crawled. Set to false if crawling more than just HTML e.g. images.
- def initialize(redirect_limit: 5, time_out: 5, encode: true)
-   @redirect_limit = redirect_limit
-   @time_out = time_out
-   @encode = encode
+ # @param parse_javascript [Boolean] Whether or not to parse the Javascript
+ #   of the crawled document. Parsing requires Chrome/Chromium to be
+ #   installed and in $PATH.
+ def initialize(redirect_limit: 5, timeout: 5, encode: true,
+                parse_javascript: false, parse_javascript_delay: 1)
+   @redirect_limit = redirect_limit
+   @timeout = timeout
+   @encode = encode
+   @parse_javascript = parse_javascript
+   @parse_javascript_delay = parse_javascript_delay
  end

  # Crawls an entire website's HTML pages by recursively going through
- # its internal `<a>` links. Each crawled Document is yielded to a block.
- # Use `doc.empty?` to determine if the crawled link is valid.
+ # its internal `<a>` links; this can be overridden with `follow: xpath`.
+ # Each crawled Document is yielded to a block. Use `doc.empty?` to
+ # determine if the crawled link was successful / is valid.
  #
  # Use the allow and disallow paths params to partially and selectively
  # crawl a site; the glob syntax is fully supported e.g. `'wiki/\*'` etc.
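
Taken together, the renamed `timeout` and the new Javascript options are passed
at construction time. A minimal sketch (the URL is illustrative; `parse_javascript`
assumes Chrome/Chromium is installed and in $PATH):

    require 'wgit'

    crawler = Wgit::Crawler.new(
      timeout: 10,               # was `time_out:` in 0.8.0
      parse_javascript: true,    # render the page via Ferrum/Chrome before parsing
      parse_javascript_delay: 2  # seconds between checks of the DOM's HTML size
    )

    doc = crawler.crawl_url(Wgit::Url.new('https://example.com'))
    puts doc&.title
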
@@ -82,26 +100,36 @@ module Wgit
  # @param url [Wgit::Url] The base URL of the website to be crawled.
  #   It is recommended that this URL be the index page of the site to give a
  #   greater chance of finding all pages within that site/host.
- # @param allow_paths [String, Array<String>] Filters links by selecting
- #   them if their path `File.fnmatch?` one of allow_paths.
- # @param disallow_paths [String, Array<String>] Filters links by rejecting
- #   them if their path `File.fnmatch?` one of disallow_paths.
+ # @param follow [String] The xpath extracting links to be followed during
+ #   the crawl. This changes how a site is crawled. Only links pointing to
+ #   the site domain are allowed. The `:default` is any `<a>` href returning
+ #   HTML.
+ # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+ #   selecting them if their path `File.fnmatch?` one of allow_paths.
+ # @param disallow_paths [String, Array<String>] Filters the `follow` links
+ #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
  # @yield [doc] Given each crawled page (Wgit::Document) of the site.
  #   A block is the only way to interact with each crawled Document.
  #   Use `doc.empty?` to determine if the page is valid.
  # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
  #   from all of the site's pages or nil if the given url could not be
  #   crawled successfully.
- def crawl_site(url, allow_paths: nil, disallow_paths: nil, &block)
+ def crawl_site(
+   url, follow: :default, allow_paths: nil, disallow_paths: nil, &block
+ )
    doc = crawl_url(url, &block)
    return nil if doc.nil?

-   path_opts = { allow_paths: allow_paths, disallow_paths: disallow_paths }
+   link_opts = {
+     xpath: follow,
+     allow_paths: allow_paths,
+     disallow_paths: disallow_paths
+   }
    alt_url = url.end_with?('/') ? url.chop : url + '/'

    crawled = Set.new([url, alt_url])
    externals = Set.new(doc.external_links)
-   internals = Set.new(get_internal_links(doc, path_opts))
+   internals = Set.new(next_internal_links(doc, **link_opts))

    return externals.to_a if internals.empty?

@@ -116,7 +144,7 @@ module Wgit
      crawled += [orig_link, link] # Push both links in case of redirects.
      next if doc.nil?

-     internals += get_internal_links(doc, path_opts)
+     internals += next_internal_links(doc, **link_opts)
      externals += doc.external_links
    end
  end
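
A hedged sketch of the new `follow:` parameter in action (the URL and xpath are
illustrative only):

    require 'wgit'

    crawler = Wgit::Crawler.new
    externals = crawler.crawl_site(
      Wgit::Url.new('http://quotes.toscrape.com'),
      follow: "//li[@class='next']/a/@href",  # only follow the 'next page' links
      allow_paths: 'page/*'                   # glob filter applied to followed links
    ) do |doc|
      puts doc.url unless doc.empty?
    end
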
@@ -131,10 +159,11 @@ module Wgit
  # @param follow_redirects [Boolean, Symbol] Whether or not to follow
  #   redirects. Pass a Symbol to limit where the redirect is allowed to go
  #   e.g. :host only allows redirects within the same host. Choose from
- #   :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+ #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
  #   This value will be used for all urls crawled.
  # @yield [doc] Given each crawled page (Wgit::Document); this is the only
- #   way to interact with them.
+ #   way to interact with them. Use `doc.empty?` to determine if the page
+ #   is valid.
  # @raise [StandardError] If no urls are provided.
  # @return [Wgit::Document] The last Document crawled.
  def crawl_urls(*urls, follow_redirects: true, &block)
@@ -143,7 +172,7 @@ module Wgit
    opts = { follow_redirects: follow_redirects }
    doc = nil

-   Wgit::Utils.each(urls) { |url| doc = crawl_url(url, opts, &block) }
+   Wgit::Utils.each(urls) { |url| doc = crawl_url(url, **opts, &block) }

    doc
  end
@@ -151,13 +180,15 @@ module Wgit
  # Crawl the url returning the response Wgit::Document or nil, if an error
  # occurs.
  #
- # @param url [Wgit::Url] The Url to crawl; which will likely be modified.
+ # @param url [Wgit::Url] The Url to crawl; which will be modified in the
+ #   event of a redirect.
  # @param follow_redirects [Boolean, Symbol] Whether or not to follow
  #   redirects. Pass a Symbol to limit where the redirect is allowed to go
  #   e.g. :host only allows redirects within the same host. Choose from
- #   :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+ #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
  # @yield [doc] The crawled HTML page (Wgit::Document) regardless if the
  #   crawl was successful or not. Therefore, Document#url etc. can be used.
+ #   Use `doc.empty?` to determine if the page is valid.
  # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
  #   crawl was unsuccessful.
  def crawl_url(url, follow_redirects: true)
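
For reference, redirect scoping now uses `:origin` where 0.8.0 used `:base`; a
small hedged example with an illustrative URL:

    require 'wgit'

    crawler = Wgit::Crawler.new
    crawler.crawl_url(
      Wgit::Url.new('http://example.com'),
      follow_redirects: :origin  # only allow redirects within the same origin
    ) do |doc|
      puts(doc.empty? ? 'Crawl failed' : doc.title)
    end
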
@@ -175,16 +206,19 @@ module Wgit

  protected

- # Returns the url HTML String or nil. Handles any errors that arise
+ # Returns the URL's HTML String or nil. Handles any errors that arise
  # and sets the @last_response. Errors or any HTTP response that doesn't
  # return a HTML body will be ignored, returning nil.
  #
+ # If @parse_javascript is true, then the final resolved URL will be browsed
+ # to and Javascript parsed allowing for dynamic HTML generation.
+ #
  # @param url [Wgit::Url] The URL to fetch. This Url object is passed by
  #   reference and gets modified as a result of the fetch/crawl.
  # @param follow_redirects [Boolean, Symbol] Whether or not to follow
  #   redirects. Pass a Symbol to limit where the redirect is allowed to go
  #   e.g. :host only allows redirects within the same host. Choose from
- #   :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+ #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
  # @raise [StandardError] If url isn't valid and absolute.
  # @return [String, nil] The crawled HTML or nil if the crawl was
  #   unsuccessful.
@@ -193,6 +227,8 @@ module Wgit
    raise "Invalid url: #{url}" if url.invalid?

    resolve(url, response, follow_redirects: follow_redirects)
+   get_browser_response(url, response) if @parse_javascript
+
    response.body_or_nil
  rescue StandardError => e
    Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e}")
@@ -214,14 +250,14 @@ module Wgit
  # @param follow_redirects [Boolean, Symbol] Whether or not to follow
  #   redirects. Pass a Symbol to limit where the redirect is allowed to go
  #   e.g. :host only allows redirects within the same host. Choose from
- #   :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+ #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
  # @raise [StandardError] If a redirect isn't allowed etc.
  def resolve(url, response, follow_redirects: true)
-   orig_url_base = url.to_url.to_base # Recorded before any redirects.
+   origin = url.to_url.to_origin # Recorded before any redirects.
    follow_redirects, within = redirect?(follow_redirects)

    loop do
-     get_response(url, response)
+     get_http_response(url, response)
      break unless response.redirect?

      # Handle response 'Location' header.
@@ -233,7 +269,7 @@ module Wgit
      # Validate if the redirect is allowed.
      raise "Redirect not allowed: #{location}" unless follow_redirects

-     if within && !location.relative?(within => orig_url_base)
+     if within && !location.relative?(within => origin)
        raise "Redirect (outside of #{within}) is not allowed: '#{location}'"
      end

@@ -241,7 +277,7 @@ module Wgit
      if response.redirect_count >= @redirect_limit

      # Process the location to be crawled next.
-     location = url.to_base.concat(location) if location.relative?
+     location = url.to_origin.concat(location) if location.relative?
      response.redirections[url.to_s] = location.to_s
      url.replace(location) # Update the url on redirect.
    end
@@ -254,7 +290,7 @@ module Wgit
  #   reference.
  # @raise [StandardError] If a response can't be obtained.
  # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
- def get_response(url, response)
+ def get_http_response(url, response)
    # Perform a HTTP GET request.
    orig_url = url.to_s
    url = url.normalize if url.respond_to?(:normalize)
@@ -271,10 +307,40 @@ module Wgit
    response.add_total_time(http_response.total_time)

    # Log the request/response details.
-   log_http(response)
+   log_net(:http, response, http_response.total_time)
+
+   # Handle a failed response.
+   raise "No response (within timeout: #{@timeout} second(s))" \
+     if response.failure?
+ end
+
+ # Makes a browser request and enriches the given Wgit::Response from it.
+ #
+ # @param url [String] The url to browse to. Will call url#normalize if
+ #   possible.
+ # @param response [Wgit::Response] The response to enrich. Modifies by
+ #   reference.
+ # @raise [StandardError] If a response can't be obtained.
+ # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
+ def get_browser_response(url, response)
+   url = url.normalize if url.respond_to?(:normalize)
+   browser = nil
+
+   crawl_time = Benchmark.measure { browser = browser_get(url) }.real
+   yield browser if block_given?
+
+   # Enrich the given Wgit::Response object (on top of Typhoeus response).
+   response.adapter_response = browser.network.response
+   response.status = browser.network.response.status
+   response.headers = browser.network.response.headers
+   response.body = browser.body
+   response.add_total_time(crawl_time)
+
+   # Log the request/response details.
+   log_net(:browser, response, crawl_time)

    # Handle a failed response.
-   raise "No response (within timeout: #{@time_out} second(s))" \
+   raise "No browser response (within timeout: #{@timeout} second(s))" \
      if response.failure?
  end

@@ -285,7 +351,7 @@ module Wgit
  def http_get(url)
    opts = {
      followlocation: false,
-     timeout: @time_out,
+     timeout: @timeout,
      accept_encoding: 'gzip',
      headers: {
        'User-Agent' => "wgit/#{Wgit::VERSION}",
@@ -294,37 +360,58 @@ module Wgit
    }

    # See https://rubydoc.info/gems/typhoeus for more info.
-   Typhoeus.get(url, opts)
+   Typhoeus.get(url, **opts)
+ end
+
+ # Performs a HTTP GET request in a web browser and parses the response JS
+ # before returning the HTML body of the fully rendered webpage. This allows
+ # Javascript (SPA apps etc.) to generate HTML dynamically.
+ #
+ # @param url [String] The url to browse to.
+ # @return [Ferrum::Browser] The browser response object.
+ def browser_get(url)
+   @browser ||= Ferrum::Browser.new(timeout: @timeout, process_timeout: 10)
+   @browser.goto(url)
+
+   # Wait for the page's JS to finish dynamically manipulating the DOM.
+   html = @browser.body
+   loop do
+     sleep @parse_javascript_delay
+     break if html.size == @browser.body.size
+
+     html = @browser.body
+   end
+
+   @browser
  end

  # Returns a doc's internal HTML page links in absolute form; used when
- # crawling a site. Use the allow and disallow paths params to partially
- # and selectively crawl a site; the glob syntax is supported e.g.
- # `'wiki/\*'` etc. Note that each path should NOT start with a slash.
+ # crawling a site. By default, any `<a>` href returning HTML is returned;
+ # override this with `xpath:` if desired.
  #
- # Override this method in a subclass to change how a site
- # is crawled, not what is extracted from each page (Document extensions
- # should be used for this purpose instead). Just remember that only HTML
- # files containing `<a>` links keep the crawl going beyond the base URL.
+ # Use the allow and disallow paths params to partially and selectively
+ # crawl a site; the glob syntax is supported e.g. `'wiki/\*'` etc. Note
+ # that each path should NOT start with a slash.
  #
  # @param doc [Wgit::Document] The document from which to extract it's
  #   internal (absolute) page links.
+ # @param xpath [String] The xpath selecting links to be returned. Only
+ #   links pointing to the doc.url domain are allowed. The :default is any
+ #   <a> href returning HTML. The allow/disallow paths will be applied to
+ #   the returned value.
  # @param allow_paths [String, Array<String>] Filters links by selecting
  #   them if their path `File.fnmatch?` one of allow_paths.
  # @param disallow_paths [String, Array<String>] Filters links by rejecting
  #   them if their path `File.fnmatch?` one of disallow_paths.
  # @return [Array<Wgit::Url>] The internal page links from doc.
- def get_internal_links(doc, allow_paths: nil, disallow_paths: nil)
-   links = doc
-           .internal_absolute_links
-           .map(&:omit_fragment) # Because fragments don't alter content.
-           .uniq
-           .select do |link|
-             ext = link.to_extension
-             ext ?
-               Wgit::Crawler.supported_file_extensions.include?(ext.downcase) :
-               true # URLs without an extension are assumed HTML.
-           end
+ def next_internal_links(
+   doc, xpath: :default, allow_paths: nil, disallow_paths: nil
+ )
+   links = if xpath && xpath != :default
+             follow_xpath(doc, xpath)
+           else
+             follow_default(doc)
+           end

    return links if allow_paths.nil? && disallow_paths.nil?

@@ -333,29 +420,40 @@ module Wgit

  private

- # Returns whether or not to follow redirects, and within what context e.g.
- # :host, :domain etc.
- def redirect?(follow_redirects)
-   return [true, follow_redirects] if follow_redirects.is_a?(Symbol)
+ # Returns the next links used to continue crawling a site. The xpath value
+ # is used to obtain the links. Any valid URL Strings will be converted into
+ # absolute Wgit::Urls. Invalid URLs will be silently dropped. Any link not
+ # pointing to the site domain will raise an error.
+ def follow_xpath(doc, xpath)
+   links = doc.send(:extract_from_html, xpath, singleton: false) do |urls|
+     urls
+       .map { |url| Wgit::Url.parse?(url)&.make_absolute(doc) }
+       .compact
+   end

-   unless [true, false].include?(follow_redirects)
-     raise "follow_redirects: must be a Boolean or Symbol, not: \
-     #{follow_redirects}"
+   if links.any? { |link| link.to_domain != doc.url.to_domain }
+     raise 'The links to follow must be within the site domain'
    end

-   [follow_redirects, nil]
+   links
  end

- # Log (at debug level) the HTTP request/response details.
- def log_http(response)
-   resp_template = '[http] Response: %s (%s bytes in %s seconds)'
-   log_status = (response.status || 0)
-   log_total_time = response.total_time.truncate(3)
-
-   Wgit.logger.debug("[http] Request: #{response.url}")
-   Wgit.logger.debug(
-     format(resp_template, log_status, response.size, log_total_time)
-   )
+ # Returns the default set of links used to continue crawling a site.
+ # By default, any <a> href returning HTML and pointing to the same domain
+ # will get returned.
+ def follow_default(doc)
+   doc
+     .internal_absolute_links
+     .map(&:omit_fragment) # Because fragments don't alter content.
+     .uniq
+     .select do |link| # Whitelist only HTML content.
+       ext = link.to_extension
+       if ext
+         Wgit::Crawler.supported_file_extensions.include?(ext.downcase)
+       else
+         true # URLs without an extension are assumed HTML.
+       end
+     end
  end

  # Validate and filter by the given URL paths.
@@ -375,14 +473,17 @@ module Wgit

  # Validate the paths are suitable for filtering.
  def validate_paths(paths)
-   paths = [paths] unless paths.is_a?(Array)
+   paths = *paths
    raise 'The provided paths must all be Strings' \
      unless paths.all? { |path| path.is_a?(String) }

-   Wgit::Utils.process_arr(paths, encode: false)
+   Wgit::Utils.sanitize(paths, encode: false)
    raise 'The provided paths cannot be empty' if paths.empty?

-   paths
+   paths.map do |path|
+     path = Wgit::Url.parse(path)
+     path.index? ? path : path.omit_slashes
+   end
  end

@@ -390,7 +491,7 @@ module Wgit
  def filter_links(links, filter_method, paths)
    links.send(filter_method) do |link|
      # Turn http://example.com into / meaning index.
-     link = link.to_endpoint == '/' ? '/' : link.omit_base
+     link = link.to_endpoint.index? ? '/' : link.omit_base

      match = false
      paths.each do |pattern|
@@ -402,6 +503,35 @@ module Wgit
      end
    end

+ # Returns whether or not to follow redirects, and within what context e.g.
+ # :host, :domain etc.
+ def redirect?(follow_redirects)
+   return [true, follow_redirects] if follow_redirects.is_a?(Symbol)
+
+   unless [true, false].include?(follow_redirects)
+     raise "follow_redirects: must be a Boolean or Symbol, not: \
+       #{follow_redirects}"
+   end
+
+   [follow_redirects, nil]
+ end
+
+ # Log (at debug level) the network request/response details.
+ def log_net(client, response, duration)
+   resp_template = "[#{client}] Response: %s (%s bytes in %s seconds)"
+   log_status = (response.status || 0)
+   log_total_time = (duration || 0.0).truncate(3)
+
+   # The browsers request URL is the same so ignore it.
+   if client.to_sym == :http
+     Wgit.logger.debug("[#{client}] Request: #{response.url}")
+   end
+
+   Wgit.logger.debug(
+     format(resp_template, log_status, response.size, log_total_time)
+   )
+ end
+
  alias crawl crawl_urls
  alias crawl_pages crawl_urls
  alias crawl_page crawl_url