wgit 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/wgit CHANGED
@@ -2,18 +2,22 @@

 require 'wgit'

-# Eval .wgit.rb file (if it exists).
-def eval_wgit
-  puts 'Searching for .wgit.rb in local and home directories...'
+# Eval .wgit.rb file (if it exists somewhere).
+def eval_wgit(filepath = nil)
+  puts 'Searching for .wgit.rb file in local and home directories...'

-  ['.', Dir.home].each do |dir|
+  [filepath, Dir.pwd, Dir.home].each do |dir|
     path = "#{dir}/.wgit.rb"
     next unless File.exist?(path)

-    puts "Eval'ing #{path} (call `eval_wgit` after changes)"
+    puts "Eval'ing #{path}"
+    puts 'Call `eval_wgit` after changes to re-eval the file'
     eval(File.read(path))
+
     break
   end
+
+  nil
 end

 eval_wgit
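
For context, the executable eval's the first `.wgit.rb` it finds (an explicit filepath, then the working directory, then the home directory) and now returns nil, presumably to keep the console output clean. A minimal sketch of what such a file might contain; the `define_extractor` call assumes the renamed `Wgit::Document` extractor API from this release and is illustrative only:

# .wgit.rb - eval'd into the wgit console session on startup (hypothetical example).

# Assumed API: Wgit::Document.define_extractor(name, xpath, singleton:, text_content_only:).
Wgit::Document.define_extractor(
  :h1_headings, '//h1', singleton: false, text_content_only: true
)

# A helper available to the session after eval.
def new_crawler
  Wgit::Crawler.new(timeout: 10)
end
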

data/lib/wgit.rb CHANGED
@@ -6,9 +6,11 @@ require_relative 'wgit/assertable'
 require_relative 'wgit/utils'
 require_relative 'wgit/url'
 require_relative 'wgit/document'
-require_relative 'wgit/document_extensions'
+require_relative 'wgit/document_extractors'
 require_relative 'wgit/crawler'
 require_relative 'wgit/database/model'
 require_relative 'wgit/database/database'
 require_relative 'wgit/indexer'
+require_relative 'wgit/dsl'
+require_relative 'wgit/base'
 # require_relative 'wgit/core_ext' - Must be explicitly required.

data/lib/wgit/assertable.rb CHANGED
@@ -6,7 +6,7 @@ module Wgit
     # Default type fail message.
     DEFAULT_TYPE_FAIL_MSG = 'Expected: %s, Actual: %s'
     # Wrong method message.
-    WRONG_METHOD_MSG = 'arr must be Enumerable, use a different method'
+    NON_ENUMERABLE_MSG = 'Expected an Enumerable responding to #each, not: %s'
     # Default duck fail message.
     DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
     # Default required keys message.
@@ -42,7 +42,7 @@ present: %s"
     # @raise [StandardError] If the assertion fails.
     # @return [Object] The given arr on successful assertion.
     def assert_arr_types(arr, type_or_types, msg = nil)
-      raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
+      raise format(NON_ENUMERABLE_MSG, arr.class) unless arr.respond_to?(:each)

       arr.each { |obj| assert_types(obj, type_or_types, msg) }
     end
@@ -56,7 +56,7 @@ present: %s"
     # @raise [StandardError] If the assertion fails.
     # @return [Object] The given obj_or_objs on successful assertion.
     def assert_respond_to(obj_or_objs, methods, msg = nil)
-      methods = [methods] unless methods.respond_to?(:all?)
+      methods = *methods

       if obj_or_objs.respond_to?(:each)
         obj_or_objs.each { |obj| _assert_respond_to(obj, methods, msg) }
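
The `methods = *methods` splat is a terser way of wrapping a lone Symbol in an Array while leaving an existing Array untouched, and the new `NON_ENUMERABLE_MSG` now reports the offending class. A quick sketch of both (including `Wgit::Assertable` into a demo class is an assumption about typical usage):

methods = *:each          # => [:each]
methods = *[:each, :map]  # => [:each, :map]

require 'wgit'

class Demo
  include Wgit::Assertable
end

# 42 doesn't respond_to?(:each), so the formatted message names its class:
Demo.new.assert_arr_types(42, Integer)
# raises: "Expected an Enumerable responding to #each, not: Integer"
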

data/lib/wgit/base.rb ADDED
@@ -0,0 +1,30 @@
+module Wgit
+  # Class to inherit from, as an alternative form of using the `Wgit::DSL`.
+  # All subclasses must define a `#parse(doc, &block)` method.
+  class Base
+    extend Wgit::DSL
+
+    # Runs the crawl/index passing each crawled `Wgit::Document` and the given
+    # block to the subclass's `#parse` method.
+    def self.run(&block)
+      obj = new
+      unless obj.respond_to?(:parse)
+        raise "#{obj.class} must respond_to? #parse(doc, &block)"
+      end
+
+      crawl_method = @method || :crawl
+      send(crawl_method) { |doc| obj.parse(doc, &block) }
+
+      obj
+    end
+
+    # Sets the crawl/index method to call when `Base.run` is called.
+    # The mode method must match one defined in the `Wgit::Crawler` or
+    # `Wgit::Indexer` class.
+    #
+    # @param method [Symbol] The crawl/index method to call.
+    def self.mode(method)
+      @method = method
+    end
+  end
+end
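
The new `Wgit::Base` class pairs with the `Wgit::DSL` require added above: subclass it, define `#parse`, and call `run`. A minimal sketch; the DSL's `start` method and the example URL are assumptions not shown in this diff:

require 'wgit'

class HeadingScraper < Wgit::Base
  start 'http://example.com' # Assumed Wgit::DSL method setting the URL(s) to crawl.
  mode :crawl                # Optional; :crawl is also the default used by Base.run.

  # Called by Base.run once per crawled Wgit::Document.
  def parse(doc, &_block)
    puts doc.title unless doc.empty?
  end
end

HeadingScraper.run
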

data/lib/wgit/crawler.rb CHANGED
@@ -6,12 +6,14 @@ require_relative 'utils'
 require_relative 'assertable'
 require_relative 'response'
 require 'set'
+require 'benchmark'
 require 'typhoeus'
+require 'ferrum'

 module Wgit
-  # The Crawler class provides a means of crawling web based HTTP Wgit::Url's,
-  # serialising their HTML into Wgit::Document instances. This is the only Wgit
-  # class which contains network logic e.g. HTTP request/response handling.
+  # The Crawler class provides a means of crawling web based HTTP `Wgit::Url`s,
+  # and serialising their HTML into `Wgit::Document` instances. This is the
+  # only Wgit class containing network logic (HTTP request/response handling).
   class Crawler
     include Assertable

@@ -38,12 +40,21 @@ module Wgit

     # The maximum amount of time (in seconds) a crawl request has to complete
     # before raising an error. Set to 0 to disable time outs completely.
-    attr_accessor :time_out
+    attr_accessor :timeout

     # Whether or not to UTF-8 encode the response body once crawled. Set to
     # false if crawling more than just HTML e.g. images.
     attr_accessor :encode

+    # Whether or not to parse the Javascript of the crawled document.
+    # Parsing requires Chrome/Chromium to be installed and in $PATH.
+    attr_accessor :parse_javascript
+
+    # The delay between checks in a page's HTML size. When the page has stopped
+    # "growing", the Javascript has finished dynamically updating the DOM.
+    # The value should balance between a good UX and enough JS parse time.
+    attr_accessor :parse_javascript_delay
+
     # The Wgit::Response of the most recently crawled URL.
     attr_reader :last_response

@@ -51,20 +62,27 @@ module Wgit
     #
     # @param redirect_limit [Integer] The amount of allowed redirects before
     #   raising an error. Set to 0 to disable redirects completely.
-    # @param time_out [Integer, Float] The maximum amount of time (in seconds)
+    # @param timeout [Integer, Float] The maximum amount of time (in seconds)
     #   a crawl request has to complete before raising an error. Set to 0 to
     #   disable time outs completely.
     # @param encode [Boolean] Whether or not to UTF-8 encode the response body
     #   once crawled. Set to false if crawling more than just HTML e.g. images.
-    def initialize(redirect_limit: 5, time_out: 5, encode: true)
-      @redirect_limit = redirect_limit
-      @time_out = time_out
-      @encode = encode
+    # @param parse_javascript [Boolean] Whether or not to parse the Javascript
+    #   of the crawled document. Parsing requires Chrome/Chromium to be
+    #   installed and in $PATH.
+    def initialize(redirect_limit: 5, timeout: 5, encode: true,
+                   parse_javascript: false, parse_javascript_delay: 1)
+      @redirect_limit = redirect_limit
+      @timeout = timeout
+      @encode = encode
+      @parse_javascript = parse_javascript
+      @parse_javascript_delay = parse_javascript_delay
     end

     # Crawls an entire website's HTML pages by recursively going through
-    # its internal `<a>` links. Each crawled Document is yielded to a block.
-    # Use `doc.empty?` to determine if the crawled link is valid.
+    # its internal `<a>` links; this can be overridden with `follow: xpath`.
+    # Each crawled Document is yielded to a block. Use `doc.empty?` to
+    # determine if the crawled link was successful / is valid.
     #
     # Use the allow and disallow paths params to partially and selectively
     # crawl a site; the glob syntax is fully supported e.g. `'wiki/\*'` etc.
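
Given the renamed `timeout:` keyword and the new Javascript options above, constructing a crawler in 0.9.0 looks roughly like this (the URL is illustrative):

require 'wgit'

crawler = Wgit::Crawler.new(
  redirect_limit: 3,
  timeout: 10,               # Was time_out: in 0.8.0.
  encode: true,
  parse_javascript: true,    # Requires Chrome/Chromium in $PATH.
  parse_javascript_delay: 2  # Seconds between DOM size checks.
)

doc = crawler.crawl_url(Wgit::Url.new('http://example.com'))
puts doc.title unless doc.nil? || doc.empty?
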
@@ -82,26 +100,36 @@ module Wgit
     # @param url [Wgit::Url] The base URL of the website to be crawled.
     #   It is recommended that this URL be the index page of the site to give a
     #   greater chance of finding all pages within that site/host.
-    # @param allow_paths [String, Array<String>] Filters links by selecting
-    #   them if their path `File.fnmatch?` one of allow_paths.
-    # @param disallow_paths [String, Array<String>] Filters links by rejecting
-    #   them if their path `File.fnmatch?` one of disallow_paths.
+    # @param follow [String] The xpath extracting links to be followed during
+    #   the crawl. This changes how a site is crawled. Only links pointing to
+    #   the site domain are allowed. The `:default` is any `<a>` href returning
+    #   HTML.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    #   selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
     # @yield [doc] Given each crawled page (Wgit::Document) of the site.
     #   A block is the only way to interact with each crawled Document.
     #   Use `doc.empty?` to determine if the page is valid.
     # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
     #   from all of the site's pages or nil if the given url could not be
     #   crawled successfully.
-    def crawl_site(url, allow_paths: nil, disallow_paths: nil, &block)
+    def crawl_site(
+      url, follow: :default, allow_paths: nil, disallow_paths: nil, &block
+    )
       doc = crawl_url(url, &block)
       return nil if doc.nil?

-      path_opts = { allow_paths: allow_paths, disallow_paths: disallow_paths }
+      link_opts = {
+        xpath: follow,
+        allow_paths: allow_paths,
+        disallow_paths: disallow_paths
+      }
       alt_url = url.end_with?('/') ? url.chop : url + '/'

       crawled = Set.new([url, alt_url])
       externals = Set.new(doc.external_links)
-      internals = Set.new(get_internal_links(doc, path_opts))
+      internals = Set.new(next_internal_links(doc, **link_opts))

       return externals.to_a if internals.empty?

@@ -116,7 +144,7 @@ module Wgit
           crawled += [orig_link, link] # Push both links in case of redirects.
           next if doc.nil?

-          internals += get_internal_links(doc, path_opts)
+          internals += next_internal_links(doc, **link_opts)
           externals += doc.external_links
         end
       end
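
Putting the new `follow:` xpath and the path filters together, a site crawl can now be steered like this (the URL, xpath and paths are illustrative; the xpath must resolve to links on the same domain):

require 'wgit'

crawler = Wgit::Crawler.new

external_links = crawler.crawl_site(
  Wgit::Url.new('http://example.com/'),
  follow: "//a[@class='next']/@href",
  allow_paths: 'articles/*',
  disallow_paths: ['login', 'admin/*']
) do |doc|
  puts doc.url unless doc.empty?
end

puts "Collected #{external_links&.size || 0} external links"
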
@@ -131,10 +159,11 @@ module Wgit
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     #   This value will be used for all urls crawled.
     # @yield [doc] Given each crawled page (Wgit::Document); this is the only
-    #   way to interact with them.
+    #   way to interact with them. Use `doc.empty?` to determine if the page
+    #   is valid.
     # @raise [StandardError] If no urls are provided.
     # @return [Wgit::Document] The last Document crawled.
     def crawl_urls(*urls, follow_redirects: true, &block)
@@ -143,7 +172,7 @@ module Wgit
       opts = { follow_redirects: follow_redirects }
       doc = nil

-      Wgit::Utils.each(urls) { |url| doc = crawl_url(url, opts, &block) }
+      Wgit::Utils.each(urls) { |url| doc = crawl_url(url, **opts, &block) }

       doc
     end
@@ -151,13 +180,15 @@ module Wgit
     # Crawl the url returning the response Wgit::Document or nil, if an error
     # occurs.
     #
-    # @param url [Wgit::Url] The Url to crawl; which will likely be modified.
+    # @param url [Wgit::Url] The Url to crawl; which will be modified in the
+    #   event of a redirect.
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @yield [doc] The crawled HTML page (Wgit::Document) regardless if the
     #   crawl was successful or not. Therefore, Document#url etc. can be used.
+    #   Use `doc.empty?` to determine if the page is valid.
     # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
     #   crawl was unsuccessful.
     def crawl_url(url, follow_redirects: true)
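
The redirect scope Symbol `:base` has been renamed to `:origin` throughout; a single-page crawl restricting redirects to the same origin might look like this (URL illustrative):

require 'wgit'

crawler = Wgit::Crawler.new

doc = crawler.crawl_url(
  Wgit::Url.new('http://example.com/old-page'),
  follow_redirects: :origin # Redirects must stay within the original URL's origin.
) do |d|
  puts "Crawled #{d.url} (#{d.empty? ? 'empty' : 'ok'})"
end

puts crawler.last_response.status
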
@@ -175,16 +206,19 @@ module Wgit

     protected

-    # Returns the url HTML String or nil. Handles any errors that arise
+    # Returns the URL's HTML String or nil. Handles any errors that arise
     # and sets the @last_response. Errors or any HTTP response that doesn't
     # return a HTML body will be ignored, returning nil.
     #
+    # If @parse_javascript is true, then the final resolved URL will be browsed
+    # to and Javascript parsed allowing for dynamic HTML generation.
+    #
     # @param url [Wgit::Url] The URL to fetch. This Url object is passed by
     #   reference and gets modified as a result of the fetch/crawl.
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @raise [StandardError] If url isn't valid and absolute.
     # @return [String, nil] The crawled HTML or nil if the crawl was
     #   unsuccessful.
@@ -193,6 +227,8 @@ module Wgit
       raise "Invalid url: #{url}" if url.invalid?

       resolve(url, response, follow_redirects: follow_redirects)
+      get_browser_response(url, response) if @parse_javascript
+
       response.body_or_nil
     rescue StandardError => e
       Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e}")
@@ -214,14 +250,14 @@ module Wgit
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @raise [StandardError] If a redirect isn't allowed etc.
     def resolve(url, response, follow_redirects: true)
-      orig_url_base = url.to_url.to_base # Recorded before any redirects.
+      origin = url.to_url.to_origin # Recorded before any redirects.
       follow_redirects, within = redirect?(follow_redirects)

       loop do
-        get_response(url, response)
+        get_http_response(url, response)
         break unless response.redirect?

         # Handle response 'Location' header.
@@ -233,7 +269,7 @@ module Wgit
         # Validate if the redirect is allowed.
         raise "Redirect not allowed: #{location}" unless follow_redirects

-        if within && !location.relative?(within => orig_url_base)
+        if within && !location.relative?(within => origin)
           raise "Redirect (outside of #{within}) is not allowed: '#{location}'"
         end

@@ -241,7 +277,7 @@ module Wgit
           if response.redirect_count >= @redirect_limit

         # Process the location to be crawled next.
-        location = url.to_base.concat(location) if location.relative?
+        location = url.to_origin.concat(location) if location.relative?
         response.redirections[url.to_s] = location.to_s
         url.replace(location) # Update the url on redirect.
       end
@@ -254,7 +290,7 @@ module Wgit
     #   reference.
     # @raise [StandardError] If a response can't be obtained.
     # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
-    def get_response(url, response)
+    def get_http_response(url, response)
       # Perform a HTTP GET request.
       orig_url = url.to_s
       url = url.normalize if url.respond_to?(:normalize)
@@ -271,10 +307,40 @@ module Wgit
       response.add_total_time(http_response.total_time)

       # Log the request/response details.
-      log_http(response)
+      log_net(:http, response, http_response.total_time)
+
+      # Handle a failed response.
+      raise "No response (within timeout: #{@timeout} second(s))" \
+        if response.failure?
+    end
+
+    # Makes a browser request and enriches the given Wgit::Response from it.
+    #
+    # @param url [String] The url to browse to. Will call url#normalize if
+    #   possible.
+    # @param response [Wgit::Response] The response to enrich. Modifies by
+    #   reference.
+    # @raise [StandardError] If a response can't be obtained.
+    # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
+    def get_browser_response(url, response)
+      url = url.normalize if url.respond_to?(:normalize)
+      browser = nil
+
+      crawl_time = Benchmark.measure { browser = browser_get(url) }.real
+      yield browser if block_given?
+
+      # Enrich the given Wgit::Response object (on top of Typhoeus response).
+      response.adapter_response = browser.network.response
+      response.status = browser.network.response.status
+      response.headers = browser.network.response.headers
+      response.body = browser.body
+      response.add_total_time(crawl_time)
+
+      # Log the request/response details.
+      log_net(:browser, response, crawl_time)

       # Handle a failed response.
-      raise "No response (within timeout: #{@time_out} second(s))" \
+      raise "No browser response (within timeout: #{@timeout} second(s))" \
         if response.failure?
     end

@@ -285,7 +351,7 @@ module Wgit
     def http_get(url)
       opts = {
         followlocation: false,
-        timeout: @time_out,
+        timeout: @timeout,
         accept_encoding: 'gzip',
         headers: {
           'User-Agent' => "wgit/#{Wgit::VERSION}",
@@ -294,37 +360,58 @@ module Wgit
       }

       # See https://rubydoc.info/gems/typhoeus for more info.
-      Typhoeus.get(url, opts)
+      Typhoeus.get(url, **opts)
+    end
+
+    # Performs a HTTP GET request in a web browser and parses the response JS
+    # before returning the HTML body of the fully rendered webpage. This allows
+    # Javascript (SPA apps etc.) to generate HTML dynamically.
+    #
+    # @param url [String] The url to browse to.
+    # @return [Ferrum::Browser] The browser response object.
+    def browser_get(url)
+      @browser ||= Ferrum::Browser.new(timeout: @timeout, process_timeout: 10)
+      @browser.goto(url)
+
+      # Wait for the page's JS to finish dynamically manipulating the DOM.
+      html = @browser.body
+      loop do
+        sleep @parse_javascript_delay
+        break if html.size == @browser.body.size
+
+        html = @browser.body
+      end
+
+      @browser
     end

     # Returns a doc's internal HTML page links in absolute form; used when
-    # crawling a site. Use the allow and disallow paths params to partially
-    # and selectively crawl a site; the glob syntax is supported e.g.
-    # `'wiki/\*'` etc. Note that each path should NOT start with a slash.
+    # crawling a site. By default, any `<a>` href returning HTML is returned;
+    # override this with `xpath:` if desired.
     #
-    # Override this method in a subclass to change how a site
-    # is crawled, not what is extracted from each page (Document extensions
-    # should be used for this purpose instead). Just remember that only HTML
-    # files containing `<a>` links keep the crawl going beyond the base URL.
+    # Use the allow and disallow paths params to partially and selectively
+    # crawl a site; the glob syntax is supported e.g. `'wiki/\*'` etc. Note
+    # that each path should NOT start with a slash.
     #
     # @param doc [Wgit::Document] The document from which to extract it's
     #   internal (absolute) page links.
+    # @param xpath [String] The xpath selecting links to be returned. Only
+    #   links pointing to the doc.url domain are allowed. The :default is any
+    #   <a> href returning HTML. The allow/disallow paths will be applied to
+    #   the returned value.
     # @param allow_paths [String, Array<String>] Filters links by selecting
     #   them if their path `File.fnmatch?` one of allow_paths.
     # @param disallow_paths [String, Array<String>] Filters links by rejecting
     #   them if their path `File.fnmatch?` one of disallow_paths.
     # @return [Array<Wgit::Url>] The internal page links from doc.
-    def get_internal_links(doc, allow_paths: nil, disallow_paths: nil)
-      links = doc
-              .internal_absolute_links
-              .map(&:omit_fragment) # Because fragments don't alter content.
-              .uniq
-              .select do |link|
-        ext = link.to_extension
-        ext ?
-          Wgit::Crawler.supported_file_extensions.include?(ext.downcase) :
-          true # URLs without an extension are assumed HTML.
-      end
+    def next_internal_links(
+      doc, xpath: :default, allow_paths: nil, disallow_paths: nil
+    )
+      links = if xpath && xpath != :default
+                follow_xpath(doc, xpath)
+              else
+                follow_default(doc)
+              end

       return links if allow_paths.nil? && disallow_paths.nil?

@@ -333,29 +420,40 @@ module Wgit

     private

-    # Returns whether or not to follow redirects, and within what context e.g.
-    # :host, :domain etc.
-    def redirect?(follow_redirects)
-      return [true, follow_redirects] if follow_redirects.is_a?(Symbol)
+    # Returns the next links used to continue crawling a site. The xpath value
+    # is used to obtain the links. Any valid URL Strings will be converted into
+    # absolute Wgit::Urls. Invalid URLs will be silently dropped. Any link not
+    # pointing to the site domain will raise an error.
+    def follow_xpath(doc, xpath)
+      links = doc.send(:extract_from_html, xpath, singleton: false) do |urls|
+        urls
+          .map { |url| Wgit::Url.parse?(url)&.make_absolute(doc) }
+          .compact
+      end

-      unless [true, false].include?(follow_redirects)
-        raise "follow_redirects: must be a Boolean or Symbol, not: \
-          #{follow_redirects}"
+      if links.any? { |link| link.to_domain != doc.url.to_domain }
+        raise 'The links to follow must be within the site domain'
       end

-      [follow_redirects, nil]
+      links
     end

-    # Log (at debug level) the HTTP request/response details.
-    def log_http(response)
-      resp_template = '[http] Response: %s (%s bytes in %s seconds)'
-      log_status = (response.status || 0)
-      log_total_time = response.total_time.truncate(3)
-
-      Wgit.logger.debug("[http] Request: #{response.url}")
-      Wgit.logger.debug(
-        format(resp_template, log_status, response.size, log_total_time)
-      )
+    # Returns the default set of links used to continue crawling a site.
+    # By default, any <a> href returning HTML and pointing to the same domain
+    # will get returned.
+    def follow_default(doc)
+      doc
+        .internal_absolute_links
+        .map(&:omit_fragment) # Because fragments don't alter content.
+        .uniq
+        .select do |link| # Whitelist only HTML content.
+          ext = link.to_extension
+          if ext
+            Wgit::Crawler.supported_file_extensions.include?(ext.downcase)
+          else
+            true # URLs without an extension are assumed HTML.
+          end
+        end
     end

     # Validate and filter by the given URL paths.
@@ -375,14 +473,17 @@ module Wgit

     # Validate the paths are suitable for filtering.
     def validate_paths(paths)
-      paths = [paths] unless paths.is_a?(Array)
+      paths = *paths
       raise 'The provided paths must all be Strings' \
         unless paths.all? { |path| path.is_a?(String) }

-      Wgit::Utils.process_arr(paths, encode: false)
+      Wgit::Utils.sanitize(paths, encode: false)
       raise 'The provided paths cannot be empty' if paths.empty?

-      paths
+      paths.map do |path|
+        path = Wgit::Url.parse(path)
+        path.index? ? path : path.omit_slashes
+      end
     end

     # Filters links by selecting/rejecting them based on their path.
@@ -390,7 +491,7 @@ module Wgit
     def filter_links(links, filter_method, paths)
       links.send(filter_method) do |link|
         # Turn http://example.com into / meaning index.
-        link = link.to_endpoint == '/' ? '/' : link.omit_base
+        link = link.to_endpoint.index? ? '/' : link.omit_base

         match = false
         paths.each do |pattern|
@@ -402,6 +503,35 @@ module Wgit
       end
     end

+    # Returns whether or not to follow redirects, and within what context e.g.
+    # :host, :domain etc.
+    def redirect?(follow_redirects)
+      return [true, follow_redirects] if follow_redirects.is_a?(Symbol)
+
+      unless [true, false].include?(follow_redirects)
+        raise "follow_redirects: must be a Boolean or Symbol, not: \
+          #{follow_redirects}"
+      end
+
+      [follow_redirects, nil]
+    end
+
+    # Log (at debug level) the network request/response details.
+    def log_net(client, response, duration)
+      resp_template = "[#{client}] Response: %s (%s bytes in %s seconds)"
+      log_status = (response.status || 0)
+      log_total_time = (duration || 0.0).truncate(3)
+
+      # The browsers request URL is the same so ignore it.
+      if client.to_sym == :http
+        Wgit.logger.debug("[#{client}] Request: #{response.url}")
+      end
+
+      Wgit.logger.debug(
+        format(resp_template, log_status, response.size, log_total_time)
+      )
+    end
+
     alias crawl crawl_urls
     alias crawl_pages crawl_urls
     alias crawl_page crawl_url