wgit 0.7.0 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/wgit CHANGED
@@ -2,18 +2,22 @@
 
 require 'wgit'
 
-# Eval .wgit.rb file (if it exists).
-def eval_wgit
-  puts 'Searching for .wgit.rb in local and home directories...'
+# Eval .wgit.rb file (if it exists somewhere).
+def eval_wgit(filepath = nil)
+  puts 'Searching for .wgit.rb file in local and home directories...'
 
-  ['.', Dir.home].each do |dir|
+  [filepath, Dir.pwd, Dir.home].each do |dir|
     path = "#{dir}/.wgit.rb"
     next unless File.exist?(path)
 
-    puts "Eval'ing #{path} (call `eval_wgit` after changes)"
+    puts "Eval'ing #{path}"
+    puts 'Call `eval_wgit` after changes to re-eval the file'
     eval(File.read(path))
+
     break
   end
+
+  nil
 end
 
 eval_wgit
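
The executable change above means `eval_wgit` can now be pointed at an explicit directory and now returns `nil` explicitly. As a rough sketch, a `.wgit.rb` file it might pick up could look like this (the helper below is hypothetical, not part of the gem):

    # ~/.wgit.rb - loaded from the given filepath, Dir.pwd or Dir.home.
    # Anything defined here is available in the wgit REPL session.
    def crawler
      @crawler ||= Wgit::Crawler.new
    end

After editing the file mid-session, call `eval_wgit` again to re-eval it, as the new prompt output suggests.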
data/lib/wgit/assertable.rb CHANGED
@@ -6,7 +6,7 @@ module Wgit
     # Default type fail message.
     DEFAULT_TYPE_FAIL_MSG = 'Expected: %s, Actual: %s'
     # Wrong method message.
-    WRONG_METHOD_MSG = 'arr must be Enumerable, use a different method'
+    NON_ENUMERABLE_MSG = 'Expected an Enumerable responding to #each, not: %s'
     # Default duck fail message.
     DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
     # Default required keys message.
@@ -42,7 +42,7 @@ present: %s"
42
42
  # @raise [StandardError] If the assertion fails.
43
43
  # @return [Object] The given arr on successful assertion.
44
44
  def assert_arr_types(arr, type_or_types, msg = nil)
45
- raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
45
+ raise format(NON_ENUMERABLE_MSG, arr.class) unless arr.respond_to?(:each)
46
46
 
47
47
  arr.each { |obj| assert_types(obj, type_or_types, msg) }
48
48
  end
@@ -56,7 +56,7 @@ present: %s"
56
56
  # @raise [StandardError] If the assertion fails.
57
57
  # @return [Object] The given obj_or_objs on successful assertion.
58
58
  def assert_respond_to(obj_or_objs, methods, msg = nil)
59
- methods = [methods] unless methods.respond_to?(:all?)
59
+ methods = *methods
60
60
 
61
61
  if obj_or_objs.respond_to?(:each)
62
62
  obj_or_objs.each { |obj| _assert_respond_to(obj, methods, msg) }
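
The `methods = *methods` splat (and the matching change in `assert_arr_types`) normalises any input into an Array without the old `respond_to?` check. A quick illustrative sketch:

    methods = *:crawl            #=> [:crawl]
    methods = *[:crawl, :index]  #=> [:crawl, :index]
    methods = *nil               #=> []

    # The new failure message also names the offending class:
    format(Wgit::Assertable::NON_ENUMERABLE_MSG, String)
    #=> "Expected an Enumerable responding to #each, not: String"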
data/lib/wgit/base.rb ADDED
@@ -0,0 +1,30 @@
+module Wgit
+  # Class to inherit from, as an alternative form of using the `Wgit::DSL`.
+  # All subclasses must define a `#parse(doc, &block)` method.
+  class Base
+    extend Wgit::DSL
+
+    # Runs the crawl/index passing each crawled `Wgit::Document` and the given
+    # block to the subclass's `#parse` method.
+    def self.run(&block)
+      obj = new
+      unless obj.respond_to?(:parse)
+        raise "#{obj.class} must respond_to? #parse(doc, &block)"
+      end
+
+      crawl_method = @method || :crawl
+      send(crawl_method) { |doc| obj.parse(doc, &block) }
+
+      obj
+    end
+
+    # Sets the crawl/index method to call when `Base.run` is called.
+    # The mode method must match one defined in the `Wgit::Crawler` or
+    # `Wgit::Indexer` class.
+    #
+    # @param method [Symbol] The crawl/index method to call.
+    def self.mode(method)
+      @method = method
+    end
+  end
+end
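
A sketch of how the new `Wgit::Base` class might be used; the `start` call assumes the `Wgit::DSL` method of that name (since `Base` extends the DSL), and the class name and URL are made up:

    class QuoteCrawler < Wgit::Base
      start 'http://quotes.toscrape.com'
      mode  :crawl_site # optional; Base.run defaults to :crawl

      def parse(doc, &block)
        puts doc.title
      end
    end

    QuoteCrawler.run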
data/lib/wgit/core_ext.rb CHANGED
@@ -11,7 +11,7 @@ class String
   #
   # @return [Wgit::Url] The converted URL.
   def to_url
-    Wgit::Url.new(self)
+    Wgit::Url.parse(self)
  end
 end
 
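
`String#to_url` now delegates to `Wgit::Url.parse` rather than `Wgit::Url.new`, presumably keeping all String-to-Url construction on one code path. For example:

    'https://example.com/about'.to_url          #=> a Wgit::Url
    'https://example.com/about'.to_url.to_host  #=> "example.com"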
data/lib/wgit/crawler.rb CHANGED
@@ -6,23 +6,33 @@ require_relative 'utils'
 require_relative 'assertable'
 require_relative 'response'
 require 'set'
+require 'benchmark'
 require 'typhoeus'
+require 'ferrum'
 
 module Wgit
-  # The Crawler class provides a means of crawling web based HTTP Wgit::Url's,
-  # serialising their HTML into Wgit::Document instances. This is the only Wgit
-  # class which contains network logic e.g. request/response handling.
+  # The Crawler class provides a means of crawling web based HTTP `Wgit::Url`s,
+  # and serialising their HTML into `Wgit::Document` instances. This is the
+  # only Wgit class containing network logic (HTTP request/response handling).
   class Crawler
     include Assertable
 
-    # The URL file extensions (from `<a>` hrefs) which will be crawled by
-    # `#crawl_site`. The idea is to omit anything that isn't HTML and therefore
-    # doesn't keep the crawl of the site going. All URL's without a file
-    # extension will be crawled, because they're assumed to be HTML.
-    SUPPORTED_FILE_EXTENSIONS = Set.new(
+    # Set of supported file extensions for Wgit::Crawler#crawl_site.
+    @supported_file_extensions = Set.new(
       %w[asp aspx cfm cgi htm html htmlx jsp php]
     )
 
+    class << self
+      # The URL file extensions (from `<a>` hrefs) which will be crawled by
+      # `#crawl_site`. The idea is to omit anything that isn't HTML and therefore
+      # doesn't keep the crawl of the site going. All URL's without a file
+      # extension will be crawled, because they're assumed to be HTML.
+      # The `#crawl` method will crawl anything since it's given the URL(s).
+      # You can add your own site's URL file extension e.g.
+      # `Wgit::Crawler.supported_file_extensions << 'html5'` etc.
+      attr_reader :supported_file_extensions
+    end
+
     # The amount of allowed redirects before raising an error. Set to 0 to
     # disable redirects completely; or you can pass `follow_redirects: false`
     # to any Wgit::Crawler.crawl_* method.
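
Moving the extension whitelist from the `SUPPORTED_FILE_EXTENSIONS` constant to a class-level `Set` behind an `attr_reader` makes it user-extensible, as the new comment itself suggests:

    Wgit::Crawler.supported_file_extensions << 'html5'
    Wgit::Crawler.supported_file_extensions.include?('php') #=> true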
@@ -30,12 +40,21 @@ module Wgit
 
     # The maximum amount of time (in seconds) a crawl request has to complete
     # before raising an error. Set to 0 to disable time outs completely.
-    attr_accessor :time_out
+    attr_accessor :timeout
 
     # Whether or not to UTF-8 encode the response body once crawled. Set to
     # false if crawling more than just HTML e.g. images.
     attr_accessor :encode
 
+    # Whether or not to parse the Javascript of the crawled document.
+    # Parsing requires Chrome/Chromium to be installed and in $PATH.
+    attr_accessor :parse_javascript
+
+    # The delay between checks in a page's HTML size. When the page has stopped
+    # "growing", the Javascript has finished dynamically updating the DOM.
+    # The value should balance between a good UX and enough JS parse time.
+    attr_accessor :parse_javascript_delay
+
     # The Wgit::Response of the most recently crawled URL.
     attr_reader :last_response
 
@@ -43,20 +62,27 @@ module Wgit
     #
     # @param redirect_limit [Integer] The amount of allowed redirects before
     #   raising an error. Set to 0 to disable redirects completely.
-    # @param time_out [Integer, Float] The maximum amount of time (in seconds)
+    # @param timeout [Integer, Float] The maximum amount of time (in seconds)
     #   a crawl request has to complete before raising an error. Set to 0 to
     #   disable time outs completely.
     # @param encode [Boolean] Whether or not to UTF-8 encode the response body
     #   once crawled. Set to false if crawling more than just HTML e.g. images.
-    def initialize(redirect_limit: 5, time_out: 5, encode: true)
-      @redirect_limit = redirect_limit
-      @time_out       = time_out
-      @encode         = encode
+    # @param parse_javascript [Boolean] Whether or not to parse the Javascript
+    #   of the crawled document. Parsing requires Chrome/Chromium to be
+    #   installed and in $PATH.
+    def initialize(redirect_limit: 5, timeout: 5, encode: true,
+                   parse_javascript: false, parse_javascript_delay: 1)
+      @redirect_limit         = redirect_limit
+      @timeout                = timeout
+      @encode                 = encode
+      @parse_javascript       = parse_javascript
+      @parse_javascript_delay = parse_javascript_delay
     end
 
     # Crawls an entire website's HTML pages by recursively going through
-    # its internal `<a>` links. Each crawled Document is yielded to a block.
-    # Use `doc.empty?` to determine if the crawled link is valid.
+    # its internal `<a>` links; this can be overridden with `follow: xpath`.
+    # Each crawled Document is yielded to a block. Use `doc.empty?` to
+    # determine if the crawled link was successful / is valid.
     #
     # Use the allow and disallow paths params to partially and selectively
     # crawl a site; the glob syntax is fully supported e.g. `'wiki/\*'` etc.
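
Note the breaking rename of `time_out:` to `timeout:` alongside the two new Javascript keywords. A 0.10.x constructor call might look like:

    crawler = Wgit::Crawler.new(
      timeout: 10,              # was time_out: in 0.7.0
      parse_javascript: true,   # requires Chrome/Chromium in $PATH
      parse_javascript_delay: 2 # seconds between DOM size checks
    )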
@@ -74,26 +100,36 @@ module Wgit
     # @param url [Wgit::Url] The base URL of the website to be crawled.
     #   It is recommended that this URL be the index page of the site to give a
     #   greater chance of finding all pages within that site/host.
-    # @param allow_paths [String, Array<String>] Filters links by selecting
-    #   them if their path `File.fnmatch?` one of allow_paths.
-    # @param disallow_paths [String, Array<String>] Filters links by rejecting
-    #   them if their path `File.fnmatch?` one of disallow_paths.
+    # @param follow [String] The xpath extracting links to be followed during
+    #   the crawl. This changes how a site is crawled. Only links pointing to
+    #   the site domain are allowed. The `:default` is any `<a>` href returning
+    #   HTML.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    #   selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
     # @yield [doc] Given each crawled page (Wgit::Document) of the site.
     #   A block is the only way to interact with each crawled Document.
     #   Use `doc.empty?` to determine if the page is valid.
     # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
     #   from all of the site's pages or nil if the given url could not be
     #   crawled successfully.
-    def crawl_site(url, allow_paths: nil, disallow_paths: nil, &block)
+    def crawl_site(
+      url, follow: :default, allow_paths: nil, disallow_paths: nil, &block
+    )
       doc = crawl_url(url, &block)
       return nil if doc.nil?
 
-      path_opts = { allow_paths: allow_paths, disallow_paths: disallow_paths }
+      link_opts = {
+        xpath: follow,
+        allow_paths: allow_paths,
+        disallow_paths: disallow_paths
+      }
       alt_url = url.end_with?('/') ? url.chop : url + '/'
 
       crawled   = Set.new([url, alt_url])
       externals = Set.new(doc.external_links)
-      internals = Set.new(get_internal_links(doc, path_opts))
+      internals = Set.new(next_internal_links(doc, **link_opts))
 
       return externals.to_a if internals.empty?
 
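
Together, the new `follow:` xpath and the existing path globs give fine-grained control over which links keep a site crawl going. A hedged sketch (the URL and xpath are illustrative only):

    url = Wgit::Url.new('http://quotes.toscrape.com')
    crawler.crawl_site(
      url,
      follow: "//a[@class='tag']/@href", # crawl only the tag links
      allow_paths: 'tag/*'               # globs, no leading slash
    ) { |doc| puts doc.url }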
@@ -108,7 +144,7 @@ module Wgit
           crawled += [orig_link, link] # Push both links in case of redirects.
           next if doc.nil?
 
-          internals += get_internal_links(doc, path_opts)
+          internals += next_internal_links(doc, **link_opts)
           externals += doc.external_links
         end
       end
@@ -123,10 +159,11 @@ module Wgit
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     #   This value will be used for all urls crawled.
     # @yield [doc] Given each crawled page (Wgit::Document); this is the only
-    #   way to interact with them.
+    #   way to interact with them. Use `doc.empty?` to determine if the page
+    #   is valid.
     # @raise [StandardError] If no urls are provided.
     # @return [Wgit::Document] The last Document crawled.
     def crawl_urls(*urls, follow_redirects: true, &block)
@@ -135,7 +172,7 @@ module Wgit
       opts = { follow_redirects: follow_redirects }
       doc = nil
 
-      Wgit::Utils.each(urls) { |url| doc = crawl_url(url, opts, &block) }
+      Wgit::Utils.each(urls) { |url| doc = crawl_url(url, **opts, &block) }
 
       doc
     end
@@ -143,13 +180,15 @@ module Wgit
     # Crawl the url returning the response Wgit::Document or nil, if an error
     # occurs.
     #
-    # @param url [Wgit::Url] The Url to crawl; which will likely be modified.
+    # @param url [Wgit::Url] The Url to crawl; which will be modified in the
+    #   event of a redirect.
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @yield [doc] The crawled HTML page (Wgit::Document) regardless if the
     #   crawl was successful or not. Therefore, Document#url etc. can be used.
+    #   Use `doc.empty?` to determine if the page is valid.
     # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
     #   crawl was unsuccessful.
     def crawl_url(url, follow_redirects: true)
@@ -167,16 +206,19 @@
 
     protected
 
-    # Returns the url HTML String or nil. Handles any errors that arise
+    # Returns the URL's HTML String or nil. Handles any errors that arise
     # and sets the @last_response. Errors or any HTTP response that doesn't
     # return a HTML body will be ignored, returning nil.
     #
+    # If @parse_javascript is true, then the final resolved URL will be browsed
+    # to and Javascript parsed allowing for dynamic HTML generation.
+    #
     # @param url [Wgit::Url] The URL to fetch. This Url object is passed by
     #   reference and gets modified as a result of the fetch/crawl.
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @raise [StandardError] If url isn't valid and absolute.
     # @return [String, nil] The crawled HTML or nil if the crawl was
     #   unsuccessful.
@@ -185,6 +227,8 @@
       raise "Invalid url: #{url}" if url.invalid?
 
       resolve(url, response, follow_redirects: follow_redirects)
+      get_browser_response(url, response) if @parse_javascript
+
       response.body_or_nil
     rescue StandardError => e
       Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e}")
@@ -206,14 +250,14 @@
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @raise [StandardError] If a redirect isn't allowed etc.
     def resolve(url, response, follow_redirects: true)
-      orig_url_base = url.to_url.to_base # Recorded before any redirects.
+      origin = url.to_url.to_origin # Recorded before any redirects.
       follow_redirects, within = redirect?(follow_redirects)
 
       loop do
-        get_response(url, response)
+        get_http_response(url, response)
         break unless response.redirect?
 
         # Handle response 'Location' header.
@@ -225,7 +269,7 @@ module Wgit
         # Validate if the redirect is allowed.
         raise "Redirect not allowed: #{location}" unless follow_redirects
 
-        if within && !location.relative?(within => orig_url_base)
+        if within && !location.relative?(within => origin)
           raise "Redirect (outside of #{within}) is not allowed: '#{location}'"
         end
 
@@ -233,7 +277,7 @@ module Wgit
           if response.redirect_count >= @redirect_limit
 
         # Process the location to be crawled next.
-        location = url.to_base.concat(location) if location.relative?
+        location = url.to_origin.concat(location) if location.relative?
         response.redirections[url.to_s] = location.to_s
         url.replace(location) # Update the url on redirect.
       end
@@ -246,7 +290,7 @@ module Wgit
     #   reference.
     # @raise [StandardError] If a response can't be obtained.
     # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
-    def get_response(url, response)
+    def get_http_response(url, response)
       # Perform a HTTP GET request.
       orig_url = url.to_s
       url = url.normalize if url.respond_to?(:normalize)
@@ -263,10 +307,40 @@ module Wgit
       response.add_total_time(http_response.total_time)
 
       # Log the request/response details.
-      log_http(response)
+      log_net(:http, response, http_response.total_time)
 
       # Handle a failed response.
-      raise "No response (within timeout: #{@time_out} second(s))" \
+      raise "No response (within timeout: #{@timeout} second(s))" \
+        if response.failure?
+    end
+
+    # Makes a browser request and enriches the given Wgit::Response from it.
+    #
+    # @param url [String] The url to browse to. Will call url#normalize if
+    #   possible.
+    # @param response [Wgit::Response] The response to enrich. Modifies by
+    #   reference.
+    # @raise [StandardError] If a response can't be obtained.
+    # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
+    def get_browser_response(url, response)
+      url = url.normalize if url.respond_to?(:normalize)
+      browser = nil
+
+      crawl_time = Benchmark.measure { browser = browser_get(url) }.real
+      yield browser if block_given?
+
+      # Enrich the given Wgit::Response object (on top of Typhoeus response).
+      response.adapter_response = browser.network.response
+      response.status = browser.network.response.status
+      response.headers = browser.network.response.headers
+      response.body = browser.body
+      response.add_total_time(crawl_time)
+
+      # Log the request/response details.
+      log_net(:browser, response, crawl_time)
+
+      # Handle a failed response.
+      raise "No browser response (within timeout: #{@timeout} second(s))" \
         if response.failure?
     end
 
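
When `parse_javascript` is enabled, `#fetch` resolves the URL over HTTP first and then re-renders it in the browser, so `#last_response` ends up enriched from the Ferrum data. Roughly (the SPA URL is made up):

    crawler = Wgit::Crawler.new(parse_javascript: true)
    doc = crawler.crawl_url(Wgit::Url.new('https://spa.example.com'))
    crawler.last_response.status # populated from browser.network.response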
@@ -277,7 +351,7 @@ module Wgit
     def http_get(url)
       opts = {
         followlocation: false,
-        timeout: @time_out,
+        timeout: @timeout,
         accept_encoding: 'gzip',
         headers: {
           'User-Agent' => "wgit/#{Wgit::VERSION}",
@@ -286,35 +360,58 @@ module Wgit
       }
 
       # See https://rubydoc.info/gems/typhoeus for more info.
-      Typhoeus.get(url, opts)
+      Typhoeus.get(url, **opts)
+    end
+
+    # Performs a HTTP GET request in a web browser and parses the response JS
+    # before returning the HTML body of the fully rendered webpage. This allows
+    # Javascript (SPA apps etc.) to generate HTML dynamically.
+    #
+    # @param url [String] The url to browse to.
+    # @return [Ferrum::Browser] The browser response object.
+    def browser_get(url)
+      @browser ||= Ferrum::Browser.new(timeout: @timeout, process_timeout: 10)
+      @browser.goto(url)
+
+      # Wait for the page's JS to finish dynamically manipulating the DOM.
+      html = @browser.body
+      loop do
+        sleep @parse_javascript_delay
+        break if html.size == @browser.body.size
+
+        html = @browser.body
+      end
+
+      @browser
     end
 
     # Returns a doc's internal HTML page links in absolute form; used when
-    # crawling a site. Use the allow and disallow paths params to partially
-    # and selectively crawl a site; the glob syntax is supported e.g.
-    # `'wiki/\*'` etc. Note that each path should NOT start with a slash.
+    # crawling a site. By default, any `<a>` href returning HTML is returned;
+    # override this with `xpath:` if desired.
     #
-    # Override this method in a subclass to change how a site
-    # is crawled, not what is extracted from each page (Document extensions
-    # should be used for this purpose instead). Just remember that only HTML
-    # files containing `<a>` links keep the crawl going beyond the base URL.
+    # Use the allow and disallow paths params to partially and selectively
+    # crawl a site; the glob syntax is supported e.g. `'wiki/\*'` etc. Note
+    # that each path should NOT start with a slash.
     #
     # @param doc [Wgit::Document] The document from which to extract it's
     #   internal (absolute) page links.
+    # @param xpath [String] The xpath selecting links to be returned. Only
+    #   links pointing to the doc.url domain are allowed. The :default is any
+    #   <a> href returning HTML. The allow/disallow paths will be applied to
+    #   the returned value.
     # @param allow_paths [String, Array<String>] Filters links by selecting
     #   them if their path `File.fnmatch?` one of allow_paths.
     # @param disallow_paths [String, Array<String>] Filters links by rejecting
     #   them if their path `File.fnmatch?` one of disallow_paths.
     # @return [Array<Wgit::Url>] The internal page links from doc.
-    def get_internal_links(doc, allow_paths: nil, disallow_paths: nil)
-      links = doc
-              .internal_absolute_links
-              .map(&:omit_fragment) # Because fragments don't alter content.
-              .uniq
-              .select do |link|
-        ext = link.to_extension
-        ext ? SUPPORTED_FILE_EXTENSIONS.include?(ext.downcase) : true
-      end
+    def next_internal_links(
+      doc, xpath: :default, allow_paths: nil, disallow_paths: nil
+    )
+      links = if xpath && xpath != :default
+                follow_xpath(doc, xpath)
+              else
+                follow_default(doc)
+              end
 
       return links if allow_paths.nil? && disallow_paths.nil?
 
@@ -323,29 +420,40 @@ module Wgit
 
     private
 
-    # Returns whether or not to follow redirects, and within what context e.g.
-    # :host, :domain etc.
-    def redirect?(follow_redirects)
-      return [true, follow_redirects] if follow_redirects.is_a?(Symbol)
+    # Returns the next links used to continue crawling a site. The xpath value
+    # is used to obtain the links. Any valid URL Strings will be converted into
+    # absolute Wgit::Urls. Invalid URLs will be silently dropped. Any link not
+    # pointing to the site domain will raise an error.
+    def follow_xpath(doc, xpath)
+      links = doc.send(:extract_from_html, xpath, singleton: false) do |urls|
+        urls
+          .map { |url| Wgit::Url.parse?(url)&.make_absolute(doc) }
+          .compact
+      end
 
-      unless [true, false].include?(follow_redirects)
-        raise "follow_redirects: must be a Boolean or Symbol, not: \
-#{follow_redirects}"
+      if links.any? { |link| link.to_domain != doc.url.to_domain }
+        raise 'The links to follow must be within the site domain'
       end
 
-      [follow_redirects, nil]
+      links
     end
 
-    # Log (at debug level) the HTTP request/response details.
-    def log_http(response)
-      resp_template = '[http] Response: %s (%s bytes in %s seconds)'
-      log_status = (response.status || 0)
-      log_total_time = response.total_time.truncate(3)
-
-      Wgit.logger.debug("[http] Request: #{response.url}")
-      Wgit.logger.debug(
-        format(resp_template, log_status, response.size, log_total_time)
-      )
+    # Returns the default set of links used to continue crawling a site.
+    # By default, any <a> href returning HTML and pointing to the same domain
+    # will get returned.
+    def follow_default(doc)
+      doc
+        .internal_absolute_links
+        .map(&:omit_fragment) # Because fragments don't alter content.
+        .uniq
+        .select do |link| # Whitelist only HTML content.
+          ext = link.to_extension
+          if ext
+            Wgit::Crawler.supported_file_extensions.include?(ext.downcase)
+          else
+            true # URLs without an extension are assumed HTML.
+          end
+        end
     end
 
     # Validate and filter by the given URL paths.
@@ -365,14 +473,17 @@
 
     # Validate the paths are suitable for filtering.
     def validate_paths(paths)
-      paths = [paths] unless paths.is_a?(Array)
+      paths = *paths
       raise 'The provided paths must all be Strings' \
         unless paths.all? { |path| path.is_a?(String) }
 
-      Wgit::Utils.process_arr(paths, encode: false)
+      Wgit::Utils.sanitize(paths, encode: false)
       raise 'The provided paths cannot be empty' if paths.empty?
 
-      paths
+      paths.map do |path|
+        path = Wgit::Url.parse(path)
+        path.index? ? path : path.omit_slashes
+      end
     end
 
     # Filters links by selecting/rejecting them based on their path.
@@ -380,7 +491,7 @@ module Wgit
     def filter_links(links, filter_method, paths)
       links.send(filter_method) do |link|
         # Turn http://example.com into / meaning index.
-        link = link.to_endpoint == '/' ? '/' : link.omit_base
+        link = link.to_endpoint.index? ? '/' : link.omit_base
 
         match = false
         paths.each do |pattern|
@@ -392,6 +503,35 @@
       end
     end
 
+    # Returns whether or not to follow redirects, and within what context e.g.
+    # :host, :domain etc.
+    def redirect?(follow_redirects)
+      return [true, follow_redirects] if follow_redirects.is_a?(Symbol)
+
+      unless [true, false].include?(follow_redirects)
+        raise "follow_redirects: must be a Boolean or Symbol, not: \
+#{follow_redirects}"
+      end
+
+      [follow_redirects, nil]
+    end
+
+    # Log (at debug level) the network request/response details.
+    def log_net(client, response, duration)
+      resp_template = "[#{client}] Response: %s (%s bytes in %s seconds)"
+      log_status = (response.status || 0)
+      log_total_time = (duration || 0.0).truncate(3)
+
+      # The browsers request URL is the same so ignore it.
+      if client.to_sym == :http
+        Wgit.logger.debug("[#{client}] Request: #{response.url}")
+      end
+
+      Wgit.logger.debug(
+        format(resp_template, log_status, response.size, log_total_time)
+      )
+    end
+
     alias crawl crawl_urls
     alias crawl_pages crawl_urls
     alias crawl_page crawl_url
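
Taken together, the changes in this file that can break 0.7.0 call sites are the renames sketched below (old call on top, 0.10.x equivalent beneath; the `:base` to `:origin` swap is inferred from the doc-comment updates):

    Wgit::Crawler.new(time_out: 10)                   # 0.7.0
    Wgit::Crawler.new(timeout: 10)                    # 0.10.x

    crawler.crawl_url(url, follow_redirects: :base)   # 0.7.0
    crawler.crawl_url(url, follow_redirects: :origin) # 0.10.x

    Wgit::Crawler::SUPPORTED_FILE_EXTENSIONS          # 0.7.0
    Wgit::Crawler.supported_file_extensions           # 0.10.x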