wgit 0.7.0 → 0.10.1

data/bin/wgit CHANGED
@@ -2,18 +2,22 @@
 
 require 'wgit'
 
-# Eval .wgit.rb file (if it exists).
-def eval_wgit
-  puts 'Searching for .wgit.rb in local and home directories...'
+# Eval .wgit.rb file (if it exists somewhere).
+def eval_wgit(filepath = nil)
+  puts 'Searching for .wgit.rb file in local and home directories...'
 
-  ['.', Dir.home].each do |dir|
+  [filepath, Dir.pwd, Dir.home].each do |dir|
     path = "#{dir}/.wgit.rb"
     next unless File.exist?(path)
 
-    puts "Eval'ing #{path} (call `eval_wgit` after changes)"
+    puts "Eval'ing #{path}"
+    puts 'Call `eval_wgit` after changes to re-eval the file'
     eval(File.read(path))
+
     break
   end
+
+  nil
 end
 
 eval_wgit
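
For context, the executable now also checks `Dir.pwd` and an optional filepath argument for a `.wgit.rb` startup file before eval'ing it. A minimal sketch of such a file (the helper name and its contents are purely illustrative, not part of the gem):

    # ~/.wgit.rb - eval'd into the wgit console session on startup.
    # wgit is already required by the executable; define any helpers you
    # want available in the REPL.
    def quick_crawl(url)
      crawler = Wgit::Crawler.new(timeout: 10)
      crawler.crawl(url.to_url) { |doc| puts "#{doc.url} - #{doc.title}" }
    end
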
@@ -6,7 +6,7 @@ module Wgit
     # Default type fail message.
     DEFAULT_TYPE_FAIL_MSG = 'Expected: %s, Actual: %s'
     # Wrong method message.
-    WRONG_METHOD_MSG = 'arr must be Enumerable, use a different method'
+    NON_ENUMERABLE_MSG = 'Expected an Enumerable responding to #each, not: %s'
     # Default duck fail message.
     DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
     # Default required keys message.
@@ -42,7 +42,7 @@ present: %s"
     # @raise [StandardError] If the assertion fails.
     # @return [Object] The given arr on successful assertion.
     def assert_arr_types(arr, type_or_types, msg = nil)
-      raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
+      raise format(NON_ENUMERABLE_MSG, arr.class) unless arr.respond_to?(:each)
 
       arr.each { |obj| assert_types(obj, type_or_types, msg) }
     end
@@ -56,7 +56,7 @@ present: %s"
     # @raise [StandardError] If the assertion fails.
     # @return [Object] The given obj_or_objs on successful assertion.
     def assert_respond_to(obj_or_objs, methods, msg = nil)
-      methods = [methods] unless methods.respond_to?(:all?)
+      methods = *methods
 
       if obj_or_objs.respond_to?(:each)
         obj_or_objs.each { |obj| _assert_respond_to(obj, methods, msg) }
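
A quick sketch of how the renamed constant surfaces to callers (assuming these methods live in the `Wgit::Assertable` module, as elsewhere in the gem): passing a non-Enumerable to `assert_arr_types` now names the offending class in the error.

    require 'wgit'
    include Wgit::Assertable

    assert_arr_types([1, 2, 3], Integer) # => [1, 2, 3]
    assert_arr_types(42, Integer)
    # => raises "Expected an Enumerable responding to #each, not: Integer"
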
data/lib/wgit/base.rb ADDED
@@ -0,0 +1,30 @@
+module Wgit
+  # Class to inherit from, as an alternative form of using the `Wgit::DSL`.
+  # All subclasses must define a `#parse(doc, &block)` method.
+  class Base
+    extend Wgit::DSL
+
+    # Runs the crawl/index passing each crawled `Wgit::Document` and the given
+    # block to the subclass's `#parse` method.
+    def self.run(&block)
+      obj = new
+      unless obj.respond_to?(:parse)
+        raise "#{obj.class} must respond_to? #parse(doc, &block)"
+      end
+
+      crawl_method = @method || :crawl
+      send(crawl_method) { |doc| obj.parse(doc, &block) }
+
+      obj
+    end
+
+    # Sets the crawl/index method to call when `Base.run` is called.
+    # The mode method must match one defined in the `Wgit::Crawler` or
+    # `Wgit::Indexer` class.
+    #
+    # @param method [Symbol] The crawl/index method to call.
+    def self.mode(method)
+      @method = method
+    end
+  end
+end
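
A hedged usage sketch of the new `Wgit::Base` class; the DSL's `start` method and the example URL are assumptions for illustration, not part of this diff:

    require 'wgit'

    class ExampleCrawler < Wgit::Base
      mode  :crawl_site            # Which Crawler/Indexer method Base.run invokes.
      start 'https://example.com'  # DSL method (assumed) setting the URL(s) to crawl.

      # Base.run passes each crawled Wgit::Document here.
      def parse(doc, &_block)
        puts doc.title unless doc.empty?
      end
    end

    ExampleCrawler.run
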
data/lib/wgit/core_ext.rb CHANGED
@@ -11,7 +11,7 @@ class String
   #
   # @return [Wgit::Url] The converted URL.
   def to_url
-    Wgit::Url.new(self)
+    Wgit::Url.parse(self)
  end
 end
 
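Illustrative only: `String#to_url` now delegates to `Wgit::Url.parse` instead of `Wgit::Url.new`, but the result is still a `Wgit::Url`:

    require 'wgit'

    url = 'https://example.com/about'.to_url
    url.class   # => Wgit::Url
    url.to_host # => "example.com"
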
data/lib/wgit/crawler.rb CHANGED
@@ -6,23 +6,33 @@ require_relative 'utils'
 require_relative 'assertable'
 require_relative 'response'
 require 'set'
+require 'benchmark'
 require 'typhoeus'
+require 'ferrum'
 
 module Wgit
-  # The Crawler class provides a means of crawling web based HTTP Wgit::Url's,
-  # serialising their HTML into Wgit::Document instances. This is the only Wgit
-  # class which contains network logic e.g. request/response handling.
+  # The Crawler class provides a means of crawling web based HTTP `Wgit::Url`s,
+  # and serialising their HTML into `Wgit::Document` instances. This is the
+  # only Wgit class containing network logic (HTTP request/response handling).
   class Crawler
     include Assertable
 
-    # The URL file extensions (from `<a>` hrefs) which will be crawled by
-    # `#crawl_site`. The idea is to omit anything that isn't HTML and therefore
-    # doesn't keep the crawl of the site going. All URL's without a file
-    # extension will be crawled, because they're assumed to be HTML.
-    SUPPORTED_FILE_EXTENSIONS = Set.new(
+    # Set of supported file extensions for Wgit::Crawler#crawl_site.
+    @supported_file_extensions = Set.new(
      %w[asp aspx cfm cgi htm html htmlx jsp php]
    )
 
+    class << self
+      # The URL file extensions (from `<a>` hrefs) which will be crawled by
+      # `#crawl_site`. The idea is to omit anything that isn't HTML and therefore
+      # doesn't keep the crawl of the site going. All URL's without a file
+      # extension will be crawled, because they're assumed to be HTML.
+      # The `#crawl` method will crawl anything since it's given the URL(s).
+      # You can add your own site's URL file extension e.g.
+      # `Wgit::Crawler.supported_file_extensions << 'html5'` etc.
+      attr_reader :supported_file_extensions
+    end
+
     # The amount of allowed redirects before raising an error. Set to 0 to
     # disable redirects completely; or you can pass `follow_redirects: false`
     # to any Wgit::Crawler.crawl_* method.
@@ -30,12 +40,21 @@ module Wgit
 
     # The maximum amount of time (in seconds) a crawl request has to complete
     # before raising an error. Set to 0 to disable time outs completely.
-    attr_accessor :time_out
+    attr_accessor :timeout
 
     # Whether or not to UTF-8 encode the response body once crawled. Set to
     # false if crawling more than just HTML e.g. images.
     attr_accessor :encode
 
+    # Whether or not to parse the Javascript of the crawled document.
+    # Parsing requires Chrome/Chromium to be installed and in $PATH.
+    attr_accessor :parse_javascript
+
+    # The delay between checks in a page's HTML size. When the page has stopped
+    # "growing", the Javascript has finished dynamically updating the DOM.
+    # The value should balance between a good UX and enough JS parse time.
+    attr_accessor :parse_javascript_delay
+
     # The Wgit::Response of the most recently crawled URL.
     attr_reader :last_response
 
@@ -43,20 +62,27 @@ module Wgit
     #
     # @param redirect_limit [Integer] The amount of allowed redirects before
     #   raising an error. Set to 0 to disable redirects completely.
-    # @param time_out [Integer, Float] The maximum amount of time (in seconds)
+    # @param timeout [Integer, Float] The maximum amount of time (in seconds)
     #   a crawl request has to complete before raising an error. Set to 0 to
     #   disable time outs completely.
     # @param encode [Boolean] Whether or not to UTF-8 encode the response body
     #   once crawled. Set to false if crawling more than just HTML e.g. images.
-    def initialize(redirect_limit: 5, time_out: 5, encode: true)
-      @redirect_limit = redirect_limit
-      @time_out = time_out
-      @encode = encode
+    # @param parse_javascript [Boolean] Whether or not to parse the Javascript
+    #   of the crawled document. Parsing requires Chrome/Chromium to be
+    #   installed and in $PATH.
+    def initialize(redirect_limit: 5, timeout: 5, encode: true,
+                   parse_javascript: false, parse_javascript_delay: 1)
+      @redirect_limit = redirect_limit
+      @timeout = timeout
+      @encode = encode
+      @parse_javascript = parse_javascript
+      @parse_javascript_delay = parse_javascript_delay
     end
 
     # Crawls an entire website's HTML pages by recursively going through
-    # its internal `<a>` links. Each crawled Document is yielded to a block.
-    # Use `doc.empty?` to determine if the crawled link is valid.
+    # its internal `<a>` links; this can be overridden with `follow: xpath`.
+    # Each crawled Document is yielded to a block. Use `doc.empty?` to
+    # determine if the crawled link was successful / is valid.
     #
     # Use the allow and disallow paths params to partially and selectively
     # crawl a site; the glob syntax is fully supported e.g. `'wiki/\*'` etc.
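
A minimal sketch of constructing a crawler with the renamed and new keywords (note `time_out:` is now `timeout:`); the URL is illustrative:

    require 'wgit'

    crawler = Wgit::Crawler.new(
      redirect_limit: 3,        # Give up after 3 redirects.
      timeout: 10,              # Seconds before a request errors.
      encode: true,             # UTF-8 encode response bodies.
      parse_javascript: false   # Requires Chrome/Chromium in $PATH when true.
    )
    crawler.crawl_url('https://example.com'.to_url) { |doc| puts doc.title }
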
@@ -74,26 +100,36 @@ module Wgit
     # @param url [Wgit::Url] The base URL of the website to be crawled.
     #   It is recommended that this URL be the index page of the site to give a
     #   greater chance of finding all pages within that site/host.
-    # @param allow_paths [String, Array<String>] Filters links by selecting
-    #   them if their path `File.fnmatch?` one of allow_paths.
-    # @param disallow_paths [String, Array<String>] Filters links by rejecting
-    #   them if their path `File.fnmatch?` one of disallow_paths.
+    # @param follow [String] The xpath extracting links to be followed during
+    #   the crawl. This changes how a site is crawled. Only links pointing to
+    #   the site domain are allowed. The `:default` is any `<a>` href returning
+    #   HTML.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    #   selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
     # @yield [doc] Given each crawled page (Wgit::Document) of the site.
     #   A block is the only way to interact with each crawled Document.
     #   Use `doc.empty?` to determine if the page is valid.
     # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
     #   from all of the site's pages or nil if the given url could not be
     #   crawled successfully.
-    def crawl_site(url, allow_paths: nil, disallow_paths: nil, &block)
+    def crawl_site(
+      url, follow: :default, allow_paths: nil, disallow_paths: nil, &block
+    )
       doc = crawl_url(url, &block)
       return nil if doc.nil?
 
-      path_opts = { allow_paths: allow_paths, disallow_paths: disallow_paths }
+      link_opts = {
+        xpath: follow,
+        allow_paths: allow_paths,
+        disallow_paths: disallow_paths
+      }
       alt_url = url.end_with?('/') ? url.chop : url + '/'
 
       crawled = Set.new([url, alt_url])
       externals = Set.new(doc.external_links)
-      internals = Set.new(get_internal_links(doc, path_opts))
+      internals = Set.new(next_internal_links(doc, **link_opts))
 
       return externals.to_a if internals.empty?
 
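A hedged sketch of the new `follow:` xpath in action (the site URL and xpath are illustrative); `allow_paths:`/`disallow_paths:` still filter whatever `follow:` selects:

    require 'wgit'

    crawler = Wgit::Crawler.new
    externals = crawler.crawl_site(
      'https://example.com'.to_url,
      follow: "//a[@class='nav-link']/@href", # Only these links keep the crawl going.
      disallow_paths: 'admin/*'               # Glob syntax is supported.
    ) do |doc|
      puts doc.url unless doc.empty?
    end
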
@@ -108,7 +144,7 @@ module Wgit
          crawled += [orig_link, link] # Push both links in case of redirects.
          next if doc.nil?
 
-          internals += get_internal_links(doc, path_opts)
+          internals += next_internal_links(doc, **link_opts)
          externals += doc.external_links
        end
      end
@@ -123,10 +159,11 @@ module Wgit
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     #   This value will be used for all urls crawled.
     # @yield [doc] Given each crawled page (Wgit::Document); this is the only
-    #   way to interact with them.
+    #   way to interact with them. Use `doc.empty?` to determine if the page
+    #   is valid.
     # @raise [StandardError] If no urls are provided.
     # @return [Wgit::Document] The last Document crawled.
     def crawl_urls(*urls, follow_redirects: true, &block)
@@ -135,7 +172,7 @@ module Wgit
       opts = { follow_redirects: follow_redirects }
       doc = nil
 
-      Wgit::Utils.each(urls) { |url| doc = crawl_url(url, opts, &block) }
+      Wgit::Utils.each(urls) { |url| doc = crawl_url(url, **opts, &block) }
 
       doc
     end
@@ -143,13 +180,15 @@ module Wgit
     # Crawl the url returning the response Wgit::Document or nil, if an error
     # occurs.
     #
-    # @param url [Wgit::Url] The Url to crawl; which will likely be modified.
+    # @param url [Wgit::Url] The Url to crawl; which will be modified in the
+    #   event of a redirect.
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @yield [doc] The crawled HTML page (Wgit::Document) regardless if the
     #   crawl was successful or not. Therefore, Document#url etc. can be used.
+    #   Use `doc.empty?` to determine if the page is valid.
     # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
     #   crawl was unsuccessful.
     def crawl_url(url, follow_redirects: true)
@@ -167,16 +206,19 @@ module Wgit
 
     protected
 
-    # Returns the url HTML String or nil. Handles any errors that arise
+    # Returns the URL's HTML String or nil. Handles any errors that arise
     # and sets the @last_response. Errors or any HTTP response that doesn't
     # return a HTML body will be ignored, returning nil.
     #
+    # If @parse_javascript is true, then the final resolved URL will be browsed
+    # to and Javascript parsed allowing for dynamic HTML generation.
+    #
     # @param url [Wgit::Url] The URL to fetch. This Url object is passed by
     #   reference and gets modified as a result of the fetch/crawl.
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @raise [StandardError] If url isn't valid and absolute.
     # @return [String, nil] The crawled HTML or nil if the crawl was
     #   unsuccessful.
@@ -185,6 +227,8 @@ module Wgit
       raise "Invalid url: #{url}" if url.invalid?
 
       resolve(url, response, follow_redirects: follow_redirects)
+      get_browser_response(url, response) if @parse_javascript
+
       response.body_or_nil
     rescue StandardError => e
       Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e}")
@@ -206,14 +250,14 @@ module Wgit
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @raise [StandardError] If a redirect isn't allowed etc.
     def resolve(url, response, follow_redirects: true)
-      orig_url_base = url.to_url.to_base # Recorded before any redirects.
+      origin = url.to_url.to_origin # Recorded before any redirects.
       follow_redirects, within = redirect?(follow_redirects)
 
       loop do
-        get_response(url, response)
+        get_http_response(url, response)
         break unless response.redirect?
 
         # Handle response 'Location' header.
@@ -225,7 +269,7 @@ module Wgit
         # Validate if the redirect is allowed.
         raise "Redirect not allowed: #{location}" unless follow_redirects
 
-        if within && !location.relative?(within => orig_url_base)
+        if within && !location.relative?(within => origin)
          raise "Redirect (outside of #{within}) is not allowed: '#{location}'"
        end
 
@@ -233,7 +277,7 @@ module Wgit
          if response.redirect_count >= @redirect_limit
 
         # Process the location to be crawled next.
-        location = url.to_base.concat(location) if location.relative?
+        location = url.to_origin.concat(location) if location.relative?
         response.redirections[url.to_s] = location.to_s
         url.replace(location) # Update the url on redirect.
       end
@@ -246,7 +290,7 @@ module Wgit
     #   reference.
     # @raise [StandardError] If a response can't be obtained.
     # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
-    def get_response(url, response)
+    def get_http_response(url, response)
       # Perform a HTTP GET request.
       orig_url = url.to_s
       url = url.normalize if url.respond_to?(:normalize)
@@ -263,10 +307,40 @@ module Wgit
       response.add_total_time(http_response.total_time)
 
       # Log the request/response details.
-      log_http(response)
+      log_net(:http, response, http_response.total_time)
 
       # Handle a failed response.
-      raise "No response (within timeout: #{@time_out} second(s))" \
+      raise "No response (within timeout: #{@timeout} second(s))" \
+      if response.failure?
+    end
+
+    # Makes a browser request and enriches the given Wgit::Response from it.
+    #
+    # @param url [String] The url to browse to. Will call url#normalize if
+    #   possible.
+    # @param response [Wgit::Response] The response to enrich. Modifies by
+    #   reference.
+    # @raise [StandardError] If a response can't be obtained.
+    # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
+    def get_browser_response(url, response)
+      url = url.normalize if url.respond_to?(:normalize)
+      browser = nil
+
+      crawl_time = Benchmark.measure { browser = browser_get(url) }.real
+      yield browser if block_given?
+
+      # Enrich the given Wgit::Response object (on top of Typhoeus response).
+      response.adapter_response = browser.network.response
+      response.status = browser.network.response.status
+      response.headers = browser.network.response.headers
+      response.body = browser.body
+      response.add_total_time(crawl_time)
+
+      # Log the request/response details.
+      log_net(:browser, response, crawl_time)
+
+      # Handle a failed response.
+      raise "No browser response (within timeout: #{@timeout} second(s))" \
       if response.failure?
     end
 
@@ -277,7 +351,7 @@ module Wgit
     def http_get(url)
       opts = {
         followlocation: false,
-        timeout: @time_out,
+        timeout: @timeout,
         accept_encoding: 'gzip',
         headers: {
           'User-Agent' => "wgit/#{Wgit::VERSION}",
@@ -286,35 +360,58 @@ module Wgit
       }
 
       # See https://rubydoc.info/gems/typhoeus for more info.
-      Typhoeus.get(url, opts)
+      Typhoeus.get(url, **opts)
+    end
+
+    # Performs a HTTP GET request in a web browser and parses the response JS
+    # before returning the HTML body of the fully rendered webpage. This allows
+    # Javascript (SPA apps etc.) to generate HTML dynamically.
+    #
+    # @param url [String] The url to browse to.
+    # @return [Ferrum::Browser] The browser response object.
+    def browser_get(url)
+      @browser ||= Ferrum::Browser.new(timeout: @timeout, process_timeout: 10)
+      @browser.goto(url)
+
+      # Wait for the page's JS to finish dynamically manipulating the DOM.
+      html = @browser.body
+      loop do
+        sleep @parse_javascript_delay
+        break if html.size == @browser.body.size
+
+        html = @browser.body
+      end
+
+      @browser
     end
 
     # Returns a doc's internal HTML page links in absolute form; used when
-    # crawling a site. Use the allow and disallow paths params to partially
-    # and selectively crawl a site; the glob syntax is supported e.g.
-    # `'wiki/\*'` etc. Note that each path should NOT start with a slash.
+    # crawling a site. By default, any `<a>` href returning HTML is returned;
+    # override this with `xpath:` if desired.
     #
-    # Override this method in a subclass to change how a site
-    # is crawled, not what is extracted from each page (Document extensions
-    # should be used for this purpose instead). Just remember that only HTML
-    # files containing `<a>` links keep the crawl going beyond the base URL.
+    # Use the allow and disallow paths params to partially and selectively
+    # crawl a site; the glob syntax is supported e.g. `'wiki/\*'` etc. Note
+    # that each path should NOT start with a slash.
     #
     # @param doc [Wgit::Document] The document from which to extract it's
     #   internal (absolute) page links.
+    # @param xpath [String] The xpath selecting links to be returned. Only
+    #   links pointing to the doc.url domain are allowed. The :default is any
+    #   <a> href returning HTML. The allow/disallow paths will be applied to
+    #   the returned value.
     # @param allow_paths [String, Array<String>] Filters links by selecting
     #   them if their path `File.fnmatch?` one of allow_paths.
     # @param disallow_paths [String, Array<String>] Filters links by rejecting
     #   them if their path `File.fnmatch?` one of disallow_paths.
     # @return [Array<Wgit::Url>] The internal page links from doc.
-    def get_internal_links(doc, allow_paths: nil, disallow_paths: nil)
-      links = doc
-              .internal_absolute_links
-              .map(&:omit_fragment) # Because fragments don't alter content.
-              .uniq
-              .select do |link|
-                ext = link.to_extension
-                ext ? SUPPORTED_FILE_EXTENSIONS.include?(ext.downcase) : true
-              end
+    def next_internal_links(
+      doc, xpath: :default, allow_paths: nil, disallow_paths: nil
+    )
+      links = if xpath && xpath != :default
+                follow_xpath(doc, xpath)
+              else
+                follow_default(doc)
+              end
 
       return links if allow_paths.nil? && disallow_paths.nil?
 
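A sketch of tuning the JS parsing added above (URL illustrative): a larger `parse_javascript_delay` gives slow pages more time to finish mutating the DOM, at the cost of a slower crawl:

    require 'wgit'

    crawler = Wgit::Crawler.new(parse_javascript: true)
    crawler.parse_javascript_delay = 2 # Seconds between DOM-size checks.
    crawler.crawl('https://example.com'.to_url) { |doc| puts doc.title }
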
@@ -323,29 +420,40 @@ module Wgit
 
     private
 
-    # Returns whether or not to follow redirects, and within what context e.g.
-    # :host, :domain etc.
-    def redirect?(follow_redirects)
-      return [true, follow_redirects] if follow_redirects.is_a?(Symbol)
+    # Returns the next links used to continue crawling a site. The xpath value
+    # is used to obtain the links. Any valid URL Strings will be converted into
+    # absolute Wgit::Urls. Invalid URLs will be silently dropped. Any link not
+    # pointing to the site domain will raise an error.
+    def follow_xpath(doc, xpath)
+      links = doc.send(:extract_from_html, xpath, singleton: false) do |urls|
+        urls
+          .map { |url| Wgit::Url.parse?(url)&.make_absolute(doc) }
+          .compact
+      end
 
-      unless [true, false].include?(follow_redirects)
-        raise "follow_redirects: must be a Boolean or Symbol, not: \
-          #{follow_redirects}"
+      if links.any? { |link| link.to_domain != doc.url.to_domain }
+        raise 'The links to follow must be within the site domain'
       end
 
-      [follow_redirects, nil]
+      links
     end
 
-    # Log (at debug level) the HTTP request/response details.
-    def log_http(response)
-      resp_template = '[http] Response: %s (%s bytes in %s seconds)'
-      log_status = (response.status || 0)
-      log_total_time = response.total_time.truncate(3)
-
-      Wgit.logger.debug("[http] Request: #{response.url}")
-      Wgit.logger.debug(
-        format(resp_template, log_status, response.size, log_total_time)
-      )
+    # Returns the default set of links used to continue crawling a site.
+    # By default, any <a> href returning HTML and pointing to the same domain
+    # will get returned.
+    def follow_default(doc)
+      doc
+        .internal_absolute_links
+        .map(&:omit_fragment) # Because fragments don't alter content.
+        .uniq
+        .select do |link| # Whitelist only HTML content.
+          ext = link.to_extension
+          if ext
+            Wgit::Crawler.supported_file_extensions.include?(ext.downcase)
+          else
+            true # URLs without an extension are assumed HTML.
+          end
+        end
     end
 
     # Validate and filter by the given URL paths.
@@ -365,14 +473,17 @@
 
     # Validate the paths are suitable for filtering.
     def validate_paths(paths)
-      paths = [paths] unless paths.is_a?(Array)
+      paths = *paths
       raise 'The provided paths must all be Strings' \
        unless paths.all? { |path| path.is_a?(String) }
 
-      Wgit::Utils.process_arr(paths, encode: false)
+      Wgit::Utils.sanitize(paths, encode: false)
       raise 'The provided paths cannot be empty' if paths.empty?
 
-      paths
+      paths.map do |path|
+        path = Wgit::Url.parse(path)
+        path.index? ? path : path.omit_slashes
+      end
     end
 
     # Filters links by selecting/rejecting them based on their path.
@@ -380,7 +491,7 @@
     def filter_links(links, filter_method, paths)
       links.send(filter_method) do |link|
         # Turn http://example.com into / meaning index.
-        link = link.to_endpoint == '/' ? '/' : link.omit_base
+        link = link.to_endpoint.index? ? '/' : link.omit_base
 
         match = false
         paths.each do |pattern|
@@ -392,6 +503,35 @@
       end
     end
 
+    # Returns whether or not to follow redirects, and within what context e.g.
+    # :host, :domain etc.
+    def redirect?(follow_redirects)
+      return [true, follow_redirects] if follow_redirects.is_a?(Symbol)
+
+      unless [true, false].include?(follow_redirects)
+        raise "follow_redirects: must be a Boolean or Symbol, not: \
+          #{follow_redirects}"
+      end
+
+      [follow_redirects, nil]
+    end
+
+    # Log (at debug level) the network request/response details.
+    def log_net(client, response, duration)
+      resp_template = "[#{client}] Response: %s (%s bytes in %s seconds)"
+      log_status = (response.status || 0)
+      log_total_time = (duration || 0.0).truncate(3)
+
+      # The browsers request URL is the same so ignore it.
+      if client.to_sym == :http
+        Wgit.logger.debug("[#{client}] Request: #{response.url}")
+      end
+
+      Wgit.logger.debug(
+        format(resp_template, log_status, response.size, log_total_time)
+      )
+    end
+
     alias crawl crawl_urls
     alias crawl_pages crawl_urls
     alias crawl_page crawl_url