wgit 0.7.0 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/CHANGELOG.md +74 -2
- data/LICENSE.txt +1 -1
- data/README.md +114 -290
- data/bin/wgit +9 -5
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +30 -0
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +219 -79
- data/lib/wgit/database/database.rb +309 -134
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +226 -143
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +21 -11
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +65 -162
- data/lib/wgit/response.rb +11 -8
- data/lib/wgit/url.rb +192 -61
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +2 -1
- data/lib/wgit.rb +3 -1
- metadata +34 -19
data/bin/wgit
CHANGED
@@ -2,18 +2,22 @@
 
 require 'wgit'
 
-# Eval .wgit.rb file (if it exists).
-def eval_wgit
-  puts 'Searching for .wgit.rb in local and home directories...'
+# Eval .wgit.rb file (if it exists somewhere).
+def eval_wgit(filepath = nil)
+  puts 'Searching for .wgit.rb file in local and home directories...'
 
-  [
+  [filepath, Dir.pwd, Dir.home].each do |dir|
    path = "#{dir}/.wgit.rb"
    next unless File.exist?(path)
 
-    puts "Eval'ing #{path}
+    puts "Eval'ing #{path}"
+    puts 'Call `eval_wgit` after changes to re-eval the file'
    eval(File.read(path))
+
    break
  end
+
+  nil
 end
 
 eval_wgit
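For context, `.wgit.rb` is eval'd into the interactive `wgit` console on start-up (and can now also be loaded from an explicit filepath). A minimal sketch of such a file; the `quick_crawl` helper below is purely illustrative and not part of the gem:

```ruby
# ~/.wgit.rb -- eval'd by the `wgit` executable on start-up and re-eval'd
# via `eval_wgit`. The helper method is a hypothetical example.
def quick_crawl(url)
  Wgit::Crawler.new.crawl_url(Wgit::Url.new(url)) do |doc|
    puts doc.empty? ? "Failed to crawl #{url}" : doc.title
  end
end
```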
data/lib/wgit/assertable.rb
CHANGED
@@ -6,7 +6,7 @@ module Wgit
     # Default type fail message.
     DEFAULT_TYPE_FAIL_MSG = 'Expected: %s, Actual: %s'
     # Wrong method message.
-
+    NON_ENUMERABLE_MSG = 'Expected an Enumerable responding to #each, not: %s'
     # Default duck fail message.
     DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
     # Default required keys message.
@@ -42,7 +42,7 @@ present: %s"
     # @raise [StandardError] If the assertion fails.
     # @return [Object] The given arr on successful assertion.
     def assert_arr_types(arr, type_or_types, msg = nil)
-      raise
+      raise format(NON_ENUMERABLE_MSG, arr.class) unless arr.respond_to?(:each)
 
       arr.each { |obj| assert_types(obj, type_or_types, msg) }
     end
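A quick sketch of the new guard in action (an assumed IRB session; the values are illustrative):

```ruby
require 'wgit'

include Wgit::Assertable

assert_arr_types([1, 2, 3], Integer) # => [1, 2, 3]
assert_arr_types(123, Integer)
# => raises "Expected an Enumerable responding to #each, not: Integer"
```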
@@ -56,7 +56,7 @@ present: %s"
     # @raise [StandardError] If the assertion fails.
     # @return [Object] The given obj_or_objs on successful assertion.
     def assert_respond_to(obj_or_objs, methods, msg = nil)
-      methods =
+      methods = *methods
 
       if obj_or_objs.respond_to?(:each)
         obj_or_objs.each { |obj| _assert_respond_to(obj, methods, msg) }
data/lib/wgit/base.rb
ADDED
@@ -0,0 +1,30 @@
+module Wgit
+  # Class to inherit from, as an alternative form of using the `Wgit::DSL`.
+  # All subclasses must define a `#parse(doc, &block)` method.
+  class Base
+    extend Wgit::DSL
+
+    # Runs the crawl/index passing each crawled `Wgit::Document` and the given
+    # block to the subclass's `#parse` method.
+    def self.run(&block)
+      obj = new
+      unless obj.respond_to?(:parse)
+        raise "#{obj.class} must respond_to? #parse(doc, &block)"
+      end
+
+      crawl_method = @method || :crawl
+      send(crawl_method) { |doc| obj.parse(doc, &block) }
+
+      obj
+    end
+
+    # Sets the crawl/index method to call when `Base.run` is called.
+    # The mode method must match one defined in the `Wgit::Crawler` or
+    # `Wgit::Indexer` class.
+    #
+    # @param method [Symbol] The crawl/index method to call.
+    def self.mode(method)
+      @method = method
+    end
+  end
+end
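A hedged sketch of how the new `Wgit::Base` might be subclassed (the `start` call assumes the DSL method of that name from the added `dsl.rb`; the class name and URL are illustrative):

```ruby
require 'wgit'

class ExampleCrawler < Wgit::Base
  mode  :crawl_site            # Must match a Wgit::Crawler/Indexer method.
  start 'https://example.com'  # Assumes the DSL's `start` method.

  # Called by Base.run once per crawled Wgit::Document.
  def parse(doc, &_block)
    puts doc.title unless doc.empty?
  end
end

ExampleCrawler.run
```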
data/lib/wgit/core_ext.rb
CHANGED
data/lib/wgit/crawler.rb
CHANGED
@@ -6,23 +6,33 @@ require_relative 'utils'
 require_relative 'assertable'
 require_relative 'response'
 require 'set'
+require 'benchmark'
 require 'typhoeus'
+require 'ferrum'
 
 module Wgit
-  # The Crawler class provides a means of crawling web based HTTP Wgit::Url
-  # serialising their HTML into Wgit::Document instances. This is the
-  # class
+  # The Crawler class provides a means of crawling web based HTTP `Wgit::Url`s,
+  # and serialising their HTML into `Wgit::Document` instances. This is the
+  # only Wgit class containing network logic (HTTP request/response handling).
   class Crawler
     include Assertable
 
-    #
-
-    # doesn't keep the crawl of the site going. All URL's without a file
-    # extension will be crawled, because they're assumed to be HTML.
-    SUPPORTED_FILE_EXTENSIONS = Set.new(
+    # Set of supported file extensions for Wgit::Crawler#crawl_site.
+    @supported_file_extensions = Set.new(
       %w[asp aspx cfm cgi htm html htmlx jsp php]
     )
 
+    class << self
+      # The URL file extensions (from `<a>` hrefs) which will be crawled by
+      # `#crawl_site`. The idea is to omit anything that isn't HTML and therefore
+      # doesn't keep the crawl of the site going. All URL's without a file
+      # extension will be crawled, because they're assumed to be HTML.
+      # The `#crawl` method will crawl anything since it's given the URL(s).
+      # You can add your own site's URL file extension e.g.
+      # `Wgit::Crawler.supported_file_extensions << 'html5'` etc.
+      attr_reader :supported_file_extensions
+    end
+
     # The amount of allowed redirects before raising an error. Set to 0 to
     # disable redirects completely; or you can pass `follow_redirects: false`
     # to any Wgit::Crawler.crawl_* method.
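Per the comment above, extending the crawlable extensions at runtime might look like this (the `'html5'` value is the diff's own example):

```ruby
require 'wgit'

# Allow #crawl_site to follow links ending in a custom file extension.
Wgit::Crawler.supported_file_extensions << 'html5'
```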
@@ -30,12 +40,21 @@ module Wgit
 
     # The maximum amount of time (in seconds) a crawl request has to complete
     # before raising an error. Set to 0 to disable time outs completely.
-    attr_accessor :
+    attr_accessor :timeout
 
     # Whether or not to UTF-8 encode the response body once crawled. Set to
     # false if crawling more than just HTML e.g. images.
     attr_accessor :encode
 
+    # Whether or not to parse the Javascript of the crawled document.
+    # Parsing requires Chrome/Chromium to be installed and in $PATH.
+    attr_accessor :parse_javascript
+
+    # The delay between checks in a page's HTML size. When the page has stopped
+    # "growing", the Javascript has finished dynamically updating the DOM.
+    # The value should balance between a good UX and enough JS parse time.
+    attr_accessor :parse_javascript_delay
+
     # The Wgit::Response of the most recently crawled URL.
     attr_reader :last_response
 
@@ -43,20 +62,27 @@ module Wgit
     #
     # @param redirect_limit [Integer] The amount of allowed redirects before
     #   raising an error. Set to 0 to disable redirects completely.
-    # @param
+    # @param timeout [Integer, Float] The maximum amount of time (in seconds)
     #   a crawl request has to complete before raising an error. Set to 0 to
     #   disable time outs completely.
     # @param encode [Boolean] Whether or not to UTF-8 encode the response body
     #   once crawled. Set to false if crawling more than just HTML e.g. images.
-
-
-
-
+    # @param parse_javascript [Boolean] Whether or not to parse the Javascript
+    #   of the crawled document. Parsing requires Chrome/Chromium to be
+    #   installed and in $PATH.
+    def initialize(redirect_limit: 5, timeout: 5, encode: true,
+                   parse_javascript: false, parse_javascript_delay: 1)
+      @redirect_limit = redirect_limit
+      @timeout = timeout
+      @encode = encode
+      @parse_javascript = parse_javascript
+      @parse_javascript_delay = parse_javascript_delay
     end
 
     # Crawls an entire website's HTML pages by recursively going through
-    # its internal `<a>` links
-    #
+    # its internal `<a>` links; this can be overridden with `follow: xpath`.
+    # Each crawled Document is yielded to a block. Use `doc.empty?` to
+    # determine if the crawled link was successful / is valid.
     #
     # Use the allow and disallow paths params to partially and selectively
     # crawl a site; the glob syntax is fully supported e.g. `'wiki/\*'` etc.
@@ -74,26 +100,36 @@ module Wgit
     # @param url [Wgit::Url] The base URL of the website to be crawled.
     #   It is recommended that this URL be the index page of the site to give a
     #   greater chance of finding all pages within that site/host.
-    # @param
-    #
-    #
-    #
+    # @param follow [String] The xpath extracting links to be followed during
+    #   the crawl. This changes how a site is crawled. Only links pointing to
+    #   the site domain are allowed. The `:default` is any `<a>` href returning
+    #   HTML.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    #   selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
     # @yield [doc] Given each crawled page (Wgit::Document) of the site.
     #   A block is the only way to interact with each crawled Document.
     #   Use `doc.empty?` to determine if the page is valid.
     # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
     #   from all of the site's pages or nil if the given url could not be
     #   crawled successfully.
-    def crawl_site(
+    def crawl_site(
+      url, follow: :default, allow_paths: nil, disallow_paths: nil, &block
+    )
       doc = crawl_url(url, &block)
       return nil if doc.nil?
 
-
+      link_opts = {
+        xpath: follow,
+        allow_paths: allow_paths,
+        disallow_paths: disallow_paths
+      }
       alt_url = url.end_with?('/') ? url.chop : url + '/'
 
       crawled = Set.new([url, alt_url])
       externals = Set.new(doc.external_links)
-      internals = Set.new(
+      internals = Set.new(next_internal_links(doc, **link_opts))
 
       return externals.to_a if internals.empty?
 
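A hedged usage sketch of `#crawl_site` with the new `follow:` xpath and path filters (URL, xpath and glob are illustrative):

```ruby
require 'wgit'

crawler = Wgit::Crawler.new
url = Wgit::Url.new('https://example.com')

# Follow only nav links, and of those only paths matching 'blog/*'.
externals = crawler.crawl_site(
  url, follow: '//nav//a/@href', allow_paths: 'blog/*'
) do |doc|
  puts doc.url unless doc.empty?
end

puts "#{externals&.size} external links collected"
```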
@@ -108,7 +144,7 @@ module Wgit
           crawled += [orig_link, link] # Push both links in case of redirects.
           next if doc.nil?
 
-          internals +=
+          internals += next_internal_links(doc, **link_opts)
           externals += doc.external_links
         end
       end
@@ -123,10 +159,11 @@ module Wgit
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     #   This value will be used for all urls crawled.
     # @yield [doc] Given each crawled page (Wgit::Document); this is the only
-    #   way to interact with them.
+    #   way to interact with them. Use `doc.empty?` to determine if the page
+    #   is valid.
     # @raise [StandardError] If no urls are provided.
     # @return [Wgit::Document] The last Document crawled.
     def crawl_urls(*urls, follow_redirects: true, &block)
@@ -135,7 +172,7 @@ module Wgit
       opts = { follow_redirects: follow_redirects }
       doc = nil
 
-      Wgit::Utils.each(urls) { |url| doc = crawl_url(url, opts, &block) }
+      Wgit::Utils.each(urls) { |url| doc = crawl_url(url, **opts, &block) }
 
       doc
     end
@@ -143,13 +180,15 @@ module Wgit
     # Crawl the url returning the response Wgit::Document or nil, if an error
     # occurs.
     #
-    # @param url [Wgit::Url] The Url to crawl; which will
+    # @param url [Wgit::Url] The Url to crawl; which will be modified in the
+    #   event of a redirect.
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @yield [doc] The crawled HTML page (Wgit::Document) regardless if the
     #   crawl was successful or not. Therefore, Document#url etc. can be used.
+    #   Use `doc.empty?` to determine if the page is valid.
     # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
     #   crawl was unsuccessful.
     def crawl_url(url, follow_redirects: true)
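For instance, limiting redirects to the original host (a sketch; the URL is illustrative):

```ruby
require 'wgit'

crawler = Wgit::Crawler.new
url = Wgit::Url.new('https://example.com')

# Redirects outside the original host aren't followed; the yielded doc will
# be empty in that case (check doc.empty?).
crawler.crawl_url(url, follow_redirects: :host) do |doc|
  puts doc.url # Updated in place if a redirect was followed.
end
```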
@@ -167,16 +206,19 @@ module Wgit
 
     protected
 
-    # Returns the
+    # Returns the URL's HTML String or nil. Handles any errors that arise
     # and sets the @last_response. Errors or any HTTP response that doesn't
     # return a HTML body will be ignored, returning nil.
     #
+    # If @parse_javascript is true, then the final resolved URL will be browsed
+    # to and Javascript parsed allowing for dynamic HTML generation.
+    #
     # @param url [Wgit::Url] The URL to fetch. This Url object is passed by
     #   reference and gets modified as a result of the fetch/crawl.
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @raise [StandardError] If url isn't valid and absolute.
     # @return [String, nil] The crawled HTML or nil if the crawl was
     #   unsuccessful.
@@ -185,6 +227,8 @@ module Wgit
       raise "Invalid url: #{url}" if url.invalid?
 
       resolve(url, response, follow_redirects: follow_redirects)
+      get_browser_response(url, response) if @parse_javascript
+
       response.body_or_nil
     rescue StandardError => e
       Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e}")
@@ -206,14 +250,14 @@ module Wgit
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @raise [StandardError] If a redirect isn't allowed etc.
     def resolve(url, response, follow_redirects: true)
-
+      origin = url.to_url.to_origin # Recorded before any redirects.
       follow_redirects, within = redirect?(follow_redirects)
 
       loop do
-
+        get_http_response(url, response)
         break unless response.redirect?
 
         # Handle response 'Location' header.
@@ -225,7 +269,7 @@ module Wgit
         # Validate if the redirect is allowed.
         raise "Redirect not allowed: #{location}" unless follow_redirects
 
-        if within && !location.relative?(within =>
+        if within && !location.relative?(within => origin)
           raise "Redirect (outside of #{within}) is not allowed: '#{location}'"
         end
 
@@ -233,7 +277,7 @@ module Wgit
           if response.redirect_count >= @redirect_limit
 
         # Process the location to be crawled next.
-        location = url.
+        location = url.to_origin.concat(location) if location.relative?
         response.redirections[url.to_s] = location.to_s
         url.replace(location) # Update the url on redirect.
       end
@@ -246,7 +290,7 @@ module Wgit
     #   reference.
     # @raise [StandardError] If a response can't be obtained.
     # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
-    def
+    def get_http_response(url, response)
       # Perform a HTTP GET request.
       orig_url = url.to_s
       url = url.normalize if url.respond_to?(:normalize)
@@ -263,10 +307,40 @@ module Wgit
       response.add_total_time(http_response.total_time)
 
       # Log the request/response details.
-
+      log_net(:http, response, http_response.total_time)
 
       # Handle a failed response.
-      raise "No response (within timeout: #{@
+      raise "No response (within timeout: #{@timeout} second(s))" \
+        if response.failure?
+    end
+
+    # Makes a browser request and enriches the given Wgit::Response from it.
+    #
+    # @param url [String] The url to browse to. Will call url#normalize if
+    #   possible.
+    # @param response [Wgit::Response] The response to enrich. Modifies by
+    #   reference.
+    # @raise [StandardError] If a response can't be obtained.
+    # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
+    def get_browser_response(url, response)
+      url = url.normalize if url.respond_to?(:normalize)
+      browser = nil
+
+      crawl_time = Benchmark.measure { browser = browser_get(url) }.real
+      yield browser if block_given?
+
+      # Enrich the given Wgit::Response object (on top of Typhoeus response).
+      response.adapter_response = browser.network.response
+      response.status = browser.network.response.status
+      response.headers = browser.network.response.headers
+      response.body = browser.body
+      response.add_total_time(crawl_time)
+
+      # Log the request/response details.
+      log_net(:browser, response, crawl_time)
+
+      # Handle a failed response.
+      raise "No browser response (within timeout: #{@timeout} second(s))" \
         if response.failure?
     end
 
@@ -277,7 +351,7 @@ module Wgit
     def http_get(url)
       opts = {
         followlocation: false,
-        timeout: @
+        timeout: @timeout,
         accept_encoding: 'gzip',
         headers: {
           'User-Agent' => "wgit/#{Wgit::VERSION}",
@@ -286,35 +360,58 @@ module Wgit
       }
 
       # See https://rubydoc.info/gems/typhoeus for more info.
-      Typhoeus.get(url, opts)
+      Typhoeus.get(url, **opts)
+    end
+
+    # Performs a HTTP GET request in a web browser and parses the response JS
+    # before returning the HTML body of the fully rendered webpage. This allows
+    # Javascript (SPA apps etc.) to generate HTML dynamically.
+    #
+    # @param url [String] The url to browse to.
+    # @return [Ferrum::Browser] The browser response object.
+    def browser_get(url)
+      @browser ||= Ferrum::Browser.new(timeout: @timeout, process_timeout: 10)
+      @browser.goto(url)
+
+      # Wait for the page's JS to finish dynamically manipulating the DOM.
+      html = @browser.body
+      loop do
+        sleep @parse_javascript_delay
+        break if html.size == @browser.body.size
+
+        html = @browser.body
+      end
+
+      @browser
     end
 
     # Returns a doc's internal HTML page links in absolute form; used when
-    # crawling a site.
-    #
-    # `'wiki/\*'` etc. Note that each path should NOT start with a slash.
+    # crawling a site. By default, any `<a>` href returning HTML is returned;
+    # override this with `xpath:` if desired.
     #
-    #
-    #
-    #
-    # files containing `<a>` links keep the crawl going beyond the base URL.
+    # Use the allow and disallow paths params to partially and selectively
+    # crawl a site; the glob syntax is supported e.g. `'wiki/\*'` etc. Note
+    # that each path should NOT start with a slash.
     #
     # @param doc [Wgit::Document] The document from which to extract it's
     #   internal (absolute) page links.
+    # @param xpath [String] The xpath selecting links to be returned. Only
+    #   links pointing to the doc.url domain are allowed. The :default is any
+    #   <a> href returning HTML. The allow/disallow paths will be applied to
+    #   the returned value.
     # @param allow_paths [String, Array<String>] Filters links by selecting
     #   them if their path `File.fnmatch?` one of allow_paths.
     # @param disallow_paths [String, Array<String>] Filters links by rejecting
     #   them if their path `File.fnmatch?` one of disallow_paths.
     # @return [Array<Wgit::Url>] The internal page links from doc.
-    def
-
-
-
-
-
-
-
-    end
+    def next_internal_links(
+      doc, xpath: :default, allow_paths: nil, disallow_paths: nil
+    )
+      links = if xpath && xpath != :default
+                follow_xpath(doc, xpath)
+              else
+                follow_default(doc)
+              end
 
       return links if allow_paths.nil? && disallow_paths.nil?
 
@@ -323,29 +420,40 @@ module Wgit
 
     private
 
-    # Returns
-    #
-
-
+    # Returns the next links used to continue crawling a site. The xpath value
+    # is used to obtain the links. Any valid URL Strings will be converted into
+    # absolute Wgit::Urls. Invalid URLs will be silently dropped. Any link not
+    # pointing to the site domain will raise an error.
+    def follow_xpath(doc, xpath)
+      links = doc.send(:extract_from_html, xpath, singleton: false) do |urls|
+        urls
+          .map { |url| Wgit::Url.parse?(url)&.make_absolute(doc) }
+          .compact
+      end
 
-
-      raise
-      #{follow_redirects}"
+      if links.any? { |link| link.to_domain != doc.url.to_domain }
+        raise 'The links to follow must be within the site domain'
       end
 
-
+      links
     end
 
-    #
-
-
-
-
-
-
-
-
-
+    # Returns the default set of links used to continue crawling a site.
+    # By default, any <a> href returning HTML and pointing to the same domain
+    # will get returned.
+    def follow_default(doc)
+      doc
+        .internal_absolute_links
+        .map(&:omit_fragment) # Because fragments don't alter content.
+        .uniq
+        .select do |link| # Whitelist only HTML content.
+          ext = link.to_extension
+          if ext
+            Wgit::Crawler.supported_file_extensions.include?(ext.downcase)
+          else
+            true # URLs without an extension are assumed HTML.
+          end
+        end
     end
 
     # Validate and filter by the given URL paths.
@@ -365,14 +473,17 @@ module Wgit
 
     # Validate the paths are suitable for filtering.
     def validate_paths(paths)
-      paths =
+      paths = *paths
       raise 'The provided paths must all be Strings' \
         unless paths.all? { |path| path.is_a?(String) }
 
-      Wgit::Utils.
+      Wgit::Utils.sanitize(paths, encode: false)
       raise 'The provided paths cannot be empty' if paths.empty?
 
-      paths
+      paths.map do |path|
+        path = Wgit::Url.parse(path)
+        path.index? ? path : path.omit_slashes
+      end
     end
 
     # Filters links by selecting/rejecting them based on their path.
@@ -380,7 +491,7 @@ module Wgit
     def filter_links(links, filter_method, paths)
       links.send(filter_method) do |link|
         # Turn http://example.com into / meaning index.
-        link = link.to_endpoint
+        link = link.to_endpoint.index? ? '/' : link.omit_base
 
         match = false
         paths.each do |pattern|
@@ -392,6 +503,35 @@ module Wgit
       end
     end
 
+    # Returns whether or not to follow redirects, and within what context e.g.
+    # :host, :domain etc.
+    def redirect?(follow_redirects)
+      return [true, follow_redirects] if follow_redirects.is_a?(Symbol)
+
+      unless [true, false].include?(follow_redirects)
+        raise "follow_redirects: must be a Boolean or Symbol, not: \
+#{follow_redirects}"
+      end
+
+      [follow_redirects, nil]
+    end
+
+    # Log (at debug level) the network request/response details.
+    def log_net(client, response, duration)
+      resp_template = "[#{client}] Response: %s (%s bytes in %s seconds)"
+      log_status = (response.status || 0)
+      log_total_time = (duration || 0.0).truncate(3)
+
+      # The browsers request URL is the same so ignore it.
+      if client.to_sym == :http
+        Wgit.logger.debug("[#{client}] Request: #{response.url}")
+      end
+
+      Wgit.logger.debug(
+        format(resp_template, log_status, response.size, log_total_time)
+      )
+    end
+
     alias crawl crawl_urls
     alias crawl_pages crawl_urls
     alias crawl_page crawl_url