RubyGems - wgit - Versions diffs - 0.4.1 → 0.5.0 - Mend

wgit 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

checksums.yaml +4 -4
data/lib/wgit/crawler.rb +155 -66
data/lib/wgit/database/database.rb +9 -8
data/lib/wgit/database/model.rb +2 -2
data/lib/wgit/document.rb +55 -62
data/lib/wgit/document_extensions.rb +2 -2
data/lib/wgit/indexer.rb +27 -15
data/lib/wgit/response.rb +144 -0
data/lib/wgit/url.rb +149 -85
data/lib/wgit/utils.rb +6 -3
data/lib/wgit/version.rb +7 -2
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 70e6ec83e53550bcfe180b66248747763314c33738ecd0fabddca65dbb3918b0
-  data.tar.gz: a1c3d1e8bb6d078731876093cb2beed0ea4da65cb03dff1ead975f714bd3d9b5
+  metadata.gz: 3e5c6b85b0ac78d234674d6003f8624b266c09668b4cfd78945106a917f78078
+  data.tar.gz: 3fc90cf5c132804f12e54f2b5f446143591923fff0677accc2ab907295ba34c4
 SHA512:
-  metadata.gz: ab519107506ec0798995cb52f986040da12d1a5c59c0c36f84bf8e09d847fd5ab83b3bd7f47ff95b6e474a35d855b176fdc9d245b1cef356781eadb21a4a84f7
-  data.tar.gz: '010748005ded444f44812c8b6022d258b60a3485dcef8b78c562012428e3955a7fbfe80f53a570cb9f6b524042388949cf2cf08d6a1b27581f2cfd9b424603b0'
+  metadata.gz: f39df81391a07b344678a2b8d443b945391728d215e142ed73a55ef80cfc9c9a8407db9e4faa60c3e43e5b8e65bf8e84c3a343ff962b3c0276eed920639f3870
+  data.tar.gz: 1690895b56def00cbed58e485b23f5158ada0adb89f1c0e87bff3c638332648761dbac81b8f08e6c9c6ee911f4cbf9df72f3bfbce5d8abc2207d434edfde61ee

data/lib/wgit/crawler.rb CHANGED Viewed

@@ -4,11 +4,13 @@ require_relative 'url'
 require_relative 'document'
 require_relative 'utils'
 require_relative 'assertable'
+require_relative 'response'
 require 'typhoeus'
 module Wgit
   # The Crawler class provides a means of crawling web based HTTP Wgit::Url's,
-  # serialising their HTML into Wgit::Document instances.
+  # serialising their HTML into Wgit::Document instances. This is the only Wgit
+  # class which contains network logic e.g. request/response handling.
   class Crawler
     include Assertable
@@ -20,8 +22,11 @@ module Wgit
     # before raising an error. Set to 0 to disable time outs completely.
     attr_accessor :time_out
-    # The Typhoeus::Response of the most recently crawled URL or nil.
-    # See https://rubydoc.info/gems/typhoeus/Typhoeus/Response for more info.
+    # Whether or not to UTF-8 encode the HTML once crawled. Set to false if
+    # crawling more than just HTML e.g. images etc.
+    attr_accessor :encode_html
+    # The Wgit::Response of the most recently crawled URL.
     attr_reader :last_response
     # Initializes and returns a Wgit::Crawler instance.
@@ -31,13 +36,18 @@ module Wgit
     # @param time_out [Integer, Float] The maximum amount of time (in seconds)
     #   a crawl request has to complete before raising an error. Set to 0 to
     #   disable time outs completely.
-    def initialize(redirect_limit: 5, time_out: 5)
+    # @param encode_html [Boolean] Whether or not to UTF-8 encode the HTML once
+    #   crawled. Set to false if crawling more than just HTML e.g. images etc.
+    def initialize(redirect_limit: 5, time_out: 5, encode_html: true)
       @redirect_limit = redirect_limit
       @time_out       = time_out
+      @encode_html    = encode_html
     end
     # Crawls an entire website's HTML pages by recursively going through
-    # its internal links. Each crawled Document is yielded to a block.
+    # its internal <a> links. Each crawled Document is yielded to a block. Use
+    # the allow and disallow paths params to partially and selectively crawl a
+    # site.
     #
     # Only redirects to the same host are followed. For example, the Url
     # 'http://www.example.co.uk/how' has a host of 'www.example.co.uk' meaning
@@ -50,20 +60,26 @@ module Wgit
     # @param url [Wgit::Url] The base URL of the website to be crawled.
     #   It is recommended that this URL be the index page of the site to give a
     #   greater chance of finding all pages within that site/host.
+    # @param allow_paths [String, Array<String>] Filters links by selecting
+    #   them only if their path includes one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters links by rejecting
+    #   them if their path includes one of disallow_paths.
     # @yield [doc] Given each crawled page (Wgit::Document) of the site.
     #   A block is the only way to interact with each crawled Document.
     # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
     #   from all of the site's pages or nil if the url could not be
     #   crawled successfully.
-    def crawl_site(url, &block)
+    def crawl_site(url, allow_paths: nil, disallow_paths: nil, &block)
       doc = crawl_url(url, &block)
       return nil if doc.nil?
-      opts      = { follow_external_redirects: false, host: url.to_base }
+      crawl_opts = { follow_external_redirects: false, host: url.to_base }
+      link_opts  = { allow_paths: allow_paths, disallow_paths: disallow_paths }
       alt_url   = url.end_with?('/') ? url.chop : url + '/'
       crawled   = [url, alt_url]
       externals = doc.external_links
-      internals = get_internal_links(doc)
+      internals = get_internal_links(doc, link_opts)
       return doc.external_links.uniq if internals.empty?
@@ -76,12 +92,12 @@ module Wgit
         links.each do |link|
           orig_link = link.dup
-          doc = crawl_url(link, opts, &block)
+          doc = crawl_url(link, crawl_opts, &block)
           crawled.push(orig_link, link) # Push both in case of redirects.
           next if doc.nil?
-          internals.concat(get_internal_links(doc))
+          internals.concat(get_internal_links(doc, link_opts))
           externals.concat(doc.external_links)
         end
       end
@@ -141,7 +157,7 @@ module Wgit
         host: host
       )
-      doc = Wgit::Document.new(url, html)
+      doc = Wgit::Document.new(url, html, encode_html: @encode_html)
       yield(doc) if block_given?
       doc.empty? ? nil : doc
@@ -149,7 +165,7 @@ module Wgit
     protected
-    # Fetches the url HTML String or nil. Handles any errors that arise
+    # Returns the url HTML String or nil. Handles any errors that arise
     # and sets the @last_response. Errors or any HTTP response that doesn't
     # return a HTML body will be ignored, returning nil.
     #
@@ -166,31 +182,33 @@ module Wgit
     # @return [String, nil] The crawled HTML or nil if the crawl was
     #   unsuccessful.
     def fetch(url, follow_external_redirects: true, host: nil)
-      response       = nil
-      crawl_duration = nil
+      response = Wgit::Response.new
-      response = resolve(
+      resolve(
         url,
+        response,
         follow_external_redirects: follow_external_redirects,
         host: host
       )
-      crawl_duration = response.total_time
-      response.body.empty? ? nil : response.body
+      response.body_or_nil
     rescue StandardError => e
-      Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e.message}")
+      Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e}")
       nil
     ensure
-      url.crawled        = true # Also sets date_crawled underneath.
-      url.crawl_duration = crawl_duration
-      @last_response     = response
+      url.crawled        = true # Sets date_crawled underneath.
+      url.crawl_duration = response.total_time
+      @last_response = response
     end
-    # Resolves the url by handling any redirects. The response object will be
-    # returned or an error raised.
+    # GETs the given url, resolving any redirects. The given response object
+    # will be enriched.
     #
-    # @param url [Wgit::Url] The URL to resolve.
+    # @param url [Wgit::Url] The URL to GET and resolve.
+    # @param response [Wgit::Response] The response to enrich. Modifies by
+    #   reference.
     # @param follow_external_redirects [Boolean] Whether or not to follow
     #   an external redirect. If false, you must also provide a `host:`
     #   parameter.
@@ -200,91 +218,162 @@ module Wgit
     #   'http://www.example.com' will only allow redirects for Urls with a
     #   `to_host` value of 'www.example.com'.
     # @raise [StandardError] If a redirect isn't allowed etc.
-    # @return [Typhoeus::Response] The HTTP response of the GET request.
-    def resolve(url, follow_external_redirects: true, host: nil)
-      response       = nil
-      redirect_count = 0
-      total_net_time = 0.0
+    def resolve(url, response, follow_external_redirects: true, host: nil)
       loop do
-        response = get_response(url)
-        total_net_time += response.total_time if response.total_time
-        # Break unless it's a redirect.
-        break unless (response.code >= 300) && (response.code < 400)
+        get_response(url, response)
+        break unless response.redirect?
         # Handle response 'Location' header.
-        location = Wgit::Utils.fetch(response.headers, :location, '')
-        location = Wgit::Url.new(location)
+        location = Wgit::Url.new(response.headers.fetch(:location, ''))
         raise 'Encountered redirect without Location header' if location.empty?
         yield(url, response, location) if block_given?
-        # Handle redirect logic.
+        # Validate redirect.
         if !follow_external_redirects && !location.relative?(host: host)
           raise "External redirect not allowed - Redirected to: \
 '#{location}', which is outside of host: '#{host}'"
         end
-        raise "Too many redirects, exceeded: #{redirect_count}" \
-        if redirect_count >= @redirect_limit
-        redirect_count += 1
+        raise "Too many redirects, exceeded: #{@redirect_limit}" \
+        if response.redirect_count >= @redirect_limit
         # Process the location to be crawled next.
         location = url.to_base.concat(location) if location.relative?
+        response.redirections[url.to_s] = location.to_s
         url.replace(location) # Update the url on redirect.
       end
-      response.options[:redirect_count] = redirect_count
-      response.options[:total_time]     = total_net_time
-      response
     end
-    # Performs a HTTP GET request and returns the response.
+    # Makes a HTTP request and enriches the given Wgit::Response from it.
     #
     # @param url [String] The url to GET. Will call url#normalize if possible.
+    # @param response [Wgit::Response] The response to enrich. Modifies by
+    #   reference.
     # @raise [StandardError] If a response can't be obtained.
-    # @return [Typhoeus::Response] The HTTP response of the GET request.
-    def get_response(url)
-      url = url.normalize if url.respond_to?(:normalize)
+    # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
+    def get_response(url, response)
+      # Perform a HTTP GET request.
+      orig_url = url.to_s
+      url      = url.normalize if url.respond_to?(:normalize)
+      http_response = http_get(url)
+      # Enrich the given Wgit::Response object.
+      response.adapter_response = http_response
+      response.url              = orig_url
+      response.status           = http_response.code
+      response.headers          = http_response.headers
+      response.body             = http_response.body
+      response.ip_address       = http_response.primary_ip
+      response.add_total_time(http_response.total_time)
+      # Log (debug) the request/response details.
+      resp_template  = '[http] Response: %s (%s bytes in %s seconds)'
+      log_status     = (response.status || 0)
+      log_total_time = response.total_time.truncate(3)
+      Wgit.logger.debug("[http] Request:  #{response.url}")
+      Wgit.logger.debug(
+        format(resp_template, log_status, response.size, log_total_time)
+      )
+      # Handle a failed response.
+      raise "No response (within timeout: #{@time_out} second(s))" \
+      if response.failure?
+    end
+    # Performs a HTTP GET request and returns the response.
+    #
+    # @param url [String] The url to GET.
+    # @return [Typhoeus::Response] The HTTP response object.
+    def http_get(url)
       opts = {
         followlocation: false,
         timeout: @time_out,
         accept_encoding: 'gzip',
         headers: {
           'User-Agent' => "wgit/#{Wgit::VERSION}",
-          'Accept' => 'text/html'
+          'Accept'     => 'text/html'
         }
       }
-      response = Typhoeus.get(url, opts)
-      # Handle response status code.
-      raise "No response (within timeout: #{@time_out} second(s))" \
-      if response.code.zero?
-      response
+      # See https://rubydoc.info/gems/typhoeus for more info.
+      Typhoeus.get(url, opts)
     end
     # Returns a doc's internal HTML page links in absolute form; used when
-    # crawling a site. Override this method in a subclass to change how a site
+    # crawling a site. Use the allow and disallow paths params to partially
+    # and selectively crawl a site.
+    #
+    # Override this method in a subclass to change how a site
     # is crawled; not what is extracted from each page (Document extensions
-    # should be used for this purpose instead).
+    # should be used for this purpose instead). Just remember that only HTML
+    # files containing <a> links can keep the crawl going beyond the base URL.
     #
     # @param doc [Wgit::Document] The document from which to extract its
     #   internal page links.
+    # @param allow_paths [String, Array<String>] Filters links by selecting
+    #   them only if their path includes one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters links by rejecting
+    #   them if their path includes one of disallow_paths.
     # @return [Array<Wgit::Url>] The internal page links from doc.
-    def get_internal_links(doc)
-      doc.internal_absolute_links
-         .map(&:without_anchor) # Because anchors don't change page content.
-         .uniq
-         .reject do |link|
+    def get_internal_links(doc, allow_paths: nil, disallow_paths: nil)
+      links = doc
+              .internal_absolute_links
+              .map(&:omit_fragment) # Because fragments don't alter content.
+              .uniq
+              .reject do |link|
         ext = link.to_extension
         ext ? !%w[htm html].include?(ext.downcase) : false
       end
+      return links if allow_paths.nil? && disallow_paths.nil?
+      process_paths(links, allow_paths, disallow_paths)
+    end
+    private
+    # Validate and filter by the given URL paths.
+    def process_paths(links, allow_paths, disallow_paths)
+      raise "You can't provide both allow_paths: and disallow_paths: params" \
+      if allow_paths && disallow_paths
+      if allow_paths  # White list.
+        filter_method = :select
+        paths         = allow_paths
+      else            # Black list.
+        filter_method = :reject
+        paths         = disallow_paths
+      end
+      paths = [paths] unless paths.is_a?(Array)
+      paths = paths
+              .compact
+              .reject(&:empty?)
+              .uniq
+              .map { |path| Wgit::Url.new(path).to_path }
+      raise 'The provided paths cannot be empty' if paths.empty?
+      filter_links_by_path(links, filter_method, paths)
+    end
+    # Filters links by selecting or rejecting them based on their path.
+    def filter_links_by_path(links, filter_method, paths)
+      links.send(filter_method) do |link|
+        link_path = link.to_path
+        next(false) unless link_path
+        match = false
+        paths.each do |path|
+          match = link_path.start_with?(path)
+          break if match
+        end
+        match
+      end
     end
     alias crawl       crawl_urls

data/lib/wgit/database/database.rb CHANGED Viewed

@@ -220,19 +220,20 @@ module Wgit
     # @param url [Wgit::Url] The Url to search the DB for.
     # @return [Boolean] True if url exists, otherwise false.
     def url?(url)
-      h = { 'url' => url }
-      @client[:urls].find(h).any?
+      assert_type(url, String) # This includes Wgit::Url's.
+      hash = { 'url' => url }
+      @client[:urls].find(hash).any?
     end
-    # Returns whether or not a record with the given doc 'url' field (which is
-    # unique) exists in the database's 'documents' collection.
+    # Returns whether or not a record with the given doc 'url.url' field
+    # (which is unique) exists in the database's 'documents' collection.
     #
     # @param doc [Wgit::Document] The Document to search the DB for.
     # @return [Boolean] True if doc exists, otherwise false.
     def doc?(doc)
-      url = doc.respond_to?(:url) ? doc.url : doc
-      h = { 'url' => url }
-      @client[:documents].find(h).any?
+      assert_type(doc, Wgit::Document)
+      hash = { 'url.url' => doc.url }
+      @client[:documents].find(hash).any?
     end
     ### Update Data ###
@@ -309,7 +310,7 @@ module Wgit
     # @return [Integer] The number of updated records.
     def update_doc(doc)
       assert_type(doc, Wgit::Document)
-      selection = { url: doc.url }
+      selection = { 'url.url' => doc.url }
       doc_hash = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)
       update = { '$set' => doc_hash }
       mutate(true, :documents, selection, update)

data/lib/wgit/database/model.rb CHANGED Viewed

@@ -26,7 +26,7 @@ module Wgit
       raise 'doc must respond_to? :to_h' unless doc.respond_to?(:to_h)
       model = doc.to_h(include_html: false, include_score: false)
-      model['url'] = self.url(doc.url) # Expand Url String into full object.
+      model['url'] = url(doc.url) # Expand Url String into full object.
       Wgit::Utils.remove_non_bson_types(model)
     end
@@ -36,7 +36,7 @@ module Wgit
     # @return [Hash] Insertion fields common to all models.
     def self.common_insert_data
       {
-        date_added:    Wgit::Utils.time_stamp,
+        date_added: Wgit::Utils.time_stamp,
         date_modified: Wgit::Utils.time_stamp
       }
     end

data/lib/wgit/document.rb CHANGED Viewed

@@ -5,7 +5,8 @@ require 'nokogiri'
 require 'json'
 module Wgit
-  # Class modeling a HTML web document. Also doubles as a search result when
+  # Class primarily modeling a HTML web document, although other MIME types
+  # will work e.g. images etc. Also doubles as a search result when
   # loading Documents from the database via Wgit::Database#search.
   #
   # The initialize method dynamically initializes instance variables from the
@@ -60,11 +61,11 @@ module Wgit
     #   only used if url_or_obj is a String representing the web page's URL.
     #   Otherwise, the HTML comes from the database object. A html of nil will
     #   be defaulted to an empty String.
-    def initialize(url_or_obj, html = '')
+    def initialize(url_or_obj, html = '', encode_html: true)
       if url_or_obj.is_a?(String)
-        init_from_strings(url_or_obj, html)
+        init_from_strings(url_or_obj, html, encode_html: encode_html)
       else
-        init_from_object(url_or_obj)
+        init_from_object(url_or_obj, encode_html: encode_html)
       end
     end
@@ -91,25 +92,28 @@ module Wgit
     # instance variables upon Document initialization. See the default
     # extensions defined in 'document_extensions.rb' as examples.
     #
-    # Initialises a private instance variable with the xpath or database object
-    # result(s). When initialising from HTML, a true singleton value will only
-    # ever return one result otherwise all xpath results are returned in an
+    # Note that defined extensions work for both Documents initialized from
+    # HTML (via Wgit::Crawler methods) and from database objects.
+    # An extension once defined, initializes a private instance variable with
+    # the xpath or database object result(s).
+    #
+    # When initialising from HTML, a singleton value of true will only
+    # ever return one result; otherwise all xpath results are returned in an
     # Array. When initialising from a database object, the value is taken as
     # is and singleton is only used to define the default empty value.
     # If a value cannot be found (in either the HTML or database object), then
-    # a default will be used. The default value is: singleton ? nil : [].
-    #
-    # Note that defined extensions work for both documents initialized from
-    # the WWW (via Wgit::Crawler methods) and from database objects. This
-    # effectively implements ORM like behavior using this class.
+    # a default will be used. The default value is: `singleton ? nil : []`.
     #
     # @param var [Symbol] The name of the variable to be initialised.
     # @param xpath [String, Object#call] The xpath used to find the element(s)
-    #   of the webpage. Pass a callable object (proc etc.) if you want the
+    #   of the webpage. Only used when initializing from HTML.
+    #
+    #   Pass a callable object (proc etc.) if you want the
     #   xpath value to be derived on Document initialisation (instead of when
     #   the extension is defined). The call method must return a valid xpath
     #   String.
-    # @param options [Hash] The options to define an extension with.
+    # @param options [Hash] The options to define an extension with. The
+    #   options are only used when initializing from HTML, not the database.
     # @option options [Boolean] :singleton The singleton option determines
     #   whether or not the result(s) should be in an Array. If multiple
     #   results are found and singleton is true then the first result will be
@@ -117,16 +121,17 @@ module Wgit
     # @option options [Boolean] :text_content_only The text_content_only option
     #   if true will use the text content of the Nokogiri result object,
     #   otherwise the Nokogiri object itself is returned. Defaults to true.
-    # @yield [value, source] Yields the value (Object) about to be assigned to
-    #   the new var and the source (Symbol) of the value (either :html or
-    #   :object). The return value of the block becomes the new var value,
-    #   unless nil. Return nil if you want to inspect but not change the var
-    #   value. The block gets executed when a Document is initialized from html
-    #   or an object e.g. database.
+    # @yield [value, source, type] Yields the value (Object) about to be
+    #   assigned to the new var, the source of the value (Wgit::Document or DB
+    #   Object) and the source type (Symbol of either :document or :object).
+    #
+    #   The return value of the block becomes the new var value, unless nil.
+    #   Return nil if you want to inspect but not change the var value. The
+    #   block is executed when a Wgit::Document is initialized.
     # @raise [StandardError] If the var param isn't valid.
-    # @return [Symbol] The first half of the newly defined method names e.g.
-    #   if var == "title" then :init_title is returned.
+    # @return [Symbol] The given var Symbol.
     def self.define_extension(var, xpath, options = {}, &block)
+      var = var.to_sym
       default_options = { singleton: true, text_content_only: true }
       options = default_options.merge(options)
@@ -149,7 +154,7 @@ module Wgit
       end
       Document.send :private, func_name
-      "init_#{var}".to_sym
+      var
     end
     # Removes the init_* methods created when an extension is defined.
@@ -189,55 +194,48 @@ module Wgit
       @html[range]
     end
-    # Returns the timestamp of when this Document was crawled.
-    #
-    # @return [Time] Time of when this Document was crawled.
-    def date_crawled
-      @url.date_crawled
-    end
-    # Returns the duration of the crawl for this Document (in seconds).
-    #
-    # @return [Float] The duration of the crawl for this Document.
-    def crawl_duration
-      @url.crawl_duration
-    end
     # Returns the base URL of this Wgit::Document. The base URL is either the
     # <base> element's href value or @url (if @base is nil). If @base is
     # present and relative, then @url.to_base + @base is returned. This method
     # should be used instead of `doc.url.to_base` etc. when manually building
-    # absolute links from relative links.
+    # absolute links from relative links; or use `link.prefix_base(doc)`.
     #
     # Provide the `link:` parameter to get the correct base URL for that type
     # of link. For example, a link of `#top` would always return @url because
     # it applies to that page, not a different one. Query strings work in the
     # same way. Use this parameter if manually concatting Url's e.g.
     #
-    #   relative_link = Wgit::Url.new '?q=hello'
+    #   relative_link = Wgit::Url.new('?q=hello')
     #   absolute_link = doc.base_url(link: relative_link).concat(relative_link)
     #
     # This is similar to how Wgit::Document#internal_absolute_links works.
     #
     # @param link [Wgit::Url, String] The link to obtain the correct base URL
-    #   for.
+    #   for; must be relative, not absolute.
+    # @raise [StandardError] If link is absolute or if a base URL can't be
+    #   established e.g. the doc @url is relative and <base> is nil.
     # @return [Wgit::Url] The base URL of this Document e.g.
     #   'http://example.com/public'.
     def base_url(link: nil)
+      raise "Document @url ('#{@url}') cannot be relative if <base> is nil" \
+      if @url.relative? && @base.nil?
+      raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't be relative" \
+      if @url.relative? && @base&.relative?
       get_base = -> { @base.relative? ? @url.to_base.concat(@base) : @base }
       if link
         link = Wgit::Url.new(link)
         raise "link must be relative: #{link}" unless link.relative?
-        if link.is_anchor? || link.is_query?
+        if link.is_fragment? || link.is_query?
           base_url = @base ? get_base.call : @url
-          return base_url.without_anchor.without_query
+          return base_url.omit_fragment.omit_query
         end
       end
-      base_url = @base ? get_base.call : @url.base
-      base_url.without_anchor.without_query
+      base_url = @base ? get_base.call : @url.to_base
+      base_url.omit_fragment.omit_query
     end
     # Returns a Hash containing this Document's instance vars.
@@ -340,7 +338,7 @@ module Wgit
       links = @links
               .select { |link| link.relative?(host: @url.to_base) }
-              .map(&:without_base)
+              .map(&:omit_base)
               .map do |link| # Map @url.to_host into / as it's a duplicate.
         link.to_host == @url.to_host ? Wgit::Url.new('/') : link
       end
@@ -354,7 +352,7 @@ module Wgit
     #
     # @return [Array<Wgit::Url>] Self's internal Url's in absolute form.
     def internal_absolute_links
-      internal_links.map { |link| base_url(link: link).concat(link) }
+      internal_links.map { |link| link.prefix_base(self) }
     end
     # Returns all external links from this Document in absolute form. External
@@ -366,7 +364,7 @@ module Wgit
       links = @links
               .reject { |link| link.relative?(host: @url.to_base) }
-              .map(&:without_trailing_slash)
+              .map(&:omit_trailing_slash)
       Wgit::Utils.process_arr(links)
     end
@@ -438,7 +436,7 @@ module Wgit
       orig_text = @text
       @text = search(
         query, case_sensitive: case_sensitive,
-        whole_sentence: whole_sentence, sentence_limit: sentence_limit
+               whole_sentence: whole_sentence, sentence_limit: sentence_limit
       )
       orig_text
@@ -473,7 +471,7 @@ module Wgit
     # @yield [value, source] Given the value (String/Object) before it's set as
     #   an instance variable so that you can inspect/alter the value if
     #   desired. Return nil from the block if you don't want to override the
-    #   value. Also given the source (Symbol) which is always :html.
+    #   value. Also given the source (Symbol) which is always :document.
     # @return [String, Object] The value found in the html or the default value
     #   (singleton ? nil : []).
     def find_in_html(xpath, singleton: true, text_content_only: true)
@@ -492,7 +490,7 @@ module Wgit
       singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
       if block_given?
-        new_result = yield(result, :html)
+        new_result = yield(result, self, :document)
         result = new_result unless new_result.nil?
       end
@@ -519,7 +517,7 @@ module Wgit
       singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
       if block_given?
-        new_result = yield(result, :object)
+        new_result = yield(result, obj, :object)
         result = new_result unless new_result.nil?
       end
@@ -529,19 +527,19 @@ module Wgit
     private
     # Initialise the Document from URL and HTML Strings.
-    def init_from_strings(url, html)
+    def init_from_strings(url, html, encode_html: true)
       assert_types(html, [String, NilClass])
       # We already know url.is_a?(String) so parse into Url unless already so.
       url = Wgit::Url.parse(url)
-      url.crawled = true unless url.crawled # Avoid overriding date_crawled.
+      url.crawled = true unless url.crawled? # Avoid overriding date_crawled.
       @url   = url
       @html  = html || ''
       @doc   = init_nokogiri
       @score = 0.0
-      process_url_and_html
+      Wgit::Utils.process_str(@html, encode: encode_html)
       # Dynamically run the init_*_from_html methods.
       Document.private_instance_methods(false).each do |method|
@@ -554,7 +552,7 @@ module Wgit
     # Initialise the Document from a Hash like Object containing Strings as
     # keys e.g. database collection object or Hash.
-    def init_from_object(obj)
+    def init_from_object(obj, encode_html: true)
       assert_respond_to(obj, :fetch)
       @url   = Wgit::Url.new(obj.fetch('url')) # Should always be present.
@@ -562,7 +560,7 @@ module Wgit
       @doc   = init_nokogiri
       @score = obj.fetch('score', 0.0)
-      process_url_and_html
+      Wgit::Utils.process_str(@html, encode: encode_html)
       # Dynamically run the init_*_from_object methods.
       Document.private_instance_methods(false).each do |method|
@@ -573,12 +571,6 @@ module Wgit
       end
     end
-    # Ensure the @url and @html Strings are correctly encoded etc.
-    def process_url_and_html
-      @url  = Wgit::Utils.process_str(@url)
-      @html = Wgit::Utils.process_str(@html)
-    end
     # Initialises an instance variable and defines a getter method for it.
     #
     # @param var [Symbol] The name of the variable to be initialized.
@@ -597,6 +589,7 @@ module Wgit
       end
     end
+    alias content                html
     alias statistics             stats
     alias internal_urls          internal_links
     alias internal_absolute_urls internal_absolute_links