RubyGems - wgit - Versions diffs - 0.4.1 → 0.5.0 - Mend

wgit 0.4.1 → 0.5.0

Files changed (12) hide show

checksums.yaml +4 -4
data/lib/wgit/crawler.rb +155 -66
data/lib/wgit/database/database.rb +9 -8
data/lib/wgit/database/model.rb +2 -2
data/lib/wgit/document.rb +55 -62
data/lib/wgit/document_extensions.rb +2 -2
data/lib/wgit/indexer.rb +27 -15
data/lib/wgit/response.rb +144 -0
data/lib/wgit/url.rb +149 -85
data/lib/wgit/utils.rb +6 -3
data/lib/wgit/version.rb +7 -2
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 70e6ec83e53550bcfe180b66248747763314c33738ecd0fabddca65dbb3918b0
-  data.tar.gz: a1c3d1e8bb6d078731876093cb2beed0ea4da65cb03dff1ead975f714bd3d9b5
+  metadata.gz: 3e5c6b85b0ac78d234674d6003f8624b266c09668b4cfd78945106a917f78078
+  data.tar.gz: 3fc90cf5c132804f12e54f2b5f446143591923fff0677accc2ab907295ba34c4
 SHA512:
-  metadata.gz: ab519107506ec0798995cb52f986040da12d1a5c59c0c36f84bf8e09d847fd5ab83b3bd7f47ff95b6e474a35d855b176fdc9d245b1cef356781eadb21a4a84f7
-  data.tar.gz: '010748005ded444f44812c8b6022d258b60a3485dcef8b78c562012428e3955a7fbfe80f53a570cb9f6b524042388949cf2cf08d6a1b27581f2cfd9b424603b0'
+  metadata.gz: f39df81391a07b344678a2b8d443b945391728d215e142ed73a55ef80cfc9c9a8407db9e4faa60c3e43e5b8e65bf8e84c3a343ff962b3c0276eed920639f3870
+  data.tar.gz: 1690895b56def00cbed58e485b23f5158ada0adb89f1c0e87bff3c638332648761dbac81b8f08e6c9c6ee911f4cbf9df72f3bfbce5d8abc2207d434edfde61ee

data/lib/wgit/crawler.rb CHANGED Viewed

@@ -4,11 +4,13 @@ require_relative 'url'
 require_relative 'document'
 require_relative 'utils'
 require_relative 'assertable'
+require_relative 'response'
 require 'typhoeus'
 module Wgit
   # The Crawler class provides a means of crawling web based HTTP Wgit::Url's,
-  # serialising their HTML into Wgit::Document instances.
+  # serialising their HTML into Wgit::Document instances. This is the only Wgit
+  # class which contains network logic e.g. request/response handling.
   class Crawler
     include Assertable
@@ -20,8 +22,11 @@ module Wgit
     # before raising an error. Set to 0 to disable time outs completely.
     attr_accessor :time_out
-    # The Typhoeus::Response of the most recently crawled URL or nil.
-    # See https://rubydoc.info/gems/typhoeus/Typhoeus/Response for more info.
+    # Whether or not to UTF-8 encode the HTML once crawled. Set to false if
+    # crawling more than just HTML e.g. images etc.
+    attr_accessor :encode_html
+    # The Wgit::Response of the most recently crawled URL.
     attr_reader :last_response
     # Initializes and returns a Wgit::Crawler instance.
@@ -31,13 +36,18 @@ module Wgit
     # @param time_out [Integer, Float] The maximum amount of time (in seconds)
     #   a crawl request has to complete before raising an error. Set to 0 to
     #   disable time outs completely.
-    def initialize(redirect_limit: 5, time_out: 5)
+    # @param encode_html [Boolean] Whether or not to UTF-8 encode the HTML once
+    #   crawled. Set to false if crawling more than just HTML e.g. images etc.
+    def initialize(redirect_limit: 5, time_out: 5, encode_html: true)
       @redirect_limit = redirect_limit
       @time_out       = time_out
+      @encode_html    = encode_html
     end
     # Crawls an entire website's HTML pages by recursively going through
-    # its internal links. Each crawled Document is yielded to a block.
+    # its internal <a> links. Each crawled Document is yielded to a block. Use
+    # the allow and disallow paths params to partially and selectively crawl a
+    # site.
     #
     # Only redirects to the same host are followed. For example, the Url
     # 'http://www.example.co.uk/how' has a host of 'www.example.co.uk' meaning
@@ -50,20 +60,26 @@ module Wgit
     # @param url [Wgit::Url] The base URL of the website to be crawled.
     #   It is recommended that this URL be the index page of the site to give a
     #   greater chance of finding all pages within that site/host.
+    # @param allow_paths [String, Array<String>] Filters links by selecting
+    #   them only if their path starts with one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters links by rejecting
+    #   them if their path starts with one of disallow_paths.
     # @yield [doc] Given each crawled page (Wgit::Document) of the site.
     #   A block is the only way to interact with each crawled Document.
     # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
     #   from all of the site's pages or nil if the url could not be
     #   crawled successfully.
-    def crawl_site(url, &block)
+    def crawl_site(url, allow_paths: nil, disallow_paths: nil, &block)
       doc = crawl_url(url, &block)
       return nil if doc.nil?
-      opts      = { follow_external_redirects: false, host: url.to_base }
+      crawl_opts = { follow_external_redirects: false, host: url.to_base }
+      link_opts  = { allow_paths: allow_paths, disallow_paths: disallow_paths }
       alt_url   = url.end_with?('/') ? url.chop : url + '/'
       crawled   = [url, alt_url]
       externals = doc.external_links
-      internals = get_internal_links(doc)
+      internals = get_internal_links(doc, link_opts)
       return doc.external_links.uniq if internals.empty?
@@ -76,12 +92,12 @@ module Wgit
         links.each do |link|
           orig_link = link.dup
-          doc = crawl_url(link, opts, &block)
+          doc = crawl_url(link, crawl_opts, &block)
           crawled.push(orig_link, link) # Push both in case of redirects.
           next if doc.nil?
-          internals.concat(get_internal_links(doc))
+          internals.concat(get_internal_links(doc, link_opts))
           externals.concat(doc.external_links)
         end
       end
@@ -141,7 +157,7 @@ module Wgit
         host: host
       )
-      doc = Wgit::Document.new(url, html)
+      doc = Wgit::Document.new(url, html, encode_html: @encode_html)
       yield(doc) if block_given?
       doc.empty? ? nil : doc
@@ -149,7 +165,7 @@ module Wgit
     protected
-    # Fetches the url HTML String or nil. Handles any errors that arise
+    # Returns the url HTML String or nil. Handles any errors that arise
     # and sets the @last_response. Errors or any HTTP response that doesn't
     # return a HTML body will be ignored, returning nil.
     #
@@ -166,31 +182,33 @@ module Wgit
     # @return [String, nil] The crawled HTML or nil if the crawl was
     #   unsuccessful.
     def fetch(url, follow_external_redirects: true, host: nil)
-      response       = nil
-      crawl_duration = nil
+      response = Wgit::Response.new
-      response = resolve(
+      resolve(
         url,
+        response,
         follow_external_redirects: follow_external_redirects,
         host: host
       )
-      crawl_duration = response.total_time
-      response.body.empty? ? nil : response.body
+      response.body_or_nil
     rescue StandardError => e
-      Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e.message}")
+      Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e}")
       nil
     ensure
-      url.crawled        = true # Also sets date_crawled underneath.
-      url.crawl_duration = crawl_duration
-      @last_response     = response
+      url.crawled        = true # Sets date_crawled underneath.
+      url.crawl_duration = response.total_time
+      @last_response = response
     end
-    # Resolves the url by handling any redirects. The response object will be
-    # returned or an error raised.
+    # GETs the given url, resolving any redirects. The given response object
+    # will be enriched.
     #
-    # @param url [Wgit::Url] The URL to resolve.
+    # @param url [Wgit::Url] The URL to GET and resolve.
+    # @param response [Wgit::Response] The response to enrich. Modifies by
+    #   reference.
     # @param follow_external_redirects [Boolean] Whether or not to follow
     #   an external redirect. If false, you must also provide a `host:`
     #   parameter.
@@ -200,91 +218,162 @@ module Wgit
     #   'http://www.example.com' will only allow redirects for Urls with a
     #   `to_host` value of 'www.example.com'.
     # @raise [StandardError] If a redirect isn't allowed etc.
-    # @return [Typhoeus::Response] The HTTP response of the GET request.
-    def resolve(url, follow_external_redirects: true, host: nil)
-      response       = nil
-      redirect_count = 0
-      total_net_time = 0.0
+    def resolve(url, response, follow_external_redirects: true, host: nil)
       loop do
-        response = get_response(url)
-        total_net_time += response.total_time if response.total_time
-        # Break unless it's a redirect.
-        break unless (response.code >= 300) && (response.code < 400)
+        get_response(url, response)
+        break unless response.redirect?
         # Handle response 'Location' header.
-        location = Wgit::Utils.fetch(response.headers, :location, '')
-        location = Wgit::Url.new(location)
+        location = Wgit::Url.new(response.headers.fetch(:location, ''))
         raise 'Encountered redirect without Location header' if location.empty?
         yield(url, response, location) if block_given?
-        # Handle redirect logic.
+        # Validate redirect.
         if !follow_external_redirects && !location.relative?(host: host)
           raise "External redirect not allowed - Redirected to: \
 '#{location}', which is outside of host: '#{host}'"
         end
-        raise "Too many redirects, exceeded: #{redirect_count}" \
-        if redirect_count >= @redirect_limit
-        redirect_count += 1
+        raise "Too many redirects, exceeded: #{@redirect_limit}" \
+        if response.redirect_count >= @redirect_limit
         # Process the location to be crawled next.
         location = url.to_base.concat(location) if location.relative?
+        response.redirections[url.to_s] = location.to_s
         url.replace(location) # Update the url on redirect.
       end
-      response.options[:redirect_count] = redirect_count
-      response.options[:total_time]     = total_net_time
-      response
     end
-    # Performs a HTTP GET request and returns the response.
+    # Makes a HTTP request and enriches the given Wgit::Response from it.
     #
     # @param url [String] The url to GET. Will call url#normalize if possible.
+    # @param response [Wgit::Response] The response to enrich. Modifies by
+    #   reference.
     # @raise [StandardError] If a response can't be obtained.
-    # @return [Typhoeus::Response] The HTTP response of the GET request.
-    def get_response(url)
-      url = url.normalize if url.respond_to?(:normalize)
+    # @return [nil] Nothing of use; the given Wgit::Response is enriched in place.
+    def get_response(url, response)
+      # Perform a HTTP GET request.
+      orig_url = url.to_s
+      url      = url.normalize if url.respond_to?(:normalize)
+      http_response = http_get(url)
+      # Enrich the given Wgit::Response object.
+      response.adapter_response = http_response
+      response.url              = orig_url
+      response.status           = http_response.code
+      response.headers          = http_response.headers
+      response.body             = http_response.body
+      response.ip_address       = http_response.primary_ip
+      response.add_total_time(http_response.total_time)
+      # Log (debug) the request/response details.
+      resp_template  = '[http] Response: %s (%s bytes in %s seconds)'
+      log_status     = (response.status || 0)
+      log_total_time = response.total_time.truncate(3)
+      Wgit.logger.debug("[http] Request:  #{response.url}")
+      Wgit.logger.debug(
+        format(resp_template, log_status, response.size, log_total_time)
+      )
+      # Handle a failed response.
+      raise "No response (within timeout: #{@time_out} second(s))" \
+      if response.failure?
+    end
+    # Performs a HTTP GET request and returns the response.
+    #
+    # @param url [String] The url to GET.
+    # @return [Typhoeus::Response] The HTTP response object.
+    def http_get(url)
       opts = {
         followlocation: false,
         timeout: @time_out,
         accept_encoding: 'gzip',
         headers: {
           'User-Agent' => "wgit/#{Wgit::VERSION}",
-          'Accept' => 'text/html'
+          'Accept'     => 'text/html'
         }
       }
-      response = Typhoeus.get(url, opts)
-      # Handle response status code.
-      raise "No response (within timeout: #{@time_out} second(s))" \
-      if response.code.zero?
-      response
+      # See https://rubydoc.info/gems/typhoeus for more info.
+      Typhoeus.get(url, opts)
     end
     # Returns a doc's internal HTML page links in absolute form; used when
-    # crawling a site. Override this method in a subclass to change how a site
+    # crawling a site. Use the allow and disallow paths params to partially
+    # and selectively crawl a site.
+    #
+    # Override this method in a subclass to change how a site
     # is crawled; not what is extracted from each page (Document extensions
-    # should be used for this purpose instead).
+    # should be used for this purpose instead). Just remember that only HTML
+    # files containing <a> links can keep the crawl going beyond the base URL.
     #
     # @param doc [Wgit::Document] The document from which to extract its
     #   internal page links.
+    # @param allow_paths [String, Array<String>] Filters links by selecting
+    #   them only if their path starts with one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters links by rejecting
+    #   them if their path starts with one of disallow_paths.
     # @return [Array<Wgit::Url>] The internal page links from doc.
-    def get_internal_links(doc)
-      doc.internal_absolute_links
-         .map(&:without_anchor) # Because anchors don't change page content.
-         .uniq
-         .reject do |link|
+    def get_internal_links(doc, allow_paths: nil, disallow_paths: nil)
+      links = doc
+              .internal_absolute_links
+              .map(&:omit_fragment) # Because fragments don't alter content.
+              .uniq
+              .reject do |link|
         ext = link.to_extension
         ext ? !%w[htm html].include?(ext.downcase) : false
       end
+      return links if allow_paths.nil? && disallow_paths.nil?
+      process_paths(links, allow_paths, disallow_paths)
+    end
+    private
+    # Validate and filter by the given URL paths.
+    def process_paths(links, allow_paths, disallow_paths)
+      raise "You can't provide both allow_paths: and disallow_paths: params" \
+      if allow_paths && disallow_paths
+      if allow_paths  # White list.
+        filter_method = :select
+        paths         = allow_paths
+      else            # Black list.
+        filter_method = :reject
+        paths         = disallow_paths
+      end
+      paths = [paths] unless paths.is_a?(Array)
+      paths = paths
+              .compact
+              .reject(&:empty?)
+              .uniq
+              .map { |path| Wgit::Url.new(path).to_path }
+      raise 'The provided paths cannot be empty' if paths.empty?
+      filter_links_by_path(links, filter_method, paths)
+    end
+    # Filters links by selecting or rejecting them based on their path.
+    def filter_links_by_path(links, filter_method, paths)
+      links.send(filter_method) do |link|
+        link_path = link.to_path
+        next(false) unless link_path
+        match = false
+        paths.each do |path|
+          match = link_path.start_with?(path)
+          break if match
+        end
+        match
+      end
     end
     alias crawl       crawl_urls

data/lib/wgit/database/database.rb CHANGED Viewed

@@ -220,19 +220,20 @@ module Wgit
     # @param url [Wgit::Url] The Url to search the DB for.
     # @return [Boolean] True if url exists, otherwise false.
     def url?(url)
-      h = { 'url' => url }
-      @client[:urls].find(h).any?
+      assert_type(url, String) # This includes Wgit::Url's.
+      hash = { 'url' => url }
+      @client[:urls].find(hash).any?
     end
-    # Returns whether or not a record with the given doc 'url' field (which is
-    # unique) exists in the database's 'documents' collection.
+    # Returns whether or not a record with the given doc 'url.url' field
+    # (which is unique) exists in the database's 'documents' collection.
     #
     # @param doc [Wgit::Document] The Document to search the DB for.
     # @return [Boolean] True if doc exists, otherwise false.
     def doc?(doc)
-      url = doc.respond_to?(:url) ? doc.url : doc
-      h = { 'url' => url }
-      @client[:documents].find(h).any?
+      assert_type(doc, Wgit::Document)
+      hash = { 'url.url' => doc.url }
+      @client[:documents].find(hash).any?
     end
     ### Update Data ###
@@ -309,7 +310,7 @@ module Wgit
     # @return [Integer] The number of updated records.
     def update_doc(doc)
       assert_type(doc, Wgit::Document)
-      selection = { url: doc.url }
+      selection = { 'url.url' => doc.url }
       doc_hash = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)
       update = { '$set' => doc_hash }
       mutate(true, :documents, selection, update)

data/lib/wgit/database/model.rb CHANGED Viewed

@@ -26,7 +26,7 @@ module Wgit
       raise 'doc must respond_to? :to_h' unless doc.respond_to?(:to_h)
       model = doc.to_h(include_html: false, include_score: false)
-      model['url'] = self.url(doc.url) # Expand Url String into full object.
+      model['url'] = url(doc.url) # Expand Url String into full object.
       Wgit::Utils.remove_non_bson_types(model)
     end
@@ -36,7 +36,7 @@ module Wgit
     # @return [Hash] Insertion fields common to all models.
     def self.common_insert_data
       {
-        date_added:    Wgit::Utils.time_stamp,
+        date_added: Wgit::Utils.time_stamp,
         date_modified: Wgit::Utils.time_stamp
       }
     end

data/lib/wgit/document.rb CHANGED Viewed

@@ -5,7 +5,8 @@ require 'nokogiri'
 require 'json'
 module Wgit
-  # Class modeling a HTML web document. Also doubles as a search result when
+  # Class primarily modeling a HTML web document, although other MIME types
+  # will work e.g. images etc. Also doubles as a search result when
   # loading Documents from the database via Wgit::Database#search.
   #
   # The initialize method dynamically initializes instance variables from the
@@ -60,11 +61,11 @@ module Wgit
     #   only used if url_or_obj is a String representing the web page's URL.
     #   Otherwise, the HTML comes from the database object. A html of nil will
     #   be defaulted to an empty String.
-    def initialize(url_or_obj, html = '')
+    def initialize(url_or_obj, html = '', encode_html: true)
       if url_or_obj.is_a?(String)
-        init_from_strings(url_or_obj, html)
+        init_from_strings(url_or_obj, html, encode_html: encode_html)
       else
-        init_from_object(url_or_obj)
+        init_from_object(url_or_obj, encode_html: encode_html)
       end
     end
@@ -91,25 +92,28 @@ module Wgit
     # instance variables upon Document initialization. See the default
     # extensions defined in 'document_extensions.rb' as examples.
     #
-    # Initialises a private instance variable with the xpath or database object
-    # result(s). When initialising from HTML, a true singleton value will only
-    # ever return one result otherwise all xpath results are returned in an
+    # Note that defined extensions work for both Documents initialized from
+    # HTML (via Wgit::Crawler methods) and from database objects.
+    # An extension once defined, initializes a private instance variable with
+    # the xpath or database object result(s).
+    #
+    # When initialising from HTML, a singleton value of true will only
+    # ever return one result; otherwise all xpath results are returned in an
     # Array. When initialising from a database object, the value is taken as
     # is and singleton is only used to define the default empty value.
     # If a value cannot be found (in either the HTML or database object), then
-    # a default will be used. The default value is: singleton ? nil : [].
-    #
-    # Note that defined extensions work for both documents initialized from
-    # the WWW (via Wgit::Crawler methods) and from database objects. This
-    # effectively implements ORM like behavior using this class.
+    # a default will be used. The default value is: `singleton ? nil : []`.
     #
     # @param var [Symbol] The name of the variable to be initialised.
     # @param xpath [String, Object#call] The xpath used to find the element(s)
-    #   of the webpage. Pass a callable object (proc etc.) if you want the
+    #   of the webpage. Only used when initializing from HTML.
+    #
+    #   Pass a callable object (proc etc.) if you want the
     #   xpath value to be derived on Document initialisation (instead of when
     #   the extension is defined). The call method must return a valid xpath
     #   String.
-    # @param options [Hash] The options to define an extension with.
+    # @param options [Hash] The options to define an extension with. The
+    #   options are only used when initializing from HTML, not the database.
     # @option options [Boolean] :singleton The singleton option determines
     #   whether or not the result(s) should be in an Array. If multiple
     #   results are found and singleton is true then the first result will be
@@ -117,16 +121,17 @@ module Wgit
     # @option options [Boolean] :text_content_only The text_content_only option
     #   if true will use the text content of the Nokogiri result object,
     #   otherwise the Nokogiri object itself is returned. Defaults to true.
-    # @yield [value, source] Yields the value (Object) about to be assigned to
-    #   the new var and the source (Symbol) of the value (either :html or
-    #   :object). The return value of the block becomes the new var value,
-    #   unless nil. Return nil if you want to inspect but not change the var
-    #   value. The block gets executed when a Document is initialized from html
-    #   or an object e.g. database.
+    # @yield [value, source, type] Yields the value (Object) about to be
+    #   assigned to the new var, the source of the value (Wgit::Document or DB
+    #   Object) and the source type (Symbol of either :document or :object).
+    #
+    #   The return value of the block becomes the new var value, unless nil.
+    #   Return nil if you want to inspect but not change the var value. The
+    #   block is executed when a Wgit::Document is initialized.
     # @raise [StandardError] If the var param isn't valid.
-    # @return [Symbol] The first half of the newly defined method names e.g.
-    #   if var == "title" then :init_title is returned.
+    # @return [Symbol] The given var Symbol.
     def self.define_extension(var, xpath, options = {}, &block)
+      var = var.to_sym
       default_options = { singleton: true, text_content_only: true }
       options = default_options.merge(options)
@@ -149,7 +154,7 @@ module Wgit
       end
       Document.send :private, func_name
-      "init_#{var}".to_sym
+      var
     end
     # Removes the init_* methods created when an extension is defined.
@@ -189,55 +194,48 @@ module Wgit
       @html[range]
     end
-    # Returns the timestamp of when this Document was crawled.
-    #
-    # @return [Time] Time of when this Document was crawled.
-    def date_crawled
-      @url.date_crawled
-    end
-    # Returns the duration of the crawl for this Document (in seconds).
-    #
-    # @return [Float] The duration of the crawl for this Document.
-    def crawl_duration
-      @url.crawl_duration
-    end
     # Returns the base URL of this Wgit::Document. The base URL is either the
     # <base> element's href value or @url (if @base is nil). If @base is
     # present and relative, then @url.to_base + @base is returned. This method
     # should be used instead of `doc.url.to_base` etc. when manually building
-    # absolute links from relative links.
+    # absolute links from relative links; or use `link.prefix_base(doc)`.
     #
     # Provide the `link:` parameter to get the correct base URL for that type
     # of link. For example, a link of `#top` would always return @url because
     # it applies to that page, not a different one. Query strings work in the
     # same way. Use this parameter if manually concatting Url's e.g.
     #
-    #   relative_link = Wgit::Url.new '?q=hello'
+    #   relative_link = Wgit::Url.new('?q=hello')
     #   absolute_link = doc.base_url(link: relative_link).concat(relative_link)
     #
     # This is similar to how Wgit::Document#internal_absolute_links works.
     #
     # @param link [Wgit::Url, String] The link to obtain the correct base URL
-    #   for.
+    #   for; must be relative, not absolute.
+    # @raise [StandardError] If link is absolute or if a base URL can't be
+    #   established e.g. the doc @url is relative and <base> is nil.
     # @return [Wgit::Url] The base URL of this Document e.g.
     #   'http://example.com/public'.
     def base_url(link: nil)
+      raise "Document @url ('#{@url}') cannot be relative if <base> is nil" \
+      if @url.relative? && @base.nil?
+      raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't be relative" \
+      if @url.relative? && @base&.relative?
       get_base = -> { @base.relative? ? @url.to_base.concat(@base) : @base }
       if link
         link = Wgit::Url.new(link)
         raise "link must be relative: #{link}" unless link.relative?
-        if link.is_anchor? || link.is_query?
+        if link.is_fragment? || link.is_query?
           base_url = @base ? get_base.call : @url
-          return base_url.without_anchor.without_query
+          return base_url.omit_fragment.omit_query
         end
       end
-      base_url = @base ? get_base.call : @url.base
-      base_url.without_anchor.without_query
+      base_url = @base ? get_base.call : @url.to_base
+      base_url.omit_fragment.omit_query
     end
     # Returns a Hash containing this Document's instance vars.
@@ -340,7 +338,7 @@ module Wgit
       links = @links
               .select { |link| link.relative?(host: @url.to_base) }
-              .map(&:without_base)
+              .map(&:omit_base)
               .map do |link| # Map @url.to_host into / as it's a duplicate.
         link.to_host == @url.to_host ? Wgit::Url.new('/') : link
       end
@@ -354,7 +352,7 @@ module Wgit
     #
     # @return [Array<Wgit::Url>] Self's internal Url's in absolute form.
     def internal_absolute_links
-      internal_links.map { |link| base_url(link: link).concat(link) }
+      internal_links.map { |link| link.prefix_base(self) }
     end
     # Returns all external links from this Document in absolute form. External
@@ -366,7 +364,7 @@ module Wgit
       links = @links
               .reject { |link| link.relative?(host: @url.to_base) }
-              .map(&:without_trailing_slash)
+              .map(&:omit_trailing_slash)
       Wgit::Utils.process_arr(links)
     end
@@ -438,7 +436,7 @@ module Wgit
       orig_text = @text
       @text = search(
         query, case_sensitive: case_sensitive,
-        whole_sentence: whole_sentence, sentence_limit: sentence_limit
+               whole_sentence: whole_sentence, sentence_limit: sentence_limit
       )
       orig_text
@@ -473,7 +471,7 @@ module Wgit
     # @yield [value, source] Given the value (String/Object) before it's set as
     #   an instance variable so that you can inspect/alter the value if
     #   desired. Return nil from the block if you don't want to override the
-    #   value. Also given the source (Symbol) which is always :html.
+    #   value. Also given the source (self) and type (Symbol), always :document.
     # @return [String, Object] The value found in the html or the default value
     #   (singleton ? nil : []).
     def find_in_html(xpath, singleton: true, text_content_only: true)
@@ -492,7 +490,7 @@ module Wgit
       singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
       if block_given?
-        new_result = yield(result, :html)
+        new_result = yield(result, self, :document)
         result = new_result unless new_result.nil?
       end
@@ -519,7 +517,7 @@ module Wgit
       singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
       if block_given?
-        new_result = yield(result, :object)
+        new_result = yield(result, obj, :object)
         result = new_result unless new_result.nil?
       end
@@ -529,19 +527,19 @@ module Wgit
     private
     # Initialise the Document from URL and HTML Strings.
-    def init_from_strings(url, html)
+    def init_from_strings(url, html, encode_html: true)
       assert_types(html, [String, NilClass])
       # We already know url.is_a?(String) so parse into Url unless already so.
       url = Wgit::Url.parse(url)
-      url.crawled = true unless url.crawled # Avoid overriding date_crawled.
+      url.crawled = true unless url.crawled? # Avoid overriding date_crawled.
       @url   = url
       @html  = html || ''
       @doc   = init_nokogiri
       @score = 0.0
-      process_url_and_html
+      Wgit::Utils.process_str(@html, encode: encode_html)
       # Dynamically run the init_*_from_html methods.
       Document.private_instance_methods(false).each do |method|
@@ -554,7 +552,7 @@ module Wgit
     # Initialise the Document from a Hash like Object containing Strings as
     # keys e.g. database collection object or Hash.
-    def init_from_object(obj)
+    def init_from_object(obj, encode_html: true)
       assert_respond_to(obj, :fetch)
       @url   = Wgit::Url.new(obj.fetch('url')) # Should always be present.
@@ -562,7 +560,7 @@ module Wgit
       @doc   = init_nokogiri
       @score = obj.fetch('score', 0.0)
-      process_url_and_html
+      Wgit::Utils.process_str(@html, encode: encode_html)
       # Dynamically run the init_*_from_object methods.
       Document.private_instance_methods(false).each do |method|
@@ -573,12 +571,6 @@ module Wgit
       end
     end
-    # Ensure the @url and @html Strings are correctly encoded etc.
-    def process_url_and_html
-      @url  = Wgit::Utils.process_str(@url)
-      @html = Wgit::Utils.process_str(@html)
-    end
     # Initialises an instance variable and defines a getter method for it.
     #
     # @param var [Symbol] The name of the variable to be initialized.
@@ -597,6 +589,7 @@ module Wgit
       end
     end
+    alias content                html
     alias statistics             stats
     alias internal_urls          internal_links
     alias internal_absolute_urls internal_absolute_links