RubyGems - wgit - Versions diffs - 0.5.1 → 0.6.0 - Mend

wgit 0.5.1 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

checksums.yaml +4 -4
data/.yardopts +7 -0
data/CHANGELOG.md +174 -0
data/CODE_OF_CONDUCT.md +76 -0
data/CONTRIBUTING.md +21 -0
data/LICENSE.txt +21 -0
data/README.md +399 -0
data/lib/wgit/crawler.rb +135 -119
data/lib/wgit/document.rb +45 -67
data/lib/wgit/document_extensions.rb +1 -1
data/lib/wgit/response.rb +6 -6
data/lib/wgit/url.rb +23 -14
data/lib/wgit/utils.rb +2 -2
data/lib/wgit/version.rb +1 -1
metadata +10 -5

data/lib/wgit/crawler.rb CHANGED

@@ -5,6 +5,7 @@ require_relative 'document'
 require_relative 'utils'
 require_relative 'assertable'
 require_relative 'response'
+require 'set'
 require 'typhoeus'
 module Wgit
@@ -14,17 +15,26 @@ module Wgit
   class Crawler
     include Assertable
+    # The URL file extensions (from `<a>` hrefs) which will be crawled by
+    # `#crawl_site`. The idea is to omit anything that isn't HTML and therefore
+    # doesn't keep the crawl of the site going. All URL's without a file
+    # extension will be crawled, because they're assumed to be HTML.
+    SUPPORTED_FILE_EXTENSIONS = Set.new(%w[
+      asp aspx cfm cgi htm html htmlx jsp php
+    ])
     # The amount of allowed redirects before raising an error. Set to 0 to
-    # disable redirects completely.
+    # disable redirects completely; or you can pass `follow_redirects: false`
+    # to any Wgit::Crawler.crawl_* method.
     attr_accessor :redirect_limit
     # The maximum amount of time (in seconds) a crawl request has to complete
     # before raising an error. Set to 0 to disable time outs completely.
     attr_accessor :time_out
-    # Whether or not to UTF-8 encode the HTML once crawled. Set to false if
-    # crawling more than just HTML e.g. images etc.
-    attr_accessor :encode_html
+    # Whether or not to UTF-8 encode the response body once crawled. Set to
+    # false if crawling more than just HTML e.g. images.
+    attr_accessor :encode
     # The Wgit::Response of the most recently crawled URL.
     attr_reader :last_response
@@ -36,18 +46,22 @@ module Wgit
     # @param time_out [Integer, Float] The maximum amount of time (in seconds)
     #   a crawl request has to complete before raising an error. Set to 0 to
     #   disable time outs completely.
-    # @param encode_html [Boolean] Whether or not to UTF-8 encode the HTML once
-    #   crawled. Set to false if crawling more than just HTML e.g. images etc.
-    def initialize(redirect_limit: 5, time_out: 5, encode_html: true)
+    # @param encode [Boolean] Whether or not to UTF-8 encode the response body
+    #   once crawled. Set to false if crawling more than just HTML e.g. images.
+    def initialize(redirect_limit: 5, time_out: 5, encode: true)
       @redirect_limit = redirect_limit
       @time_out       = time_out
-      @encode_html    = encode_html
+      @encode         = encode
     end
     # Crawls an entire website's HTML pages by recursively going through
-    # its internal <a> links. Each crawled Document is yielded to a block. Use
-    # the allow and disallow paths params to partially and selectively crawl a
-    # site.
+    # its internal `<a>` links. Each crawled Document is yielded to a block.
+    # Use `doc.empty?` to determine if the crawled link is valid.
+    #
+    # Use the allow and disallow paths params to partially and selectively
+    # crawl a site; the glob syntax is fully supported e.g. `'wiki/\*'` etc.
+    # Note that each path must NOT start with a slash; the only exception being
+    # a `/` on its own with no other characters, referring to the index page.
     #
     # Only redirects to the same host are followed. For example, the Url
     # 'http://www.example.co.uk/how' has a host of 'www.example.co.uk' meaning
@@ -61,65 +75,64 @@ module Wgit
     #   It is recommended that this URL be the index page of the site to give a
     #   greater chance of finding all pages within that site/host.
     # @param allow_paths [String, Array<String>] Filters links by selecting
-    #   them only if their path includes one of allow_paths.
+    #   them if their path `File.fnmatch?` one of allow_paths.
     # @param disallow_paths [String, Array<String>] Filters links by rejecting
-    #   them if their path includes one of disallow_paths.
+    #   them if their path `File.fnmatch?` one of disallow_paths.
     # @yield [doc] Given each crawled page (Wgit::Document) of the site.
     #   A block is the only way to interact with each crawled Document.
+    #   Use `doc.empty?` to determine if the page is valid.
     # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
-    #   from all of the site's pages or nil if the url could not be
+    #   from all of the site's pages or nil if the given url could not be
     #   crawled successfully.
     def crawl_site(url, allow_paths: nil, disallow_paths: nil, &block)
       doc = crawl_url(url, &block)
       return nil if doc.nil?
-      crawl_opts = { follow_external_redirects: false, host: url.to_base }
-      link_opts  = { allow_paths: allow_paths, disallow_paths: disallow_paths }
+      path_opts = { allow_paths: allow_paths, disallow_paths: disallow_paths }
       alt_url   = url.end_with?('/') ? url.chop : url + '/'
-      crawled   = [url, alt_url]
-      externals = doc.external_links
-      internals = get_internal_links(doc, link_opts)
-      return doc.external_links.uniq if internals.empty?
+      crawled   = Set.new([url, alt_url])
+      externals = Set.new(doc.external_links)
+      internals = Set.new(get_internal_links(doc, path_opts))
-      loop do
-        crawled.uniq!
-        internals.uniq!
+      return externals.to_a if internals.empty?
+      loop do
         links = internals - crawled
         break if links.empty?
         links.each do |link|
           orig_link = link.dup
-          doc = crawl_url(link, crawl_opts, &block)
+          doc = crawl_url(link, follow_redirects: :host, &block)
-          crawled.push(orig_link, link) # Push both in case of redirects.
+          crawled += [orig_link, link] # Push both links in case of redirects.
           next if doc.nil?
-          internals.concat(get_internal_links(doc, link_opts))
-          externals.concat(doc.external_links)
+          internals += get_internal_links(doc, path_opts)
+          externals += doc.external_links
         end
       end
-      externals.uniq
+      externals.to_a
     end
     # Crawls one or more individual urls using Wgit::Crawler#crawl_url
     # underneath. See Wgit::Crawler#crawl_site for crawling entire sites.
     #
     # @param urls [*Wgit::Url] The Url's to crawl.
+    # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+    #   redirects. Pass a Symbol to limit where the redirect is allowed to go
+    #   e.g. :host only allows redirects within the same host. Choose from
+    #   :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+    #   This value will be used for all urls crawled.
     # @yield [doc] Given each crawled page (Wgit::Document); this is the only
     #   way to interact with them.
     # @raise [StandardError] If no urls are provided.
     # @return [Wgit::Document] The last Document crawled.
-    def crawl_urls(*urls, follow_external_redirects: true, host: nil, &block)
+    def crawl_urls(*urls, follow_redirects: true, &block)
       raise 'You must provide at least one Url' if urls.empty?
-      opts = {
-        follow_external_redirects: follow_external_redirects,
-        host: host
-      }
+      opts = { follow_redirects: follow_redirects }
       doc = nil
       Wgit::Utils.each(urls) { |url| doc = crawl_url(url, opts, &block) }
@@ -131,33 +144,22 @@ module Wgit
     # occurs.
     #
     # @param url [Wgit::Url] The Url to crawl; which will likely be modified.
-    # @param follow_external_redirects [Boolean] Whether or not to follow
-    #   an external redirect. External meaning to a different host. False will
-    #   return nil for such a crawl. If false, you must also provide a `host:`
-    #   parameter.
-    # @param host [Wgit::Url, String] Specify the host by which
-    #   an absolute redirect is determined to be internal or not. Must be
-    #   absolute and contain a protocol prefix. For example, a `host:` of
-    #   'http://www.example.com' will only allow redirects for Url's with a
-    #   `to_host` value of 'www.example.com'.
+    # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+    #   redirects. Pass a Symbol to limit where the redirect is allowed to go
+    #   e.g. :host only allows redirects within the same host. Choose from
+    #   :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @yield [doc] The crawled HTML page (Wgit::Document) regardless if the
     #   crawl was successful or not. Therefore, Document#url etc. can be used.
     # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
     #   crawl was unsuccessful.
-    def crawl_url(url, follow_external_redirects: true, host: nil)
+    def crawl_url(url, follow_redirects: true)
       # A String url isn't allowed because it's passed by value not reference,
       # meaning a redirect isn't reflected; A Wgit::Url is passed by reference.
       assert_type(url, Wgit::Url)
-      raise 'host cannot be nil if follow_external_redirects is false' \
-      if !follow_external_redirects && host.nil?
-      html = fetch(
-        url,
-        follow_external_redirects: follow_external_redirects,
-        host: host
-      )
+      html = fetch(url, follow_redirects: follow_redirects)
+      doc  = Wgit::Document.new(url, html, encode: @encode)
-      doc = Wgit::Document.new(url, html, encode_html: @encode_html)
       yield(doc) if block_given?
       doc.empty? ? nil : doc
@@ -171,26 +173,18 @@ module Wgit
     #
     # @param url [Wgit::Url] The URL to fetch. This Url object is passed by
     #   reference and gets modified as a result of the fetch/crawl.
-    # @param follow_external_redirects [Boolean] Whether or not to follow
-    #   an external redirect. False will return nil for such a crawl. If false,
-    #   you must also provide a `host:` parameter.
-    # @param host [Wgit::Url, String] Specify the host by which
-    #   an absolute redirect is determined to be internal or not. Must be
-    #   absolute and contain a protocol prefix. For example, a `host:` of
-    #   'http://www.example.com' will only allow redirects for Urls with a
-    #   `to_host` value of 'www.example.com'.
+    # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+    #   redirects. Pass a Symbol to limit where the redirect is allowed to go
+    #   e.g. :host only allows redirects within the same host. Choose from
+    #   :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+    # @raise [StandardError] If url isn't valid and absolute.
     # @return [String, nil] The crawled HTML or nil if the crawl was
     #   unsuccessful.
-    def fetch(url, follow_external_redirects: true, host: nil)
+    def fetch(url, follow_redirects: true)
       response = Wgit::Response.new
+      raise "Invalid url: #{url}" if url.invalid?
-      resolve(
-        url,
-        response,
-        follow_external_redirects: follow_external_redirects,
-        host: host
-      )
+      resolve(url, response, follow_redirects: follow_redirects)
       response.body_or_nil
     rescue StandardError => e
       Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e}")
@@ -209,16 +203,15 @@ module Wgit
     # @param url [Wgit::Url] The URL to GET and resolve.
     # @param response [Wgit::Response] The response to enrich. Modifies by
     #   reference.
-    # @param follow_external_redirects [Boolean] Whether or not to follow
-    #   an external redirect. If false, you must also provide a `host:`
-    #   parameter.
-    # @param host [Wgit::Url, String] Specify the host by which
-    #   an absolute redirect is determined to be internal or not. Must be
-    #   absolute and contain a protocol prefix. For example, a `host:` of
-    #   'http://www.example.com' will only allow redirects for Urls with a
-    #   `to_host` value of 'www.example.com'.
+    # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+    #   redirects. Pass a Symbol to limit where the redirect is allowed to go
+    #   e.g. :host only allows redirects within the same host. Choose from
+    #   :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @raise [StandardError] If a redirect isn't allowed etc.
-    def resolve(url, response, follow_external_redirects: true, host: nil)
+    def resolve(url, response, follow_redirects: true)
+      orig_url_base = url.to_url.to_base # Recorded before any redirects.
+      follow_redirects, within = redirect?(follow_redirects)
       loop do
         get_response(url, response)
         break unless response.redirect?
@@ -229,10 +222,11 @@ module Wgit
         yield(url, response, location) if block_given?
-        # Validate redirect.
-        if !follow_external_redirects && !location.relative?(host: host)
-          raise "External redirect not allowed - Redirected to: \
-'#{location}', which is outside of host: '#{host}'"
+        # Validate if the redirect is allowed.
+        raise "Redirect not allowed: #{location}" unless follow_redirects
+        if within && !location.relative?(within => orig_url_base)
+          raise "Redirect (outside of #{within}) is not allowed: '#{location}'"
         end
         raise "Too many redirects, exceeded: #{@redirect_limit}" \
@@ -268,15 +262,8 @@ module Wgit
       response.ip_address       = http_response.primary_ip
       response.add_total_time(http_response.total_time)
-      # Log (debug) the request/response details.
-      resp_template  = '[http] Response: %s (%s bytes in %s seconds)'
-      log_status     = (response.status || 0)
-      log_total_time = response.total_time.truncate(3)
-      Wgit.logger.debug("[http] Request:  #{response.url}")
-      Wgit.logger.debug(
-        format(resp_template, log_status, response.size, log_total_time)
-      )
+      # Log the request/response details.
+      log_http(response)
       # Handle a failed response.
       raise "No response (within timeout: #{@time_out} second(s))" \
@@ -304,28 +291,29 @@ module Wgit
     # Returns a doc's internal HTML page links in absolute form; used when
     # crawling a site. Use the allow and disallow paths params to partially
-    # and selectively crawl a site.
+    # and selectively crawl a site; the glob syntax is supported e.g.
+    # `'wiki/\*'` etc. Note that each path should NOT start with a slash.
     #
     # Override this method in a subclass to change how a site
-    # is crawled; not what is extracted from each page (Document extensions
+    # is crawled, not what is extracted from each page (Document extensions
     # should be used for this purpose instead). Just remember that only HTML
-    # files containing <a> links can keep the crawl going beyond the base URL.
+    # files containing `<a>` links keep the crawl going beyond the base URL.
     #
     # @param doc [Wgit::Document] The document from which to extract it's
-    #   internal page links.
+    #   internal (absolute) page links.
     # @param allow_paths [String, Array<String>] Filters links by selecting
-    #   them only if their path includes one of allow_paths.
+    #   them if their path `File.fnmatch?` one of allow_paths.
     # @param disallow_paths [String, Array<String>] Filters links by rejecting
-    #   them if their path includes one of disallow_paths.
+    #   them if their path `File.fnmatch?` one of disallow_paths.
     # @return [Array<Wgit::Url>] The internal page links from doc.
     def get_internal_links(doc, allow_paths: nil, disallow_paths: nil)
       links = doc
               .internal_absolute_links
               .map(&:omit_fragment) # Because fragments don't alter content.
               .uniq
-              .reject do |link|
+              .select do |link|
         ext = link.to_extension
-        ext ? !%w[htm html].include?(ext.downcase) : false
+        ext ? SUPPORTED_FILE_EXTENSIONS.include?(ext.downcase) : true
       end
       return links if allow_paths.nil? && disallow_paths.nil?
@@ -335,40 +323,68 @@ module Wgit
     private
+    # Returns whether or not to follow redirects, and within what context e.g.
+    # :host, :domain etc.
+    def redirect?(follow_redirects)
+      return [true, follow_redirects] if follow_redirects.is_a?(Symbol)
+      unless [true, false].include?(follow_redirects)
+        raise "follow_redirects: must be a Boolean or Symbol, not: \
+#{follow_redirects}"
+      end
+      [follow_redirects, nil]
+    end
+    # Log (at debug level) the HTTP request/response details.
+    def log_http(response)
+      resp_template  = '[http] Response: %s (%s bytes in %s seconds)'
+      log_status     = (response.status || 0)
+      log_total_time = response.total_time.truncate(3)
+      Wgit.logger.debug("[http] Request:  #{response.url}")
+      Wgit.logger.debug(
+        format(resp_template, log_status, response.size, log_total_time)
+      )
+    end
     # Validate and filter by the given URL paths.
     def process_paths(links, allow_paths, disallow_paths)
-      raise "You can't provide both allow_paths: and disallow_paths: params" \
-      if allow_paths && disallow_paths
-      if allow_paths  # White list.
-        filter_method = :select
-        paths         = allow_paths
-      else            # Black list.
-        filter_method = :reject
-        paths         = disallow_paths
+      if allow_paths
+        paths = validate_paths(allow_paths)
+        filter_links(links, :select!, paths)
       end
+      if disallow_paths
+        paths = validate_paths(disallow_paths)
+        filter_links(links, :reject!, paths)
+      end
+      links
+    end
+    # Validate the paths are suitable for filtering.
+    def validate_paths(paths)
       paths = [paths] unless paths.is_a?(Array)
-      paths = paths
-              .compact
-              .reject(&:empty?)
-              .uniq
-              .map { |path| Wgit::Url.new(path).to_path }
+      raise 'The provided paths must all be Strings' \
+      unless paths.all? { |path| path.is_a?(String) }
+      Wgit::Utils.process_arr(paths, encode: false)
       raise 'The provided paths cannot be empty' if paths.empty?
-      filter_links_by_path(links, filter_method, paths)
+      paths
     end
-    # Filters links by selecting or rejecting them based on their path.
-    def filter_links_by_path(links, filter_method, paths)
+    # Filters links by selecting/rejecting them based on their path.
+    # Uses File.fnmatch? so that globbing is supported.
+    def filter_links(links, filter_method, paths)
       links.send(filter_method) do |link|
-        link_path = link.to_path
-        next(false) unless link_path
+        # Turn http://example.com into / meaning index.
+        link = link.to_endpoint == '/' ? '/' : link.omit_base
         match = false
-        paths.each do |path|
-          match = link_path.start_with?(path)
+        paths.each do |pattern|
+          match = File.fnmatch?(pattern, link, File::FNM_EXTGLOB)
           break if match
         end

data/lib/wgit/document.rb CHANGED

@@ -7,41 +7,31 @@ require 'json'
 module Wgit
   # Class primarily modeling a HTML web document, although other MIME types
   # will work e.g. images etc. Also doubles as a search result when
-  # loading Documents from the database via Wgit::Database#search.
+  # loading Documents from the database via `Wgit::Database#search`.
   #
   # The initialize method dynamically initializes instance variables from the
   # Document HTML / Database object e.g. text. This bit is dynamic so that the
   # Document class can be easily extended allowing you to pull out the bits of
-  # a webpage that are important to you. See Wgit::Document.define_extension.
+  # a webpage that are important to you. See `Wgit::Document.define_extension`.
   class Document
     include Assertable
     # Regex for the allowed var names when defining an extension.
     REGEX_EXTENSION_NAME = /[a-z0-9_]+/.freeze
-    # The HTML elements that make up the visible text on a page.
-    # These elements are used to initialize the @text of the Document.
-    # See the README.md for how to add to this Array dynamically.
-    @text_elements = %i[
-      dd div dl dt figcaption figure hr li
-      main ol p pre span ul h1 h2 h3 h4 h5
-    ]
-    class << self
-      # Class level instance reader method for @text_elements.
-      attr_reader :text_elements
-    end
+    # The xpath used to extract the visible text on a page.
+    TEXT_ELEMENTS_XPATH = '//*/text()'.freeze
     # The URL of the webpage, an instance of Wgit::Url.
     attr_reader :url
-    # The HTML of the webpage, an instance of String.
+    # The content/HTML of the document, an instance of String.
     attr_reader :html
     # The Nokogiri::HTML document object initialized from @html.
     attr_reader :doc
-    # The score is only used following a Database#search and records matches.
+    # The score is only used following a `Database#search` and records matches.
     attr_reader :score
     # Initialize takes either two strings (representing the URL and HTML) or an
@@ -50,44 +40,29 @@ module Wgit
     # pages retrieved from the database.
     #
     # During initialisation, the Document will call any private
-    # 'init_*_from_html' and 'init_*_from_object' methods it can find. See the
+    # `init_*_from_html` and `init_*_from_object` methods it can find. See the
     # README.md and Wgit::Document.define_extension method for more details.
     #
-    # @param url_or_obj [String, Wgit::Url, Object#fetch] Either a String
+    # @param url_or_obj [String, Wgit::Url, #fetch] Either a String
     #   representing a URL or a Hash-like object responding to :fetch. e.g. a
     #   MongoDB collection object. The Object's :fetch method should support
     #   Strings as keys.
-    # @param html [String, NilClass] The crawled web page's HTML. This param is
-    #   only used if url_or_obj is a String representing the web page's URL.
-    #   Otherwise, the HTML comes from the database object. A html of nil will
-    #   be defaulted to an empty String.
-    def initialize(url_or_obj, html = '', encode_html: true)
+    # @param html [String, NilClass] The crawled web page's content/HTML. This
+    #   param is only used if url_or_obj is a String representing the web
+    #   page's URL. Otherwise, the HTML comes from the database object. A html
+    #   of nil will be defaulted to an empty String.
+    # @param encode [Boolean] Whether or not to UTF-8 encode the html. Set to
+    #   false if the Document content is an image etc.
+    def initialize(url_or_obj, html = '', encode: true)
       if url_or_obj.is_a?(String)
-        init_from_strings(url_or_obj, html, encode_html: encode_html)
+        init_from_strings(url_or_obj, html, encode: encode)
       else
-        init_from_object(url_or_obj, encode_html: encode_html)
+        init_from_object(url_or_obj, encode: encode)
       end
     end
     ### Document Class Methods ###
-    # Uses Document.text_elements to build an xpath String, used to obtain
-    # all of the combined text on a webpage.
-    #
-    # @return [String] An xpath String to obtain a webpage's text elements.
-    def self.text_elements_xpath
-      xpath = ''
-      return xpath if Wgit::Document.text_elements.empty?
-      el_xpath = '//%s/text()'
-      Wgit::Document.text_elements.each_with_index do |el, i|
-        xpath += ' | ' unless i.zero?
-        xpath += format(el_xpath, el)
-      end
-      xpath
-    end
     # Defines an extension, which is a way to serialise HTML elements into
     # instance variables upon Document initialization. See the default
     # extensions defined in 'document_extensions.rb' as examples.
@@ -105,35 +80,36 @@ module Wgit
     # a default will be used. The default value is: `singleton ? nil : []`.
     #
     # @param var [Symbol] The name of the variable to be initialised.
-    # @param xpath [String, Object#call] The xpath used to find the element(s)
+    # @param xpath [String, #call] The xpath used to find the element(s)
     #   of the webpage. Only used when initializing from HTML.
     #
     #   Pass a callable object (proc etc.) if you want the
     #   xpath value to be derived on Document initialisation (instead of when
     #   the extension is defined). The call method must return a valid xpath
     #   String.
-    # @param options [Hash] The options to define an extension with. The
+    # @param opts [Hash] The options to define an extension with. The
     #   options are only used when intializing from HTML, not the database.
-    # @option options [Boolean] :singleton The singleton option determines
+    # @option opts [Boolean] :singleton The singleton option determines
     #   whether or not the result(s) should be in an Array. If multiple
     #   results are found and singleton is true then the first result will be
     #   used. Defaults to true.
-    # @option options [Boolean] :text_content_only The text_content_only option
+    # @option opts [Boolean] :text_content_only The text_content_only option
     #   if true will use the text content of the Nokogiri result object,
     #   otherwise the Nokogiri object itself is returned. Defaults to true.
-    # @yield [value, source, type] Yields the value (Object) about to be
-    #   assigned to the new var, the source of the value (Wgit::Document or DB
-    #   Object) and the source type (Symbol of either :document or :object).
-    #
-    #   The return value of the block becomes the new var value, unless nil.
-    #   Return nil if you want to inspect but not change the var value. The
-    #   block is executed when a Wgit::Document is initialized.
+    # @yieldparam value [Object] The value to be assigned to the new var.
+    # @yieldparam source [Wgit::Document, Object] The source of the value.
+    # @yieldparam type [Symbol] The source type, either :document or (DB)
+    #   :object.
+    # @yieldreturn [Object] The return value of the block becomes the new var
+    #   value, unless nil. Return nil if you want to inspect but not change the
+    #   var value. The block is executed when a Wgit::Document is initialized,
+    #   regardless of the source.
     # @raise [StandardError] If the var param isn't valid.
-    # @return [Symbol] The given var Symbol.
-    def self.define_extension(var, xpath, options = {}, &block)
+    # @return [Symbol] The given var Symbol if successful.
+    def self.define_extension(var, xpath, opts = {}, &block)
       var = var.to_sym
-      default_options = { singleton: true, text_content_only: true }
-      options = default_options.merge(options)
+      defaults = { singleton: true, text_content_only: true }
+      opts = defaults.merge(opts)
       raise "var must match #{REGEX_EXTENSION_NAME}" unless \
       var =~ REGEX_EXTENSION_NAME
@@ -141,7 +117,7 @@ module Wgit
       # Define the private init_*_from_html method for HTML.
       # Gets the HTML's xpath value and creates a var for it.
       func_name = Document.send(:define_method, "init_#{var}_from_html") do
-        result = find_in_html(xpath, options, &block)
+        result = find_in_html(xpath, opts, &block)
         init_var(var, result)
       end
       Document.send :private, func_name
@@ -149,7 +125,7 @@ module Wgit
       # Define the private init_*_from_object method for a Database object.
       # Gets the Object's 'key' value and creates a var for it.
       func_name = Document.send(:define_method, "init_#{var}_from_object") do |obj|
-        result = find_in_object(obj, var.to_s, singleton: options[:singleton], &block)
+        result = find_in_object(obj, var.to_s, singleton: opts[:singleton], &block)
         init_var(var, result)
       end
       Document.send :private, func_name
@@ -381,7 +357,7 @@ module Wgit
     # original sentence, which ever is less. The algorithm obviously ensures
     # that the search query is visible somewhere in the sentence.
     #
-    # @param query [String, Object#to_s] The value to search the document's
+    # @param query [String, #to_s] The value to search the document's
     #   @text for.
     # @param case_sensitive [Boolean] Whether character case must match.
     # @param whole_sentence [Boolean] Whether multiple words should be searched
@@ -401,10 +377,12 @@ module Wgit
       results = {}
       @text.each do |sentence|
+        sentence = sentence.strip
+        next if results[sentence]
         hits = sentence.scan(regex).count
         next unless hits.positive?
-        sentence.strip!
         index = sentence.index(regex) # Index of first match.
         Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
@@ -422,7 +400,7 @@ module Wgit
     # functionality. The original text is returned; no other reference to it
     # is kept thereafter.
     #
-    # @param query [String, Object#to_s] The value to search the document's
+    # @param query [String, #to_s] The value to search the document's
     #   @text for.
     # @param case_sensitive [Boolean] Whether character case must match.
     # @param whole_sentence [Boolean] Whether multiple words should be searched
@@ -499,7 +477,7 @@ module Wgit
     # Returns a value from the obj using the given key via obj#fetch.
     #
-    # @param obj [Object#fetch] The object containing the key/value.
+    # @param obj [#fetch] The object containing the key/value.
     # @param key [String] Used to find the value in the obj.
     # @param singleton [Boolean] True if a single value, false otherwise.
     # @yield [value, source] Given the value (String/Object) before it's set as
@@ -527,7 +505,7 @@ module Wgit
     private
     # Initialise the Document from URL and HTML Strings.
-    def init_from_strings(url, html, encode_html: true)
+    def init_from_strings(url, html, encode: true)
       assert_types(html, [String, NilClass])
       # We already know url.is_a?(String) so parse into Url unless already so.
@@ -539,7 +517,7 @@ module Wgit
       @doc   = init_nokogiri
       @score = 0.0
-      Wgit::Utils.process_str(@html, encode: encode_html)
+      Wgit::Utils.process_str(@html, encode: encode)
       # Dynamically run the init_*_from_html methods.
       Document.private_instance_methods(false).each do |method|
@@ -552,7 +530,7 @@ module Wgit
     # Initialise the Document from a Hash like Object containing Strings as
     # keys e.g. database collection object or Hash.
-    def init_from_object(obj, encode_html: true)
+    def init_from_object(obj, encode: true)
       assert_respond_to(obj, :fetch)
       @url   = Wgit::Url.new(obj.fetch('url')) # Should always be present.
@@ -560,7 +538,7 @@ module Wgit
       @doc   = init_nokogiri
       @score = obj.fetch('score', 0.0)
-      Wgit::Utils.process_str(@html, encode: encode_html)
+      Wgit::Utils.process_str(@html, encode: encode)
       # Dynamically run the init_*_from_object methods.
       Document.private_instance_methods(false).each do |method|