wgit 0.0.18 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wgit.rb +0 -1
- data/lib/wgit/assertable.rb +20 -23
- data/lib/wgit/core_ext.rb +6 -14
- data/lib/wgit/crawler.rb +94 -183
- data/lib/wgit/database/database.rb +209 -185
- data/lib/wgit/database/model.rb +7 -7
- data/lib/wgit/document.rb +281 -241
- data/lib/wgit/indexer.rb +99 -92
- data/lib/wgit/logger.rb +5 -1
- data/lib/wgit/url.rb +171 -185
- data/lib/wgit/utils.rb +57 -68
- data/lib/wgit/version.rb +1 -1
- metadata +86 -60
- data/CHANGELOG.md +0 -61
- data/LICENSE.txt +0 -21
- data/README.md +0 -361
- data/TODO.txt +0 -34
- data/lib/wgit/database/connection_details.rb +0 -41
data/lib/wgit/indexer.rb
CHANGED
@@ -5,28 +5,28 @@ require_relative 'database/database'
 
 module Wgit
 # Convience method to index the World Wide Web using
-# Wgit::Indexer#
+# Wgit::Indexer#index_www.
 #
 # Retrieves uncrawled url's from the database and recursively crawls each
 # site storing their internal pages into the database and adding their
 # external url's to be crawled later on. Logs info on the crawl
 # using Wgit.logger as it goes along.
 #
-# @param
+# @param max_sites [Integer] The number of separate and whole
 # websites to be crawled before the method exits. Defaults to -1 which
 # means the crawl will occur until manually stopped (Ctrl+C etc).
-# @param
+# @param max_data [Integer] The maximum amount of bytes that will be
 # scraped from the web (default is 1GB). Note, that this value is used to
 # determine when to stop crawling; it's not a guarantee of the max data
 # that will be obtained.
-def self.
+def self.index_www(max_sites: -1, max_data: 1_048_576_000)
 db = Wgit::Database.new
 indexer = Wgit::Indexer.new(db)
-indexer.
+indexer.index_www(max_sites: max_sites, max_data: max_data)
 end
 
 # Convience method to index a single website using
-# Wgit::Indexer#
+# Wgit::Indexer#index_site.
 #
 # Crawls a single website's pages and stores them into the database.
 # There is no max download limit so be careful which sites you index.
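
The module-level convenience method now takes keyword arguments instead of positional ones. A minimal usage sketch based on the added lines above (it assumes a MongoDB instance that Wgit::Database.new can reach; the limits shown are illustrative):

  require 'wgit'

  # Index until 3 whole sites have been crawled, or until roughly 500MB of
  # data has been scraped, whichever comes first.
  Wgit.index_www(max_sites: 3, max_data: 524_288_000)
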
@@ -34,18 +34,18 @@ module Wgit
 # @param url [Wgit::Url, String] The base Url of the website to crawl.
 # @param insert_externals [Boolean] Whether or not to insert the website's
 # external Url's into the database.
-# @yield [
-#
+# @yield [doc] Given the Wgit::Document of each crawled webpage, before it's
+# inserted into the database allowing for prior manipulation.
 # @return [Integer] The total number of pages crawled within the website.
-def self.
-url = Wgit::Url.
+def self.index_site(url, insert_externals: true, &block)
+url = Wgit::Url.parse(url)
 db = Wgit::Database.new
 indexer = Wgit::Indexer.new(db)
-indexer.
+indexer.index_site(url, insert_externals: insert_externals, &block)
 end
 
 # Convience method to index a single webpage using
-# Wgit::Indexer#
+# Wgit::Indexer#index_page.
 #
 # Crawls a single webpage and stores it into the database.
 # There is no max download limit so be careful of large pages.
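
A hedged sketch of the new Wgit.index_site signature, which now yields each crawled Wgit::Document to an optional block before insertion (the site URL below is illustrative and the same database setup is assumed):

  require 'wgit'

  total = Wgit.index_site('http://example.com', insert_externals: false) do |doc|
    puts doc.title # Inspect/manipulate each page before it's saved.
  end
  puts "Crawled #{total} pages"
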
@@ -53,36 +53,50 @@ module Wgit
 # @param url [Wgit::Url, String] The Url of the webpage to crawl.
 # @param insert_externals [Boolean] Whether or not to insert the website's
 # external Url's into the database.
-# @yield [
-#
-def self.
-url = Wgit::Url.
+# @yield [doc] Given the Wgit::Document of the crawled webpage, before it's
+# inserted into the database allowing for prior manipulation.
+def self.index_page(url, insert_externals: true, &block)
+url = Wgit::Url.parse(url)
 db = Wgit::Database.new
 indexer = Wgit::Indexer.new(db)
-indexer.
+indexer.index_page(url, insert_externals: insert_externals, &block)
 end
 
 # Performs a search of the database's indexed documents and pretty prints
-# the results. See Wgit::Database#search
+# the results. See Wgit::Database#search and Wgit::Document#search for
+# details of how the search works.
 #
 # @param query [String] The text query to search with.
+# @param case_sensitive [Boolean] Whether character case must match.
 # @param whole_sentence [Boolean] Whether multiple words should be searched
 # for separately.
-# @param limit [Integer] The max number of results to
+# @param limit [Integer] The max number of results to print.
 # @param skip [Integer] The number of DB records to skip.
-# @param
+# @param sentence_limit [Integer] The max length of each result's text
 # snippet.
-# @yield [
-
-
-
-results =
-
+# @yield [doc] Given each search result (Wgit::Document) returned from the
+# database.
+def self.indexed_search(query, case_sensitive: false, whole_sentence: false,
+limit: 10, skip: 0, sentence_limit: 80, &block)
+results = Wgit::Database.new.search(
+query, case_sensitive: case_sensitive, whole_sentence: whole_sentence,
+limit: limit, skip: skip, &block
+)
+
+results.each do |doc|
+doc.search!(
+query,
+case_sensitive: case_sensitive,
+whole_sentence: whole_sentence,
+sentence_limit: sentence_limit)
+end
+
+Wgit::Utils.printf_search_results(results)
 end
 
 # Class which sets up a crawler and saves the indexed docs to a database.
 class Indexer
-# The crawler used to
+# The crawler used to index the WWW.
 attr_reader :crawler
 
 # The database instance used to store Urls and Documents in.
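
Based on the new indexed_search shown above, a sketch of searching previously indexed documents and pretty printing the results (the query text and limits are illustrative):

  require 'wgit'

  # Prints up to 5 matching documents, trimming each text snippet to 60 chars.
  Wgit.indexed_search('ruby web crawler', limit: 5, sentence_limit: 60)
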
@@ -91,74 +105,73 @@ module Wgit
 # Initialize the Indexer.
 #
 # @param database [Wgit::Database] The database instance (already
-# initialized with the correct connection
+# initialized with the correct connection string etc).
 def initialize(database)
 @crawler = Wgit::Crawler.new
-@db
+@db = database
 end
 
 # Retrieves uncrawled url's from the database and recursively crawls each
 # site storing their internal pages into the database and adding their
-# external url's to be crawled later on. Logs info on the crawl
-#
+# external url's to be crawled later on. Logs info on the crawl using
+# Wgit.logger as it goes along.
 #
-# @param
+# @param max_sites [Integer] The number of separate and whole
 # websites to be crawled before the method exits. Defaults to -1 which
 # means the crawl will occur until manually stopped (Ctrl+C etc).
-# @param
+# @param max_data [Integer] The maximum amount of bytes that will be
 # scraped from the web (default is 1GB). Note, that this value is used to
 # determine when to stop crawling; it's not a guarantee of the max data
 # that will be obtained.
-def
-if
-Wgit.logger.info("Indexing until the database has been filled or it
-urls to crawl (which might be never).")
+def index_www(max_sites: -1, max_data: 1_048_576_000)
+if max_sites.negative?
+Wgit.logger.info("Indexing until the database has been filled or it \
+runs out of urls to crawl (which might be never).")
 end
 site_count = 0
 
-while keep_crawling?(site_count,
+while keep_crawling?(site_count, max_sites, max_data)
 Wgit.logger.info("Current database size: #{@db.size}")
-@crawler.urls = @db.uncrawled_urls
 
-
+uncrawled_urls = @db.uncrawled_urls(limit: 100)
+
+if uncrawled_urls.empty?
 Wgit.logger.info('No urls to crawl, exiting.')
 return
 end
-Wgit.logger.info("Starting crawl loop for: #{
+Wgit.logger.info("Starting crawl loop for: #{uncrawled_urls}")
 
 docs_count = 0
 urls_count = 0
 
-
-unless keep_crawling?(site_count,
-Wgit.logger.info("Reached max number of sites to crawl or
-capacity, exiting.")
+uncrawled_urls.each do |url|
+unless keep_crawling?(site_count, max_sites, max_data)
+Wgit.logger.info("Reached max number of sites to crawl or \
+database capacity, exiting.")
 return
 end
 site_count += 1
 
-url.crawled = true
-raise unless @db.update(url) == 1
-
 site_docs_count = 0
 ext_links = @crawler.crawl_site(url) do |doc|
-
-
-
-site_docs_count += 1
-end
+if !doc.empty? && write_doc_to_db(doc)
+docs_count += 1
+site_docs_count += 1
 end
 end
 
+raise 'Error updating url' unless @db.update(url) == 1
+
 urls_count += write_urls_to_db(ext_links)
+
 Wgit.logger.info("Crawled and saved #{site_docs_count} docs for the \
 site: #{url}")
 end
 
-Wgit.logger.info("Crawled and saved docs for #{docs_count} url(s)
-this iteration.")
-Wgit.logger.info("Found and saved #{urls_count} external url(s) for
-iteration.")
+Wgit.logger.info("Crawled and saved docs for #{docs_count} url(s) \
+overall for this iteration.")
+Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
+the next iteration.")
 
 nil
 end
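
The reworked Indexer#index_www now pulls uncrawled urls in batches of 100, only counts non-empty documents that insert successfully, and updates each url after its site has been crawled. A sketch of driving the class directly rather than via the module method (database connection details are assumed to be configured elsewhere):

  require 'wgit'

  db      = Wgit::Database.new
  indexer = Wgit::Indexer.new(db)

  # Equivalent to Wgit.index_www, but with an explicit Database instance.
  indexer.index_www(max_sites: 10)
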
@@ -171,30 +184,27 @@ iteration.")
 # @param url [Wgit::Url] The base Url of the website to crawl.
 # @param insert_externals [Boolean] Whether or not to insert the website's
 # external Url's into the database.
-# @yield [
-#
-#
-#
+# @yield [doc] Given the Wgit::Document of each crawled web page before
+# it's inserted into the database allowing for prior manipulation. Return
+# nil or false from the block to prevent the document from being saved
+# into the database.
 # @return [Integer] The total number of webpages/documents indexed.
-def
+def index_site(url, insert_externals: true)
 total_pages_indexed = 0
 
 ext_urls = @crawler.crawl_site(url) do |doc|
 result = true
 result = yield(doc) if block_given?
 
-if result
-
-
-Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
-end
+if result && !doc.empty? && write_doc_to_db(doc)
+total_pages_indexed += 1
+Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
 end
 end
 
-url.crawled = true
 @db.url?(url) ? @db.update(url) : @db.insert(url)
 
-if insert_externals
+if insert_externals && ext_urls
 write_urls_to_db(ext_urls)
 Wgit.logger.info("Found and saved #{ext_urls.length} external url(s)")
 end
@@ -212,27 +222,24 @@ site: #{url}")
 # @param url [Wgit::Url] The webpage Url to crawl.
 # @param insert_externals [Boolean] Whether or not to insert the webpage's
 # external Url's into the database.
-# @yield [
-# before it
+# @yield [doc] Given the Wgit::Document of the crawled webpage,
+# before it's inserted into the database allowing for prior
 # manipulation. Return nil or false from the block to prevent the
 # document from being saved into the database.
-def
-document = @crawler.
+def index_page(url, insert_externals: true)
+document = @crawler.crawl_url(url) do |doc|
 result = true
 result = yield(doc) if block_given?
 
-if result
-
-Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
-end
+if result && !doc.empty? && write_doc_to_db(doc)
+Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
 end
 end
 
-url.crawled = true
 @db.url?(url) ? @db.update(url) : @db.insert(url)
 
-
-
+ext_urls = document&.external_links
+if insert_externals && ext_urls
 write_urls_to_db(ext_urls)
 Wgit.logger.info("Found and saved #{ext_urls.length} external url(s)")
 end
@@ -246,20 +253,16 @@ site: #{url}")
 # loop iteration.
 #
 # @param site_count [Integer] The current number of crawled sites.
-# @param
-# before stopping.
-# @param
+# @param max_sites [Integer] The maximum number of sites to crawl
+# before stopping. Use -1 for an infinite number of sites.
+# @param max_data [Integer] The maximum amount of data to crawl before
 # stopping.
 # @return [Boolean] True if the crawl should continue, false otherwise.
-def keep_crawling?(site_count,
-return false if @db.size >=
-
-
-
-true
-else
-site_count < max_sites_to_crawl
-end
+def keep_crawling?(site_count, max_sites, max_data)
+return false if @db.size >= max_data
+return true if max_sites.negative?
+
+site_count < max_sites
 end
 
 # Write the doc to the DB. Note that the unique url index on the documents
@@ -270,9 +273,11 @@ site: #{url}")
 def write_doc_to_db(doc)
 @db.insert(doc)
 Wgit.logger.info("Saved document for url: #{doc.url}")
+
 true
 rescue Mongo::Error::OperationFailure
 Wgit.logger.info("Document already exists: #{doc.url}")
+
 false
 end
 
@@ -283,6 +288,7 @@ site: #{url}")
 # @return [Boolean] True if the write was successful, false otherwise.
 def write_urls_to_db(urls)
 count = 0
+
 if urls.respond_to?(:each)
 urls.each do |url|
 @db.insert(url)
@@ -292,6 +298,7 @@ site: #{url}")
 Wgit.logger.info("Url already exists: #{url}")
 end
 end
+
 count
 end
 end
data/lib/wgit/logger.rb
CHANGED
@@ -6,16 +6,18 @@ require 'logger'
 
 module Wgit
 # The Logger instance used by Wgit. Set your own custom logger after
-# requiring this file
+# requiring this file as needed.
 @logger = nil
 
 # Returns the current Logger instance.
+#
 # @return [Logger] The current Logger instance.
 def self.logger
 @logger
 end
 
 # Sets the current Logger instance.
+#
 # @param logger [Logger] The Logger instance to use.
 # @return [Logger] The current Logger instance having being set.
 def self.logger=(logger)
@@ -23,6 +25,7 @@ module Wgit
 end
 
 # Returns the default Logger instance.
+#
 # @return [Logger] The default Logger instance.
 def self.default_logger
 logger = Logger.new(STDOUT, progname: 'wgit', level: :info)
@@ -33,6 +36,7 @@ module Wgit
 end
 
 # Sets the default Logger instance to be used by Wgit.
+#
 # @return [Logger] The default Logger instance.
 def self.use_default_logger
 @logger = default_logger
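
The logger remains a module-level attribute; this diff only touches the doc comments. A small sketch of swapping in a custom logger via the existing setter (the log file name is illustrative):

  require 'logger'
  require 'wgit'

  # Send Wgit's crawl/indexing output to a file instead of STDOUT.
  Wgit.logger = Logger.new('wgit.log', progname: 'wgit', level: :debug)
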
data/lib/wgit/url.rb
CHANGED
@@ -6,10 +6,11 @@ require 'uri'
 require 'addressable/uri'
 
 module Wgit
-# Class modeling a web based URL.
+# Class modeling a web based HTTP URL.
+#
 # Can be an internal/relative link e.g. "about.html" or a full URL
-# e.g. "http://www.google.co.uk". Is a subclass of String and uses
-# '
+# e.g. "http://www.google.co.uk". Is a subclass of String and uses 'uri' and
+# 'addressable/uri' internally.
 class Url < String
 include Assertable
 
@@ -17,104 +18,73 @@ module Wgit
 # is also provided by this class.
 attr_reader :crawled
 
-# The
+# The Time which the Url was crawled.
 attr_accessor :date_crawled
 
 # Initializes a new instance of Wgit::Url which represents a web based
 # HTTP URL.
 #
-# @param url_or_obj [String, Object#fetch#[]] Is either a String
-#
-#
-# @param crawled [Boolean] Whether or not the HTML of the URL's web
-#
-# @param date_crawled [Time] Should only be provided if crawled is
-#
-#
-# @raise [
-def initialize(url_or_obj, crawled
+# @param url_or_obj [String, Wgit::Url, Object#fetch#[]] Is either a String
+# based URL or an object representing a Database record e.g. a MongoDB
+# document/object.
+# @param crawled [Boolean] Whether or not the HTML of the URL's web page
+# has been crawled or not. Only used if url_or_obj is a String.
+# @param date_crawled [Time] Should only be provided if crawled is true. A
+# suitable object can be returned from Wgit::Utils.time_stamp. Only used
+# if url_or_obj is a String.
+# @raise [StandardError] If url_or_obj is an Object with missing methods.
+def initialize(url_or_obj, crawled: false, date_crawled: nil)
 # Init from a URL String.
 if url_or_obj.is_a?(String)
 url = url_or_obj.to_s
-# Else init from a database object
+# Else init from a Hash like object e.g. database object.
 else
 obj = url_or_obj
-assert_respond_to(obj,
+assert_respond_to(obj, :fetch)
 
-url
-crawled
-date_crawled = obj
+url = obj.fetch('url') # Should always be present.
+crawled = obj.fetch('crawled', false)
+date_crawled = obj.fetch('date_crawled', nil)
 end
 
-@uri
-@crawled
+@uri = Addressable::URI.parse(url)
+@crawled = crawled
 @date_crawled = date_crawled
 
 super(url)
 end
 
-#
+# Initialises a new Wgit::Url instance from a String or subclass of String
+# e.g. Wgit::Url. Any other obj type will raise an error.
 #
-#
-#
-
-new(str)
-end
-
-# Raises an exception if url is not a valid HTTP URL.
+# If obj is already a Wgit::Url then it will be returned as is to maintain
+# it's state. Otherwise, a new Wgit::Url is instantiated and returned. This
+# differs from Wgit::Url.new which always instantiates a new Wgit::Url.
 #
-#
-#
-
-
-
-unless url.start_with?('http://') || url.start_with?('https://')
-raise "Invalid url (missing protocol prefix): #{url}"
-end
-if URI::DEFAULT_PARSER.make_regexp.match(url.normalise).nil?
-raise "Invalid url: #{url}"
-end
-end
-
-# Determines if the Url is valid or not.
+# Note: Only use this method if you are allowing obj to be either a String
+# or a Wgit::Url whose state you want to preserve e.g. when passing a URL
+# to a crawl method which might redirect (calling Wgit::Url#replace). If
+# you're sure of the type or don't care about preserving the state of the
+# Wgit::Url, use Wgit::Url.new instead.
 #
-# @param
-# @
-
-
-
-rescue StandardError
-false
-end
+# @param obj [Object] The object to parse, which #is_a?(String).
+# @raise [StandardError] If obj.is_a?(String) is false.
+# @return [Wgit::Url] A Wgit::Url instance.
+def self.parse(obj)
+raise 'Can only parse if obj#is_a?(String)' unless obj.is_a?(String)
 
-
-
-# The default protocol prefix is http://.
-#
-# @param url [Wgit::Url, String] The url to be prefixed with a protocol.
-# @param https [Boolean] Whether the protocol prefix is https or http.
-# @return [Wgit::Url] The url with a protocol prefix.
-def self.prefix_protocol(url, https = false)
-unless url.start_with?('http://') || url.start_with?('https://')
-if https
-url.replace("https://#{url}")
-else
-url.replace("http://#{url}")
-end
-end
-url
+# Return a Wgit::Url as is to avoid losing state e.g. date_crawled etc.
+obj.is_a?(Wgit::Url) ? obj : new(obj)
 end
 
-#
+# Sets the @crawled instance var, also setting @date_crawled to the
+# current time or nil (depending on the bool value).
 #
-# @param
-# @
-
-
-
-link = Wgit::Url.new(link).without_leading_slash
-separator = (link.start_with?('#') || link.start_with?('?')) ? '' : '/'
-Wgit::Url.new(host + separator + link)
+# @param bool [Boolean] True if self has been crawled, false otherwise.
+# @return [Time, NilClass] Returns the date crawled, if set.
+def crawled=(bool)
+@crawled = bool
+@date_crawled = bool ? Wgit::Utils.time_stamp : nil
 end
 
 # Overrides String#replace setting the new_url @uri and String value.
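
The new class method Wgit::Url.parse returns an existing Wgit::Url untouched (preserving its crawled state) and only instantiates when given a plain String, while the crawled= setter now stamps @date_crawled as well. A sketch of the difference, using an illustrative URL:

  require 'wgit'

  url = Wgit::Url.new('http://example.com')
  url.crawled = true                     # also sets @date_crawled via the new setter

  Wgit::Url.parse(url).equal?(url)       #=> true  (same object, state preserved)
  Wgit::Url.new(url).equal?(url)         #=> false (always a fresh instance)
  Wgit::Url.parse('http://example.com')  #=> a new Wgit::Url built from the String
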
@@ -123,108 +93,138 @@ module Wgit
 # @return [String] The new URL value once set.
 def replace(new_url)
 @uri = Addressable::URI.parse(new_url)
+
 super(new_url)
 end
 
 # Returns true if self is a relative Url; false if absolute.
 #
 # All external links in a page are expected to have a protocol prefix e.g.
-#
+# 'http://', otherwise the link is treated as an internal link (regardless
 # of whether it's valid or not). The only exception is if an opts arg is
-# provided and self is a page belonging to that arg type e.g.
+# provided and self is a page belonging to that arg type e.g. host; then
 # the link is relative.
 #
-# @param opts [Hash] The options with which to check relativity.
+# @param opts [Hash] The options with which to check relativity. Only one
+# opts param should be provided. The provided opts param Url must be
+# absolute and be prefixed with a protocol. Consider using the output of
+# Wgit::Url#to_base which should work unless it's nil.
+# @option opts [Wgit::Url, String] :base The Url base e.g.
+# http://www.google.com/how which gives a base of
+# 'http://www.google.com'.
 # @option opts [Wgit::Url, String] :host The Url host e.g.
 # http://www.google.com/how which gives a host of 'www.google.com'.
-# The host must be absolute and prefixed with a protocol.
 # @option opts [Wgit::Url, String] :domain The Url domain e.g.
-# http://www.google.com/how which gives a domain of 'google.com'.
-# domain must be absolute and prefixed with a protocol.
+# http://www.google.com/how which gives a domain of 'google.com'.
 # @option opts [Wgit::Url, String] :brand The Url brand e.g.
-# http://www.google.com/how which gives a domain of 'google'.
-#
-#
+# http://www.google.com/how which gives a domain of 'google'.
+# @raise [StandardError] If self is invalid e.g. empty or an invalid opts
+# param has been provided.
 # @return [Boolean] True if relative, false if absolute.
-def
-
-
-raise
-if opts.values.count(nil) < (opts.length - 1)
-raise "Provide only one of: #{opts.keys}"
-end
+def relative?(opts = {})
+defaults = { base: nil, host: nil, domain: nil, brand: nil }
+opts = defaults.merge(opts)
+raise 'Url (self) cannot be empty' if empty?
 
-
-if host
-host = Wgit::Url.new(host)
-if host.to_base.nil?
-raise "Invalid host, must be absolute and contain protocol: #{host}"
-end
-end
+return true if @uri.relative?
 
-
-
-
-if domain.to_base.nil?
-raise "Invalid domain, must be absolute and contain protocol: #{domain}"
-end
-end
+# Self is absolute but may be relative to the opts param e.g. host.
+opts.select! { |_k, v| v }
+raise "Provide only one of: #{defaults.keys}" if opts.length > 1
 
-
-if brand
-brand = Wgit::Url.new(brand)
-if brand.to_base.nil?
-raise "Invalid brand, must be absolute and contain protocol: #{brand}"
-end
-end
+return false if opts.empty?
 
-
-
+type, url = opts.first
+url = Wgit::Url.new(url)
+raise "Invalid opts param value, Url must be absolute and contain \
+protocol: #{url}" unless url.to_base
+
+case type
+when :base # http://www.google.com
+to_base == url.to_base
+when :host # www.google.com
+to_host == url.to_host
+when :domain # google.com
+to_domain == url.to_domain
+when :brand # google
+to_brand == url.to_brand
 else
-
-return domain ? to_domain == domain.to_domain : false if domain
-return brand ? to_brand == brand.to_brand : false if brand
-
-false
+raise "Unknown opts param: :#{type}, use one of: #{defaults.keys}"
 end
 end
 
-#
+# Returns true if self is an absolute Url; false if relative.
 #
-# @return [Boolean] True if
-def
-
+# @return [Boolean] True if absolute, false if relative.
+def absolute?
+@uri.absolute?
 end
 
-#
+# Returns if self is a valid and absolute HTTP Url or not.
 #
-# @
-
-
-
+# @return [Boolean] True if valid and absolute, otherwise false.
+def valid?
+return false if relative?
+return false unless start_with?('http://') || start_with?('https://')
+return false if URI::DEFAULT_PARSER.make_regexp.match(normalize).nil?
+
+true
 end
 
-#
-#
+# Concats self and path together before returning a new Url. Self is not
+# modified.
 #
-# @param
-
-
-
+# @param path [Wgit::Url, String] The path to concat onto the end of self.
+# @return [Wgit::Url] self + separator + path, separator depends on path.
+def concat(path)
+path = Wgit::Url.new(path)
+raise 'path must be relative' unless path.is_relative?
+
+path = path.without_leading_slash
+separator = path.start_with?('#') || path.start_with?('?') ? '' : '/'
+
+Wgit::Url.new(without_trailing_slash + separator + path)
 end
 
-# Normalises/escapes self and returns a new Wgit::Url.
+# Normalises/escapes self and returns a new Wgit::Url. Self isn't modified.
 #
-# @return [Wgit::Url] An
-def
+# @return [Wgit::Url] An escaped version of self.
+def normalize
 Wgit::Url.new(@uri.normalize.to_s)
 end
 
+# Modifies self by prefixing it with a protocol. Returns the url whether
+# its been modified or not. The default protocol prefix is http://.
+#
+# @param protocol [Symbol] Either :http or :https.
+# @return [Wgit::Url] The url with protocol prefix (having been modified).
+def prefix_protocol(protocol: :http)
+unless %i[http https].include?(protocol)
+raise 'protocol must be :http or :https'
+end
+
+unless start_with?('http://') || start_with?('https://')
+protocol == :http ? replace("http://#{url}") : replace("https://#{url}")
+end
+
+self
+end
+
+# Returns a Hash containing this Url's instance vars excluding @uri.
+# Used when storing the URL in a Database e.g. MongoDB etc.
+#
+# @return [Hash] self's instance vars as a Hash.
+def to_h
+ignore = ['@uri']
+h = Wgit::Utils.to_h(self, ignore: ignore)
+Hash[h.to_a.insert(0, ['url', self])] # Insert url at position 0.
+end
+
 # Returns a normalised URI object for this URL.
 #
 # @return [URI::HTTP, URI::HTTPS] The URI object of self.
 def to_uri
-URI(
+URI(normalize)
 end
 
 # Returns self.
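
A hedged sketch of the reworked helpers above: relative? now takes a single :base/:host/:domain/:brand opt, valid? is now an instance predicate, and prefix_protocol becomes an instance method (all URLs below are illustrative):

  require 'wgit'

  url = Wgit::Url.new('http://www.google.com/how')
  url.relative?                                 #=> false
  url.relative?(host: 'http://www.google.com')  #=> true, same host
  url.relative?(domain: 'http://google.com')    #=> true, same domain

  Wgit::Url.new('http://example.com/').concat('about?lang=en')
  #=> "http://example.com/about?lang=en"

  Wgit::Url.new('example.com').prefix_protocol(protocol: :https)
  #=> "https://example.com" (self is modified in place)
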
@@ -311,7 +311,7 @@ module Wgit
 # e.g. Given http://google.com?q=ruby, '?q=ruby' is returned.
 #
 # @return [Wgit::Url, nil] Containing just the query string or nil.
-def
+def to_query
 query = @uri.query
 query ? Wgit::Url.new("?#{query}") : nil
 end
@@ -361,9 +361,8 @@ module Wgit
 #
 # @return [Wgit::Url] Self without leading or trailing slashes.
 def without_slashes
-
-
-without_trailing_slash
+without_leading_slash
+.without_trailing_slash
 end
 
 # Returns a new Wgit::Url with the base (proto and host) removed e.g. Given
@@ -388,8 +387,8 @@ module Wgit
 # URL.
 #
 # @return [Wgit::Url] Self with the query string portion removed.
-def
-query =
+def without_query
+query = to_query
 without_query_string = query ? gsub(query, '') : self
 
 Wgit::Url.new(without_query_string)
@@ -410,56 +409,43 @@ module Wgit
 Wgit::Url.new(without_anchor)
 end
 
-# Returns true if self is a URL query string e.g. ?q=hello etc.
+# Returns true if self is a URL query string e.g. ?q=hello etc. Note this
+# shouldn't be used to determine if self contains a query.
 #
 # @return [Boolean] True if self is a query string, false otherwise.
-def
+def query?
 start_with?('?')
 end
 
-# Returns true if self is a URL anchor/fragment e.g. #top etc.
+# Returns true if self is a URL anchor/fragment e.g. #top etc. Note this
+# shouldn't be used to determine if self contains an anchor/fragment.
 #
 # @return [Boolean] True if self is a anchor/fragment, false otherwise.
-def
+def anchor?
 start_with?('#')
 end
 
-
-
-
-
-
-
-
-
-
-
-alias
-alias
-alias
-alias
-alias
-alias
-alias
-alias
-alias base to_base
-alias path to_path
-alias endpoint to_endpoint
-alias query_string to_query_string
-alias query to_query_string
-alias anchor to_anchor
-alias to_fragment to_anchor
-alias fragment to_anchor
-alias extension to_extension
-alias without_query without_query_string
+alias crawled? crawled
+alias is_relative? relative?
+alias is_absolute? absolute?
+alias is_valid? valid?
+alias normalise normalize
+alias uri to_uri
+alias url to_url
+alias scheme to_scheme
+alias host to_host
+alias domain to_domain
+alias brand to_brand
+alias base to_base
+alias path to_path
+alias endpoint to_endpoint
+alias query to_query
+alias anchor to_anchor
+alias fragment to_anchor
+alias extension to_extension
 alias without_fragment without_anchor
-alias is_query?
-alias
-alias
-alias internal_link? is_relative?
-alias is_internal? is_relative?
-alias relative? is_relative?
-alias crawled? crawled
-alias normalize normalise
+alias is_query? query?
+alias is_anchor? anchor?
+alias fragment? anchor?
 end
 end
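
In 0.2.0 the predicate methods drop their is_ prefixes (the old names survive as aliases) and normalize becomes the primary spelling. A quick sketch:

  require 'wgit'

  q = Wgit::Url.new('?q=hello')
  q.query?     #=> true (new name)
  q.is_query?  #=> true (kept as an alias)

  Wgit::Url.new('http://example.com/a b').normalize
  #=> "http://example.com/a%20b" (normalise is aliased to normalize)
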