RubyGems - wgit - Versions diffs - 0.0.1 → 0.0.2 - Mend

wgit 0.0.1 → 0.0.2

Files changed (15) hide show

checksums.yaml +5 -5
data/lib/wgit.rb +1 -1
data/lib/wgit/assertable.rb +72 -61
data/lib/wgit/core_ext.rb +11 -5
data/lib/wgit/crawler.rb +97 -57
data/lib/wgit/database/database.rb +247 -170
data/lib/wgit/database/model.rb +40 -24
data/lib/wgit/database/mongo_connection_details.rb +44 -23
data/lib/wgit/document.rb +534 -233
data/lib/wgit/indexer.rb +235 -0
data/lib/wgit/url.rb +199 -121
data/lib/wgit/utils.rb +143 -96
data/lib/wgit/version.rb +5 -1
metadata +10 -9
data/lib/wgit/web_crawler.rb +0 -134

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: 82f33e00a273c6cdeb3ba9c171110d849fff2428
-  data.tar.gz: 14c63f826d1d21811b14e9f3a2bca750b3f4afa3
+SHA256:
+  metadata.gz: c2ee83f5b722a6aff6fad7727751e149b58de02de3abecd49e51066a1ecf035d
+  data.tar.gz: 112fb3b45192e781d8a4b4eeaca6d3ad7abaaa1a29b8833654b17a9b38644f81
 SHA512:
-  metadata.gz: 7c42b925f72d9e7cceba79d9aee764f97b6537c0005038501a1f75c36b1bcd3b6036cfb9b62fcf01fd435e0348c1e8c00c445a291051c068fa58184de2c9590a
-  data.tar.gz: a2a756c3be7b9b214921bfdac5846a2250e452265285cb9c3b812d2eaefc2ab969b608cd1841f34507a6ef184f20ba7c98658daf0135fb85eead88de0356320f
+  metadata.gz: a29f32db3538bed0e2b09ae599623ee9751ed0f929f83b1d7a987982132431cc72b89fa5bc3e2522c50c50c4a7d0fd33aaf0fbc000986b1db8f0494a37cebd7c
+  data.tar.gz: faf0f814dad58fef4a4ec61a5cdd3f4b3ba7170b417e48a18f4a45922efa0dcab8ffaf4bbe0d198d53e5778056599e6827d1315ce68e08984ca2af47e26631bc

data/lib/wgit.rb CHANGED

@@ -1,6 +1,6 @@
 require_relative 'wgit/version'
 require_relative 'wgit/crawler'
-require_relative 'wgit/web_crawler'
+require_relative 'wgit/indexer'
 require_relative 'wgit/url'
 require_relative 'wgit/document'
 require_relative 'wgit/utils'

data/lib/wgit/assertable.rb CHANGED

@@ -1,69 +1,80 @@
 module Wgit
-  # @author Michael Telford
-  # Module containing assert methods including type checking which can be used
-  # for asserting the integrity of method definitions etc.
+  # Module containing assert methods including type checking which can be used
+  # for asserting the integrity of method definitions etc.
   module Assertable
-      DEFAULT_TYPE_FAIL_MSG = "Expected: %s, Actual: %s"
-      WRONG_METHOD_MSG = "arr must be Enumerable, use a different method"
-      DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
-      # obj.instance_of? must return true for one of the types listed in
-      # type_or_types or an exception is thrown using msg if provided.
-      # type_or_types can be a single Class or an Enumerable of Class objects,
-      # Strings and Symbols will not work.
-      def assert_types(obj, type_or_types, msg = nil)
-          msg ||= DEFAULT_TYPE_FAIL_MSG % [type_or_types, obj.class]
-          if type_or_types.respond_to?(:any?)
-              match = type_or_types.any? { |type| obj.instance_of?(type) }
-          else
-              match = obj.instance_of?(type_or_types)
-          end
-          raise msg unless match
-          obj
-      end
-      # Each object within arr must match one of the types listed in
-      # type_or_types or an exception is thrown using msg if provided.
-      # type_or_types can be a single Class or an Enumerable of Class objects,
-      # Strings and Symbols will not work.
-      def assert_arr_types(arr, type_or_types, msg = nil)
-          raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
-          arr.each do |obj|
-              assert_types(obj, type_or_types, msg)
-          end
+    DEFAULT_TYPE_FAIL_MSG = "Expected: %s, Actual: %s".freeze
+    WRONG_METHOD_MSG = "arr must be Enumerable, use a different method".freeze
+    DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s".freeze
+    # Tests if the obj is of a given type.
+    #
+    # @param obj [Object] The Object to test.
+    # @param type_or_types [Type, Array<Type>] The type/types that obj must
+    #     belong to or an exception is thrown.
+    # @param msg [String] The raised RuntimeError message, if provided.
+    # @return [Object] The given obj on successful assertion.
+    def assert_types(obj, type_or_types, msg = nil)
+      msg ||= DEFAULT_TYPE_FAIL_MSG % [type_or_types, obj.class]
+      if type_or_types.respond_to?(:any?)
+        match = type_or_types.any? { |type| obj.instance_of?(type) }
+      else
+        match = obj.instance_of?(type_or_types)
       end
-      # The obj_or_objs must respond_to? all of the given methods or an
-      # Exception is raised using msg or a default message.
-      # Returns obj_or_objs on sucessful assertion.
-      def assert_respond_to(obj_or_objs, methods, msg = nil)
-          if obj_or_objs.respond_to?(:each)
-              obj_or_objs.each do |obj|
-                  _assert_respond_to(obj, methods, msg)
-              end
-          else
-              _assert_respond_to(obj_or_objs, methods, msg)
-          end
-          obj_or_objs
+      raise msg unless match
+      obj
+    end
+    # Each object within arr must match one of the types listed in
+    # type_or_types or an exception is raised using msg, if provided.
+    #
+    # @param arr [Enumerable#each] Enumerable of objects to type check.
+    # @param type_or_types [Type, Array<Type>] The allowed type(s).
+    # @param msg [String] The raised RuntimeError message, if provided.
+    # @return [Object] The given arr on successful assertion.
+    def assert_arr_types(arr, type_or_types, msg = nil)
+      raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
+      arr.each do |obj|
+        assert_types(obj, type_or_types, msg)
       end
-      private
-      def _assert_respond_to(obj, methods, msg = nil)
-          msg ||= DEFAULT_DUCK_FAIL_MSG % ["#{obj.class} (#{obj})", methods]
-          match = methods.all? { |method| obj.respond_to?(method) }
-          raise msg unless match
-          obj
+    end
+    # The obj_or_objs must respond_to? all of the given methods or an
+    # Exception is raised using msg, if provided.
+    #
+    # @param obj_or_objs [Object, Enumerable#each] The objects to duck check.
+    # @param methods [Array<Symbol>] The methods to :respond_to?.
+    # @param msg [String] The raised RuntimeError message, if provided.
+    # @return [Object] The given obj_or_objs on successful assertion.
+    def assert_respond_to(obj_or_objs, methods, msg = nil)
+      methods = [methods] unless methods.respond_to?(:all?)
+      if obj_or_objs.respond_to?(:each)
+        obj_or_objs.each do |obj|
+          _assert_respond_to(obj, methods, msg)
+        end
+      else
+        _assert_respond_to(obj_or_objs, methods, msg)
       end
-      alias :assert_type :assert_types
-      alias :type :assert_types
-      alias :types :assert_types
-      alias :assert_arr_type :assert_arr_types
-      alias :arr_type :assert_arr_types
-      alias :arr_types :assert_arr_types
-      alias :respond_to :assert_respond_to
+      obj_or_objs
+    end
+  private
+    # obj must respond_to? all methods or an exception is raised.
+    def _assert_respond_to(obj, methods, msg = nil)
+      raise "methods must respond_to? :all?" unless methods.respond_to?(:all?)
+      msg ||= DEFAULT_DUCK_FAIL_MSG % ["#{obj.class} (#{obj})", methods]
+      match = methods.all? { |method| obj.respond_to?(method) }
+      raise msg unless match
+      obj
+    end
+    alias :assert_type :assert_types
+    alias :type :assert_types
+    alias :types :assert_types
+    alias :assert_arr_type :assert_arr_types
+    alias :arr_type :assert_arr_types
+    alias :arr_types :assert_arr_types
+    alias :respond_to :assert_respond_to
   end
 end

data/lib/wgit/core_ext.rb CHANGED

@@ -1,11 +1,12 @@
 require_relative 'url'
-# @author Michael Telford
 # Script which extends Ruby's core functionality when parsed.
-# Needs to be required separately using `require 'wgit/core_ext'`.
+# Needs to be required separately using `require 'wgit/core_ext'`.
 class String
-  # Converts a String into a Wgit::Url object.
+  # Converts a String into a Wgit::Url object.
+  #
+  # @return [Wgit::Url] The converted URL.
   def to_url
     Wgit::Url.new(self)
   end
@@ -13,7 +14,9 @@ end
 module Enumerable
   # Converts each String instance into a Wgit::Url object and returns the new
-  # array.
+  # Array.
+  #
+  # @return [Array<Wgit::Url>] The converted URL's.
   def to_urls
     map do |element|
       process_url_element(element)
@@ -21,7 +24,9 @@ module Enumerable
   end
   # Converts each String instance into a Wgit::Url object and returns the
-  # updated array.
+  # updated array. Modifies the receiver.
+  #
+  # @return [Array<Wgit::Url>] Self containing the converted URL's.
   def to_urls!
     map! do |element|
       process_url_element(element)
@@ -31,6 +36,7 @@ end
 private
+# Converts the element to a Wgit::Url if the element is a String.
 def process_url_element(element)
   if element.is_a? String
     element.to_url

data/lib/wgit/crawler.rb CHANGED

@@ -3,67 +3,106 @@ require_relative 'document'
 require_relative 'utils'
 require_relative 'assertable'
 require 'net/http' # requires 'uri'
 module Wgit
-  # @author Michael Telford
-  # Crawler class provides a means of crawling web URL's.
-  # Note that any redirects will not be followed for during crawling
-  # functionality.
+  # The Crawler class provides a means of crawling web based URL's, turning
+  # their HTML into Wgit::Document's.
+  # Note that currently no redirects are followed during a crawl.
   class Crawler
     include Assertable
-  	attr_reader :urls, :docs
+    # The urls to crawl.
+    attr_reader :urls
+    # The docs of the crawled @urls.
+    attr_reader :docs
-  	def initialize(*urls)
-  		self.urls = urls unless urls.nil?
+    # Initializes the Crawler by setting the @urls and @docs.
+    #
+    # @param urls [*Wgit::Url] The URLs to crawl.
+    def initialize(*urls)
+      self.[](*urls)
       @docs = []
-  	end
+    end
+    # Sets this Crawler's @urls.
+    #
+    # @param urls [Array<Wgit::Url>] The URLs to crawl.
     def urls=(urls)
-        @urls = []
-        Wgit::Utils.each(urls) { |url| add_url(url) }
+      @urls = []
+      Wgit::Utils.each(urls) { |url| add_url(url) }
     end
+    # Sets this Crawler's @urls.
+    #
+    # @param urls [*Wgit::Url] The URLs to crawl.
     def [](*urls)
-        self.urls = urls unless urls.nil?
+      # If urls is nil then add_url (when called later) will set @urls = []
+      # so we do nothing here.
+      if not urls.nil?
+        # Due to *urls you can end up with [[url1,url2,url3]] etc. where the
+        # outer array is bogus so we use the inner one only.
+        if  urls.is_a?(Enumerable) &&
+            urls.length == 1 &&
+            urls.first.is_a?(Enumerable)
+          urls = urls.first
+        end
+        # Here we call urls= method using self because the param name is also
+        # urls which conflicts.
+        self.urls = urls
+      end
     end
+    # Adds the url to this Crawler's @urls.
+    #
+    # @param url [Wgit::Url] A URL to crawl.
     def <<(url)
-        add_url(url)
+      add_url(url)
     end
     # Crawls individual urls, not entire sites.
-    # Returns the last crawled doc.
-    # Yields each doc to the provided block or adds each doc to @docs
-    # which can be accessed by Crawler#docs after the method returns.
-  	def crawl_urls(urls = @urls, &block)
+    #
+    # @param urls [Array<Wgit::Url>] The URLs to crawl.
+    # @yield [doc] If provided, the block is given each crawled
+    #   Document. Otherwise each doc is added to @docs which can be accessed
+    #   by Crawler#docs after this method returns.
+    # @return [Wgit::Document] The last Document crawled.
+    def crawl_urls(urls = @urls, &block)
       raise "No urls to crawl" unless urls
       @docs = []
       doc = nil
       Wgit::Utils.each(urls) { |url| doc = handle_crawl_block(url, &block) }
       doc ? doc : @docs.last
-  	end
-  	# Crawl the url and return the response document or nil.
-    # Also yield(doc) if a block is provided. The doc is passed to the block
-    # regardless of the crawl success so the doc.url can be used if needed.
-  	def crawl_url(url = @urls.first, &block)
-      assert_type(url, Url)
-  		markup = fetch(url)
+    end
+    # Crawl the url and return the response document or nil.
+    #
+    # @param url [Wgit::Url] The URL to crawl.
+    # @yield [doc] The crawled HTML Document regardless if the
+    #   crawl was successful or not. Therefore, the Document#url can be used.
+    # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
+    #   crawl was unsuccessful.
+    def crawl_url(url = @urls.first)
+      assert_type(url, Wgit::Url)
+      markup = fetch(url)
       url.crawled = true
       doc = Wgit::Document.new(url, markup)
-      block.call(doc) if block_given?
+      yield(doc) if block_given?
       doc.empty? ? nil : doc
-  	end
+    end
     # Crawls an entire site by recursively going through its internal_links.
-    # Also yield(doc) for each crawled doc if a block is provided.
-    # A block is the only way to interact with the crawled docs.
-    # Returns a unique array of external urls collected from the site
-    # or nil if the base_url could not be crawled successfully.
+    #
+    # @param base_url [Wgit::Url] The base URL of the website to be crawled.
+    # @yield [doc] Given each crawled Document/page of the site.
+    #   A block is the only way to interact with each crawled Document.
+    # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
+    #   from all of the site's pages or nil if the base_url could not be
+    #   crawled successfully.
     def crawl_site(base_url = @urls.first, &block)
-      assert_type(base_url, Url)
+      assert_type(base_url, Wgit::Url)
       doc = crawl_url(base_url, &block)
       return nil if doc.nil?
@@ -75,7 +114,7 @@ module Wgit
       return doc.external_links.uniq if internal_urls.empty?
       loop do
-        internal_urls.uniq! unless internal_urls.uniq.nil?
+        internal_urls.uniq!
         links = internal_urls - crawled_urls
         break if links.empty?
@@ -94,36 +133,37 @@ module Wgit
   private
-    # Add the document to the @docs array for later processing
-    # or let the block process it here and now.
+    # Add the document to the @docs array for later processing or let the block
+    # process it here and now.
     def handle_crawl_block(url, &block)
-        if not block_given?
-		        @docs << crawl_url(url)
-            nil
-        else
-            crawl_url(url, &block)
-        end
+      if block_given?
+        crawl_url(url, &block)
+      else
+        @docs << crawl_url(url)
+        nil
+      end
     end
     # The fetch method performs a HTTP GET to obtain the HTML document.
-    # Invalid urls or any HTTP response that doesn't return a HTML body
-    # will be ignored and nil will be returned.  This means that redirects
-    # etc. will not be followed.
+    # Invalid urls or any HTTP response that doesn't return a HTML body will be
+    # ignored and nil will be returned.  This means that redirects etc. will
+    # not be followed.
     def fetch(url)
-        raise unless url.respond_to?(:to_uri)
-        res = Net::HTTP.get_response(url.to_uri)
-        res.body.empty? ? nil : res.body
+      raise unless url.respond_to?(:to_uri)
+      res = Net::HTTP.get_response(url.to_uri)
+      res.body.empty? ? nil : res.body
     rescue
-        nil
+      nil
     end
+    # Add the url to @urls ensuring it is cast to a Wgit::Url if necessary.
     def add_url(url)
-        @urls = [] if @urls.nil?
-        if url.instance_of?(Url)
-            @urls << url
-        else
-            @urls << Wgit::Url.new(url)
-        end
+      @urls = [] if @urls.nil?
+      if url.is_a?(Wgit::Url)
+        @urls << url
+      else
+        @urls << Wgit::Url.new(url)
+      end
     end
     alias :crawl :crawl_urls

data/lib/wgit/database/database.rb CHANGED

@@ -2,22 +2,19 @@ require_relative '../document'
 require_relative '../url'
 require_relative '../utils'
 require_relative '../assertable'
-require_relative 'mongo_connection_details'
 require_relative 'model'
 require 'mongo'
 module Wgit
-  # @author Michael Telford
   # Class modeling a DB connection and CRUD operations for the Url and
   # Document collections.
-  # The most common methods are: insert, update, urls, search, stats, size.
   class Database
     include Assertable
-    # Is relative to the root project folder, not this file.
-    LOG_FILE_PATH = "misc/mongo_log.txt"
+    # Initializes a database connection client.
+    #
+    # @raise [RuntimeError] If Wgit::CONNECTION_DETAILS aren't set.
     def initialize
       conn_details = Wgit::CONNECTION_DETAILS
       if conn_details.empty?
@@ -25,146 +22,188 @@ module Wgit
 :port, :db, :uname, :pword for a database connection to be established."
       end
-      logger = Logger.new(LOG_FILE_PATH)
+      # Only log to STDOUT in fatal scenarios.
+      Mongo::Logger.logger.level = Logger::FATAL
       address = "#{conn_details[:host]}:#{conn_details[:port]}"
       @@client = Mongo::Client.new([address],
-                                   :database => conn_details[:db],
-                                   :user => conn_details[:uname],
-                                   :password => conn_details[:pword],
-                                   :logger => logger,
-                                   :truncate_logs => false)
+                                   database:      conn_details[:db],
+                                   user:          conn_details[:uname],
+                                   password:      conn_details[:pword])
     end
     ### Create Data ###
+    # Insert one or more Url or Document objects into the DB.
+    #
+    # @param data [Wgit::Url, Wgit::Document, Enumerable] A Url or Document, or
+    #   an Enumerable of Urls or Documents, to insert.
+    # @raise [RuntimeError] If the data is not valid.
     def insert(data)
-        if data.is_a?(Url)
-            insert_urls(data)
-        elsif data.is_a?(Document)
-            insert_docs(data)
-        elsif data.respond_to?(:first)
-            if data.first.is_a?(Url)
-                insert_urls(data)
-            else
-                insert_docs(data)
-            end
-        else
-            raise "data is not in the correct format (all Url's or Document's)"
-        end
-    end
-    def insert_urls(url_or_urls)
-        unless url_or_urls.respond_to?(:map)
-            assert_type(url_or_urls, Url)
-            url_or_urls = Wgit::Model.url(url_or_urls)
-        else
-            assert_arr_types(url_or_urls, Url)
-            url_or_urls = url_or_urls.map do |url|
-                Wgit::Model.url(url)
-            end
-        end
-        create(:urls, url_or_urls)
-    end
-    def insert_docs(doc_or_docs)
-        unless doc_or_docs.respond_to?(:map)
-            assert_type(doc_or_docs, [Document, Hash])
-            unless doc_or_docs.is_a?(Hash)
-                doc_or_docs = Wgit::Model.document(doc_or_docs)
-            end
+      if data.is_a?(Url)
+        insert_urls(data)
+      elsif data.is_a?(Document)
+        insert_docs(data)
+      elsif data.respond_to?(:first)
+        if data.first.is_a?(Url)
+          insert_urls(data)
         else
-            assert_arr_types(doc_or_docs, [Document, Hash])
-            doc_or_docs = doc_or_docs.map do |doc|
-                Wgit::Model.document(doc) unless doc.is_a?(Hash)
-            end
+          insert_docs(data)
         end
-        create(:documents, doc_or_docs)
+      else
+        raise "data is not in the correct format (all Url's or Document's)"
+      end
     end
     ### Retrieve Data ###
-    # A crawled parameter value of nil (the default) returns all urls.
-    # A limit of 0 means all urls are returned.
-    # All urls are sorted by date_added ascending, in other words the first
-    # url in the results is the first added.
-    def urls(crawled = nil, limit = 0, skip = 0, &block)
-      crawled.nil? ? query = {} : query = { :crawled => crawled }
+    # Returns Url records from the DB. All Urls are sorted by date_added
+    # ascending, in other words the first url returned is the first one that
+    # was inserted into the DB.
+    #
+    # @param crawled [Boolean] Filter by Url#crawled value. nil returns all.
+    # @param limit [Integer] The max number of Url's to return. 0 returns all.
+    # @param skip [Integer] Skip n amount of Url's.
+    # @yield [url] Given each Url returned from the DB.
+    # @return [Array<Wgit::Url>] The Urls obtained from the DB.
+    def urls(crawled = nil, limit = 0, skip = 0)
+      crawled.nil? ? query = {} : query = { crawled: crawled }
-      sort = { :date_added => 1 }
+      sort = { date_added: 1 }
       results = retrieve(:urls, query, sort, {}, limit, skip)
       return [] if results.count < 1
       # results.respond_to? :map! is false so we use map and overwrite the var.
       results = results.map { |url_doc| Wgit::Url.new(url_doc) }
-      return results unless block_given?
-      results.each { |url| block.call(url) }
+      results.each { |url| yield(url) } if block_given?
+      results
     end
+    # Returns Url records that have been crawled.
+    #
+    # @param limit [Integer] The max number of Url's to return. 0 returns all.
+    # @param skip [Integer] Skip n amount of Url's.
+    # @yield [url] Given each Url returned from the DB.
+    # @return [Array<Wgit::Url>] The crawled Urls obtained from the DB.
     def crawled_urls(limit = 0, skip = 0, &block)
       urls(true, limit, skip, &block)
     end
+    # Returns Url records that haven't been crawled. Each Url is yielded to a
+    # block, if given.
+    #
+    # @param limit [Integer] The max number of Url's to return. 0 returns all.
+    # @param skip [Integer] Skip n amount of Url's.
+    # @yield [url] Given each Url returned from the DB.
+    # @return [Array<Wgit::Url>] The uncrawled Urls obtained from the DB.
     def uncrawled_urls(limit = 0, skip = 0, &block)
       urls(false, limit, skip, &block)
     end
+    # Searches against the indexed docs in the DB for the given query.
+    #
     # Currently all searches are case insensitive.
     #
-    # Searches against the indexed docs in the DB for the given text.
-    # The searched fields are decided by the text index setup against the
+    # The searched fields are decided by the text index setup against the
     # documents collection. Currently we search against the following fields:
     # "author", "keywords", "title" and "text".
     #
-    # The MongoDB search ranks/sorts the results in order (highest first) based
-    # upon each documents textScore which records the number of text hits. We
-    # then store this textScore in each Document object for use elsewhere if
-    # needed.
+    # The MongoDB search ranks/sorts the results in order (highest first) based
+    # upon each documents textScore which records the number of query hits. We
+    # then store this textScore in each Document result object for use
+    # elsewhere if needed.
     #
-    # @param text [String] the value to search the data against.
-    # @param whole_sentence [Boolean] whether multiple words should be
-    # searched for separately.
-    # @param limit [Fixnum] the max length/count of the results array.
-    # @param skip [Fixnum] the number of results to skip, starting with the
-    # most relevant based upon the textScore of the search.
-    # @param block [Block] a block which if provided is passed to each result.
-    #
-    # @return [Array] of Document objects representing the search results.
-    def search(text, whole_sentence = false, limit = 10, skip = 0, &block)
-      text.strip!
-      text.replace("\"" + text + "\"") if whole_sentence
+    # @param query [String] The text query to search with.
+    # @param whole_sentence [Boolean] Whether the query should be searched for
+    #   as one exact phrase rather than as separate words.
+    # @param limit [Integer] The max number of results to return.
+    # @param skip [Integer] The number of DB records to skip.
+    # @yield [doc] Given each search result (Wgit::Document).
+    # @return [Array<Wgit::Document>] The search results obtained from the DB.
+    def search(query, whole_sentence = false, limit = 10, skip = 0)
+      query.strip!
+      query.replace("\"" + query + "\"") if whole_sentence
-      # The textScore sorts based on the most search hits.
-      # We use the textScore hash as a sort and a projection below.
-      # :$caseSensitive => case_sensitive, # 3.2+ only.
-      sort_proj = { :score => { :$meta => "textScore" } }
-      query = { :$text => { :$search => text } }
+      # The sort_proj sorts based on the most search hits.
+      # We use the sort_proj hash as both a sort and a projection below.
+      # :$caseSensitive => case_sensitive, 3.2+ only.
+      sort_proj = { score: { :$meta => "textScore" } }
+      query = { :$text => { :$search => query } }
       results = retrieve(:documents, query, sort_proj, sort_proj, limit, skip)
-      return [] if results.count < 1
+      return [] if results.count < 1 # respond_to? :empty? == false
       # results.respond_to? :map! is false so we use map and overwrite the var.
       results = results.map { |mongo_doc| Wgit::Document.new(mongo_doc) }
-      return results unless block_given?
-      results.each { |doc| block.call(doc) }
+      results.each { |doc| yield(doc) } if block_given?
+      results
     end
-    # Performs a search and pretty prints the results.
-    def search_p(text, whole_sentence = false, limit = 10,
-                 skip = 0, sentence_length = 80, &block)
-      results = search(text, whole_sentence, limit, skip, &block)
-      Wgit::Utils.printf_search_results(results, text, false, sentence_length)
-    end
-    # Returns a Mongo object which can be used like a Hash to retrieve values.
+    # Returns statistics about the database.
+    #
+    # @return [BSON::Document#[]#fetch] Similar to a Hash instance.
     def stats
-        @@client.command(:dbStats => 0).documents[0]
+      @@client.command(dbStats: 0).documents[0]
     end
+    # Returns the current size of the database.
+    #
+    # @return [Integer] The current size of the DB.
     def size
-        stats[:dataSize]
+      stats[:dataSize]
     end
+    # Returns the total number of URL records in the DB.
+    #
+    # @return [Integer] The current number of URL records.
+    def num_urls
+      @@client[:urls].count
+    end
+    # Returns the total number of Document records in the DB.
+    #
+    # @return [Integer] The current number of Document records.
+    def num_docs
+      @@client[:documents].count
+    end
+    # Returns the total number of records (urls + docs) in the DB.
+    #
+    # @return [Integer] The current number of URL and Document records.
+    def num_records
+      num_urls + num_docs
+    end
+    # Returns whether or not a record with the given url (which is unique)
+    # exists in the database's 'urls' collection.
+    #
+    # @param url [Wgit::Url] The Url to search the DB for.
+    # @return [Boolean] True if url exists, otherwise false.
+    def url?(url)
+      h = { "url" => url }
+      not @@client[:urls].find(h).none?
+    end
+    # Returns whether or not a record with the given doc.url (which is unique)
+    # exists in the database's 'documents' collection.
+    #
+    # @param doc [Wgit::Document] The Document to search the DB for.
+    # @return [Boolean] True if doc exists, otherwise false.
+    def doc?(doc)
+      url = doc.respond_to?(:url) ? doc.url : doc
+      h = { "url" => url }
+      not @@client[:documents].find(h).none?
+    end
     ### Update Data ###
+    # Update a Url or Document object in the DB.
+    #
+    # @param data [Wgit::Url, Wgit::Document] The Url or Document object to
+    #   update in the DB.
+    # @raise [RuntimeError] If the data is not valid.
     def update(data)
       if data.is_a?(Url)
         update_url(data)
@@ -174,96 +213,134 @@ module Wgit
         raise "data is not in the correct format (all Url's or Document's)"
       end
     end
-    def update_url(url)
-      assert_type(url, Url)
-      selection = { :url => url }
-      url_hash = Wgit::Model.url(url).merge(Wgit::Model.common_update_data)
-      update = { "$set" => url_hash }
-      _update(true, :urls, selection, update)
-    end
-  def update_doc(doc)
-    assert_type(doc, Document)
-    selection = { :url => doc.url }
-    doc_hash = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)
-    update = { "$set" => doc_hash }
-    _update(true, :documents, selection, update)
-  end
-private
+  private
+    # Return if the write to the DB succeeded or not.
     def write_succeeded?(result, count = 1, multi = false)
-        case result.class.to_s
-        # Single create result.
-        when "Mongo::Operation::Write::Insert::Result"
-            result.documents.first[:err].nil?
-        # Multiple create result.
-        when "Mongo::BulkWrite::Result"
-            result.inserted_count == count
-        # Single and multiple update result.
-        when "Mongo::Operation::Write::Update::Result", # MongoDB 3.0
-             "Mongo::Operation::Write::Update::LegacyResult" # MongoDB 2.4
-            if multi
-                result.n == count
-            else
-                result.documents.first[:err].nil?
-            end
+      case result.class.to_s
+      # Single create result.
+      when "Mongo::Operation::Insert::Result"
+        result.documents.first[:err].nil?
+      # Multiple create result.
+      when "Mongo::BulkWrite::Result"
+        result.inserted_count == count
+      # Single and multiple update result.
+      when "Mongo::Operation::Update::Result"
+        if multi
+          result.n == count
         else
-            raise "Result class not currently supported: #{result.class.to_s}"
+          result.documents.first[:err].nil?
         end
+      # Class no longer used, have you upgraded the 'mongo' gem?
+      else
+        raise "Result class not currently supported: #{result.class.to_s}"
+      end
+    end
+    # Insert one or more Url objects into the DB.
+    def insert_urls(url_or_urls)
+      unless url_or_urls.respond_to?(:map)
+        assert_type(url_or_urls, Url)
+        url_or_urls = Wgit::Model.url(url_or_urls)
+      else
+        assert_arr_types(url_or_urls, Url)
+        url_or_urls = url_or_urls.map do |url|
+          Wgit::Model.url(url)
+        end
+      end
+      create(:urls, url_or_urls)
     end
-    def create(collection, data)
-        assert_type(data, [Hash, Array])
-        # Single doc.
-        if data.is_a?(Hash)
-            data.merge!(Wgit::Model.common_insert_data)
-            result = @@client[collection.to_sym].insert_one(data)
-            unless write_succeeded?(result)
-              raise "DB write (insert) failed"
-            end
-            result.n
-        # Multiple docs.
-        elsif data.is_a?(Array)
-            assert_arr_types(data, Hash)
-            data.map! do |data_hash|
-                data_hash.merge(Wgit::Model.common_insert_data)
-            end
-            result = @@client[collection.to_sym].insert_many(data)
-            unless write_succeeded?(result, data.length)
-                raise "DB write(s) failed"
-            end
-            result.inserted_count
-        else
-            raise "data must be a Hash or an Array of Hash's"
+    # Insert one or more Document objects into the DB.
+    def insert_docs(doc_or_docs)
+      unless doc_or_docs.respond_to?(:map)
+        assert_type(doc_or_docs, [Document, Hash])
+        unless doc_or_docs.is_a?(Hash)
+          doc_or_docs = Wgit::Model.document(doc_or_docs)
         end
+      else
+        assert_arr_types(doc_or_docs, [Document, Hash])
+        doc_or_docs = doc_or_docs.map do |doc|
+          Wgit::Model.document(doc) unless doc.is_a?(Hash)
+        end
+      end
+      create(:documents, doc_or_docs)
     end
-    def retrieve(collection, query, sort = {}, projection = {},
+    # Create/insert one or more Url or Document records into the DB.
+    def create(collection, data)
+      assert_type(data, [Hash, Array])
+      # Single doc.
+      if data.is_a?(Hash)
+        data.merge!(Wgit::Model.common_insert_data)
+        result = @@client[collection.to_sym].insert_one(data)
+        unless write_succeeded?(result)
+          raise "DB write (insert) failed"
+        end
+        result.n
+      # Multiple docs.
+      elsif data.is_a?(Array)
+        assert_arr_types(data, Hash)
+        data.map! do |data_hash|
+          data_hash.merge(Wgit::Model.common_insert_data)
+        end
+        result = @@client[collection.to_sym].insert_many(data)
+        unless write_succeeded?(result, data.length)
+          raise "DB write(s) failed"
+        end
+        result.inserted_count
+      else
+        raise "data must be a Hash or an Array of Hash's"
+      end
+    end
+    # Retrieve Url or Document records from the DB.
+    def retrieve(collection, query,
+                 sort = {}, projection = {},
                  limit = 0, skip = 0)
-        assert_type(query, Hash)
-        @@client[collection.to_sym].find(query).projection(projection)
-                                  .skip(skip).limit(limit).sort(sort)
+      assert_type(query, Hash)
+      @@client[collection.to_sym].find(query).projection(projection)
+                                   .skip(skip).limit(limit).sort(sort)
+    end
+    # Update a Url object in the DB.
+    def update_url(url)
+      assert_type(url, Url)
+      selection = { url: url }
+      url_hash = Wgit::Model.url(url).merge(Wgit::Model.common_update_data)
+      update = { "$set" => url_hash }
+      _update(true, :urls, selection, update)
+    end
+    # Update a Document object in the DB.
+    def update_doc(doc)
+      assert_type(doc, Document)
+      selection = { url: doc.url }
+      doc_hash = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)
+      update = { "$set" => doc_hash }
+      _update(true, :documents, selection, update)
     end
+    # Update one or more Url or Document records in the DB.
     # NOTE: The Model.common_update_data should be merged in the calling
     # method as the update param can be bespoke due to its nature.
     def _update(single, collection, selection, update)
-        assert_arr_types([selection, update], Hash)
-        if single
-          result = @@client[collection.to_sym].update_one(selection, update)
-        else
-          result = @@client[collection.to_sym].update_many(selection, update)
-        end
-        raise "DB write (update) failed" unless write_succeeded?(result)
-        result.n
+      assert_arr_types([selection, update], Hash)
+      if single
+        result = @@client[collection.to_sym].update_one(selection, update)
+      else
+        result = @@client[collection.to_sym].update_many(selection, update)
+      end
+      raise "DB write (update) failed" unless write_succeeded?(result)
+      result.n
     end
     alias :count :size
     alias :length :size
+    alias :num_documents :num_docs
+    alias :document? :doc?
     alias :insert_url :insert_urls
     alias :insert_doc :insert_docs
-    alias :search_and_format :search_p
   end
 end