wgit 0.8.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/CHANGELOG.md +39 -0
- data/LICENSE.txt +1 -1
- data/README.md +118 -323
- data/bin/wgit +9 -5
- data/lib/wgit.rb +3 -1
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +30 -0
- data/lib/wgit/crawler.rb +206 -76
- data/lib/wgit/database/database.rb +309 -134
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +138 -95
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +11 -11
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +65 -162
- data/lib/wgit/response.rb +5 -2
- data/lib/wgit/url.rb +133 -31
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +2 -1
- metadata +26 -14
data/lib/wgit/{document_extensions.rb → document_extractors.rb}
CHANGED
@@ -1,19 +1,19 @@
 # frozen_string_literal: true
 
-### Default Document
+### Default Document Extractors ###
 
 # Base.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :base,
   '//base/@href',
   singleton: true,
   text_content_only: true
 ) do |base|
-  Wgit::Url.
+  Wgit::Url.parse?(base) if base
 end
 
 # Title.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :title,
   '//title',
   singleton: true,
@@ -21,7 +21,7 @@ Wgit::Document.define_extension(
 )
 
 # Description.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :description,
   '//meta[@name="description"]/@content',
   singleton: true,
@@ -29,7 +29,7 @@ Wgit::Document.define_extension(
 )
 
 # Author.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :author,
   '//meta[@name="author"]/@content',
   singleton: true,
@@ -37,7 +37,7 @@ Wgit::Document.define_extension(
 )
 
 # Keywords.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :keywords,
   '//meta[@name="keywords"]/@content',
   singleton: true,
@@ -45,25 +45,25 @@ Wgit::Document.define_extension(
 ) do |keywords, _source, type|
   if keywords && (type == :document)
     keywords = keywords.split(',')
-    Wgit::Utils.
+    Wgit::Utils.sanitize(keywords)
   end
   keywords
 end
 
 # Links.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :links,
   '//a/@href',
   singleton: false,
   text_content_only: true
 ) do |links|
   links
-    .map { |link| Wgit::Url.
+    .map { |link| Wgit::Url.parse?(link) }
     .compact # Remove unparsable links.
 end
 
 # Text.
-Wgit::Document.define_extension(
+Wgit::Document.define_extractor(
   :text,
   proc { Wgit::Document.text_elements_xpath },
   singleton: false,
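The rename from `define_extension` to `define_extractor` applies to user-defined fields as well as the defaults above. A minimal sketch of defining a custom extractor against the 0.9.0 API shown in this hunk; the `:h2` field name and xpath are illustrative only, not part of the package:

```ruby
require 'wgit'

# Hypothetical extractor: collect every <h2> heading into doc.h2.
Wgit::Document.define_extractor(
  :h2,                     # becomes Wgit::Document#h2
  '//h2',                  # xpath run against the page's HTML
  singleton: false,        # false => return an Array of results
  text_content_only: true  # true => return node text, not Nokogiri objects
) do |headings|
  headings # optionally post-process the extracted value here
end
```

In 0.8.0 the same call was spelt `Wgit::Document.define_extension`.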
data/lib/wgit/dsl.rb
ADDED
@@ -0,0 +1,324 @@
+module Wgit
+  # DSL methods that act as a wrapper around Wgit's underlying class methods.
+  # All instance vars/constants are prefixed to avoid conflicts when included.
+  module DSL
+    # Error message shown when there's no URL to crawl.
+    DSL_ERROR__NO_START_URL = "missing url, pass as parameter to this or \
+the 'start' function".freeze
+
+    ### CRAWLER METHODS ###
+
+    # Defines an extractor using `Wgit::Document.define_extractor` underneath.
+    #
+    # @param var [Symbol] The name of the variable to be initialised, that will
+    #   contain the extracted content.
+    # @param xpath [String, #call] The xpath used to find the element(s)
+    #   of the webpage. Only used when initializing from HTML.
+    #
+    #   Pass a callable object (proc etc.) if you want the
+    #   xpath value to be derived on Document initialisation (instead of when
+    #   the extractor is defined). The call method must return a valid xpath
+    #   String.
+    # @param opts [Hash] The options to define an extractor with. The
+    #   options are only used when intializing from HTML, not the database.
+    # @option opts [Boolean] :singleton The singleton option determines
+    #   whether or not the result(s) should be in an Array. If multiple
+    #   results are found and singleton is true then the first result will be
+    #   used. Defaults to true.
+    # @option opts [Boolean] :text_content_only The text_content_only option
+    #   if true will use the text content of the Nokogiri result object,
+    #   otherwise the Nokogiri object itself is returned. Defaults to true.
+    # @yield The block is executed when a Wgit::Document is initialized,
+    #   regardless of the source. Use it (optionally) to process the result
+    #   value.
+    # @yieldparam value [Object] The result value to be assigned to the new
+    #   `var`.
+    # @yieldparam source [Wgit::Document, Object] The source of the `value`.
+    # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
+    #   `:object`.
+    # @yieldreturn [Object] The return value of the block becomes the new var's
+    #   value. Return the block's value param unchanged if you want to inspect.
+    # @raise [StandardError] If the var param isn't valid.
+    # @return [Symbol] The given var Symbol if successful.
+    def extract(var, xpath, opts = {}, &block)
+      Wgit::Document.define_extractor(var, xpath, opts, &block)
+    end
+
+    # Initializes a `Wgit::Crawler`. This crawler is then used in all crawl and
+    # index methods used by the DSL. See the Wgit::Crawler documentation for
+    # more details.
+    #
+    # @yield [crawler] The created crawler; use the block to configure.
+    # @return [Wgit::Crawler] The created crawler used by the DSL.
+    def crawler
+      @dsl_crawler ||= Wgit::Crawler.new
+      yield @dsl_crawler if block_given?
+      @dsl_crawler
+    end
+
+    # Sets the URL to be crawled when a `crawl*` or `index*` method is
+    # subsequently called. Calling this is optional as the URL can be
+    # passed to the method instead. You can also omit the url param and just
+    # use the block to configure the crawler instead.
+    #
+    # @param urls [*String, *Wgit::Url] The URL(s) to crawl
+    #   or nil (if only using the block to configure the crawler).
+    # @yield [crawler] The crawler that'll be used in the subsequent
+    #   crawl/index; use the block to configure.
+    def start(*urls, &block)
+      crawler(&block)
+      @dsl_start = urls
+    end
+
+    # Sets the xpath to be followed when `crawl_site` or `index_site` is
+    # subsequently called. Calling this method is optional as the default is to
+    # follow all `<a>` href's that point to the site domain. You can also pass
+    # `follow:` to the crawl/index methods directly.
+    #
+    # @param xpath [String] The xpath which is followed when crawling/indexing
+    #   a site. Use `:default` to restore the default follow logic.
+    def follow(xpath)
+      @dsl_follow = xpath
+    end
+
+    # Crawls one or more individual urls using `Wgit::Crawler#crawl_url`
+    # underneath. If no urls are provided, then the `start` URL is used.
+    #
+    # @param urls [*Wgit::Url] The URL's to crawl. Defaults to the `start`
+    #   URL(s).
+    # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+    #   redirects. Pass a Symbol to limit where the redirect is allowed to go
+    #   e.g. :host only allows redirects within the same host. Choose from
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+    #   This value will be used for all urls crawled.
+    # @yield [doc] Given each crawled page (Wgit::Document); this is the only
+    #   way to interact with them.
+    # @raise [StandardError] If no urls are provided and no `start` URL has
+    #   been set.
+    # @return [Wgit::Document] The last Document crawled.
+    def crawl(*urls, follow_redirects: true, &block)
+      urls = (@dsl_start || []) if urls.empty?
+      raise DSL_ERROR__NO_START_URL if urls.empty?
+
+      urls.map! { |url| Wgit::Url.parse(url) }
+      crawler.crawl_urls(*urls, follow_redirects: follow_redirects, &block)
+    end
+
+    # Crawls an entire site using `Wgit::Crawler#crawl_site` underneath. If no
+    # url is provided, then the first `start` URL is used.
+    #
+    # @param urls [*String, *Wgit::Url] The base URL(s) of the website(s) to be
+    #   crawled. It is recommended that this URL be the index page of the site
+    #   to give a greater chance of finding all pages within that site/host.
+    #   Defaults to the `start` URLs.
+    # @param follow [String] The xpath extracting links to be followed during
+    #   the crawl. This changes how a site is crawled. Only links pointing to
+    #   the site domain are allowed. The `:default` is any `<a>` href returning
+    #   HTML. This can also be set using `follow`.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    #   selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
+    # @yield [doc] Given each crawled page (Wgit::Document) of the site.
+    #   A block is the only way to interact with each crawled Document.
+    #   Use `doc.empty?` to determine if the page is valid.
+    # @raise [StandardError] If no url is provided and no `start` URL has been
+    #   set.
+    # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
+    #   from all of the site's pages or nil if the given url could not be
+    #   crawled successfully.
+    def crawl_site(
+      *urls, follow: @dsl_follow,
+      allow_paths: nil, disallow_paths: nil, &block
+    )
+      urls = (@dsl_start || []) if urls.empty?
+      raise DSL_ERROR__NO_START_URL if urls.empty?
+
+      xpath = follow || :default
+      opts = {
+        follow: xpath, allow_paths: allow_paths, disallow_paths: disallow_paths
+      }
+
+      urls.reduce([]) do |externals, url|
+        externals + crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)
+      end
+    end
+
+    # Returns the DSL's `crawler#last_response`.
+    #
+    # @return [Wgit::Response] The response from the last URL crawled.
+    def last_response
+      crawler.last_response
+    end
+
+    # Nilifies the DSL instance variables.
+    def reset
+      @dsl_crawler = nil
+      @dsl_start = nil
+      @dsl_follow = nil
+      @dsl_conn_str = nil
+    end
+
+    ### INDEXER METHODS ###
+
+    # Defines the connection string to the database used in subsequent `index*`
+    # method calls. This method is optional as the connection string can be
+    # passed to the index method instead.
+    #
+    # @param conn_str [String] The connection string used to connect to the
+    #   database in subsequent `index*` method calls.
+    def connection_string(conn_str)
+      @dsl_conn_str = conn_str
+    end
+
+    # Indexes the World Wide Web using `Wgit::Indexer#index_www` underneath.
+    #
+    # @param connection_string [String] The database connection string. Set as
+    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+    #   `connection_string`.
+    # @param max_sites [Integer] The number of separate and whole
+    #   websites to be crawled before the method exits. Defaults to -1 which
+    #   means the crawl will occur until manually stopped (Ctrl+C etc).
+    # @param max_data [Integer] The maximum amount of bytes that will be
+    #   scraped from the web (default is 1GB). Note, that this value is used to
+    #   determine when to stop crawling; it's not a guarantee of the max data
+    #   that will be obtained.
+    def index_www(
+      connection_string: @dsl_conn_str, max_sites: -1, max_data: 1_048_576_000
+    )
+      db = Wgit::Database.new(connection_string)
+      indexer = Wgit::Indexer.new(db, crawler)
+
+      indexer.index_www(max_sites: max_sites, max_data: max_data)
+    end
+
+    # Indexes a single website using `Wgit::Indexer#index_site` underneath.
+    #
+    # @param urls [*String, *Wgit::Url] The base URL(s) of the website(s) to
+    #   crawl. Can be set using `start`.
+    # @param connection_string [String] The database connection string. Set as
+    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+    #   `connection_string`.
+    # @param insert_externals [Boolean] Whether or not to insert the website's
+    #   external URL's into the database.
+    # @param follow [String] The xpath extracting links to be followed during
+    #   the crawl. This changes how a site is crawled. Only links pointing to
+    #   the site domain are allowed. The `:default` is any `<a>` href returning
+    #   HTML. This can also be set using `follow`.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    #   selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
+    # @yield [doc] Given the Wgit::Document of each crawled webpage, before it
+    #   is inserted into the database allowing for prior manipulation.
+    # @raise [StandardError] If no url is provided and no `start` URL has been
+    #   set.
+    # @return [Integer] The total number of pages crawled within the website.
+    def index_site(
+      *urls, connection_string: @dsl_conn_str,
+      insert_externals: false, follow: @dsl_follow,
+      allow_paths: nil, disallow_paths: nil, &block
+    )
+      urls = (@dsl_start || []) if urls.empty?
+      raise DSL_ERROR__NO_START_URL if urls.empty?
+
+      db = Wgit::Database.new(connection_string)
+      indexer = Wgit::Indexer.new(db, crawler)
+      xpath = follow || :default
+      crawl_opts = {
+        insert_externals: insert_externals, follow: xpath,
+        allow_paths: allow_paths, disallow_paths: disallow_paths
+      }
+
+      urls.reduce(0) do |total, url|
+        total + indexer.index_site(Wgit::Url.parse(url), **crawl_opts, &block)
+      end
+    end
+
+    # Indexes a single webpage using `Wgit::Indexer#index_url` underneath.
+    #
+    # @param urls [*Wgit::Url] The webpage URL's to crawl. Defaults to the
+    #   `start` URL(s).
+    # @param connection_string [String] The database connection string. Set as
+    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+    #   `connection_string`.
+    # @param insert_externals [Boolean] Whether or not to insert the website's
+    #   external URL's into the database.
+    # @yield [doc] Given the Wgit::Document of the crawled webpage,
+    #   before it's inserted into the database allowing for prior
+    #   manipulation. Return nil or false from the block to prevent the
+    #   document from being saved into the database.
+    # @raise [StandardError] If no urls are provided and no `start` URL has
+    #   been set.
+    def index(
+      *urls, connection_string: @dsl_conn_str,
+      insert_externals: false, &block
+    )
+      urls = (@dsl_start || []) if urls.empty?
+      raise DSL_ERROR__NO_START_URL if urls.empty?
+
+      db = Wgit::Database.new(connection_string)
+      indexer = Wgit::Indexer.new(db, crawler)
+
+      urls.map! { |url| Wgit::Url.parse(url) }
+      indexer.index_urls(*urls, insert_externals: insert_externals, &block)
+    end
+
+    # Performs a search of the database's indexed documents and pretty prints
+    # the results in a search engine-esque format. See `Wgit::Database#search!`
+    # and `Wgit::Document#search!` for details of how the search works.
+    #
+    # @param query [String] The text query to search with.
+    # @param connection_string [String] The database connection string. Set as
+    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
+    #   `connection_string`.
+    # @param stream [nil, #puts] Any object that respond_to?(:puts). It is used
+    #   to output text somewhere e.g. a file or STDERR. Use nil for no output.
+    # @param case_sensitive [Boolean] Whether character case must match.
+    # @param whole_sentence [Boolean] Whether multiple words should be searched
+    #   for separately.
+    # @param limit [Integer] The max number of results to print.
+    # @param skip [Integer] The number of DB records to skip.
+    # @param sentence_limit [Integer] The max length of each result's text
+    #   snippet.
+    # @yield [doc] Given each search result (Wgit::Document) returned from the
+    #   database containing only its matching `#text`.
+    # @return [Array<Wgit::Document>] The search results with matching text.
+    def search(
+      query, connection_string: @dsl_conn_str, stream: STDOUT,
+      case_sensitive: false, whole_sentence: true,
+      limit: 10, skip: 0, sentence_limit: 80, &block
+    )
+      stream ||= File.open(File::NULL, 'w')
+      db = Wgit::Database.new(connection_string)
+
+      results = db.search!(
+        query,
+        case_sensitive: case_sensitive,
+        whole_sentence: whole_sentence,
+        limit: limit,
+        skip: skip,
+        sentence_limit: sentence_limit,
+        &block
+      )
+
+      Wgit::Utils.printf_search_results(results, stream: stream)
+
+      results
+    end
+
+    # Deletes everything in the urls and documents collections by calling
+    # `Wgit::Database#clear_db` underneath. This will nuke the entire database
+    # so yeah... be careful.
+    #
+    # @return [Integer] The number of deleted records.
+    def clear_db!(connection_string: @dsl_conn_str)
+      db = Wgit::Database.new(connection_string)
+      db.clear_db
+    end
+
+    alias crawl_r crawl_site
+    alias index_r index_site
+    alias start_urls start
+  end
+end
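Taken together, the new `Wgit::DSL` module drives the crawler without manually wiring up `Wgit::Crawler`, `Wgit::Database` or `Wgit::Indexer`. A rough usage sketch based on the methods above; the URL is a placeholder and the `index_site`/`search` calls assume a reachable database via `ENV['WGIT_CONNECTION_STRING']`:

```ruby
require 'wgit'

include Wgit::DSL

start 'http://example.com'                  # placeholder URL used by crawl*/index* below
extract :headings, '//h2', singleton: false # custom field, available as doc.headings

crawl_site do |doc|
  puts doc.headings.inspect unless doc.empty?
end

index_site        # persist the site (needs a connected database)
search 'example'  # pretty-print matching documents from the database
```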
data/lib/wgit/indexer.rb
CHANGED
@@ -4,129 +4,8 @@ require_relative 'crawler'
 require_relative 'database/database'
 
 module Wgit
-  #
-  # Wgit::
-  #
-  # Retrieves uncrawled url's from the database and recursively crawls each
-  # site storing their internal pages into the database and adding their
-  # external url's to be crawled later on. Logs info on the crawl
-  # using Wgit.logger as it goes along.
-  #
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param max_sites [Integer] The number of separate and whole
-  #   websites to be crawled before the method exits. Defaults to -1 which
-  #   means the crawl will occur until manually stopped (Ctrl+C etc).
-  # @param max_data [Integer] The maximum amount of bytes that will be
-  #   scraped from the web (default is 1GB). Note, that this value is used to
-  #   determine when to stop crawling; it's not a guarantee of the max data
-  #   that will be obtained.
-  def self.index_www(
-    connection_string: nil, max_sites: -1, max_data: 1_048_576_000
-  )
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_www(max_sites: max_sites, max_data: max_data)
-  end
-
-  # Convience method to index a single website using
-  # Wgit::Indexer#index_site.
-  #
-  # Crawls a single website's pages and stores them into the database.
-  # There is no max download limit so be careful which sites you index.
-  #
-  # @param url [Wgit::Url, String] The base Url of the website to crawl.
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param insert_externals [Boolean] Whether or not to insert the website's
-  #   external Url's into the database.
-  # @param allow_paths [String, Array<String>] Filters links by selecting
-  #   them if their path `File.fnmatch?` one of allow_paths.
-  # @param disallow_paths [String, Array<String>] Filters links by rejecting
-  #   them if their path `File.fnmatch?` one of disallow_paths.
-  # @yield [doc] Given the Wgit::Document of each crawled webpage, before it's
-  #   inserted into the database allowing for prior manipulation.
-  # @return [Integer] The total number of pages crawled within the website.
-  def self.index_site(
-    url, connection_string: nil, insert_externals: true,
-    allow_paths: nil, disallow_paths: nil, &block
-  )
-    url = Wgit::Url.parse(url)
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_site(
-      url, insert_externals: insert_externals,
-      allow_paths: allow_paths, disallow_paths: disallow_paths, &block
-    )
-  end
-
-  # Convience method to index a single webpage using
-  # Wgit::Indexer#index_page.
-  #
-  # Crawls a single webpage and stores it into the database.
-  # There is no max download limit so be careful of large pages.
-  #
-  # @param url [Wgit::Url, String] The Url of the webpage to crawl.
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param insert_externals [Boolean] Whether or not to insert the website's
-  #   external Url's into the database.
-  # @yield [doc] Given the Wgit::Document of the crawled webpage, before it's
-  #   inserted into the database allowing for prior manipulation.
-  def self.index_page(
-    url, connection_string: nil, insert_externals: true, &block
-  )
-    url = Wgit::Url.parse(url)
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_page(url, insert_externals: insert_externals, &block)
-  end
-
-  # Performs a search of the database's indexed documents and pretty prints
-  # the results. See Wgit::Database#search and Wgit::Document#search for
-  # details of how the search works.
-  #
-  # @param query [String] The text query to search with.
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param case_sensitive [Boolean] Whether character case must match.
-  # @param whole_sentence [Boolean] Whether multiple words should be searched
-  #   for separately.
-  # @param limit [Integer] The max number of results to print.
-  # @param skip [Integer] The number of DB records to skip.
-  # @param sentence_limit [Integer] The max length of each result's text
-  #   snippet.
-  # @yield [doc] Given each search result (Wgit::Document) returned from the
-  #   database.
-  def self.indexed_search(
-    query, connection_string: nil,
-    case_sensitive: false, whole_sentence: true,
-    limit: 10, skip: 0, sentence_limit: 80, &block
-  )
-    db = Wgit::Database.new(connection_string)
-
-    results = db.search(
-      query,
-      case_sensitive: case_sensitive,
-      whole_sentence: whole_sentence,
-      limit: limit,
-      skip: skip,
-      &block
-    )
-
-    results.each do |doc|
-      doc.search!(
-        query,
-        case_sensitive: case_sensitive,
-        whole_sentence: whole_sentence,
-        sentence_limit: sentence_limit
-      )
-    end
-
-    Wgit::Utils.printf_search_results(results)
-  end
-
-  # Class which crawls and saves the indexed Documents to a database.
+  # Class which crawls and saves the Documents to a database. Can be thought of
+  # as a combination of Wgit::Crawler and Wgit::Database.
   class Indexer
     # The crawler used to index the WWW.
     attr_reader :crawler
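The module-level convenience methods removed here (`Wgit.index_www`, `Wgit.index_site`, `Wgit.index_page`, `Wgit.indexed_search`) have counterparts in the new `Wgit::DSL` (`index_www`, `index_site`, `index`/`index_urls` and `search`). A hedged migration sketch; the connection string and URL are placeholders:

```ruby
require 'wgit'

include Wgit::DSL

# 0.8.0:
#   Wgit.index_site('http://example.com', insert_externals: true)
#   Wgit.indexed_search('ruby')

# 0.9.0:
connection_string 'mongodb://localhost/test' # or rely on ENV['WGIT_CONNECTION_STRING']
index_site 'http://example.com', insert_externals: true
search 'ruby'
```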
@@ -139,7 +18,7 @@ module Wgit
     # @param database [Wgit::Database] The database instance (already
     #   initialized and connected) used to index.
     # @param crawler [Wgit::Crawler] The crawler instance used to index.
-    def initialize(database, crawler = Wgit::Crawler.new)
+    def initialize(database = Wgit::Database.new, crawler = Wgit::Crawler.new)
       @db = database
       @crawler = crawler
     end
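With the defaulted `database` parameter, an `Indexer` can be built with no arguments at all; based on the connection-string documentation elsewhere in this diff, `Wgit::Database.new` is assumed to fall back to `ENV['WGIT_CONNECTION_STRING']`:

```ruby
require 'wgit'

# 0.8.0 required an explicit database instance:
#   indexer = Wgit::Indexer.new(Wgit::Database.new(ENV['WGIT_CONNECTION_STRING']))

# 0.9.0 defaults both arguments (assumes ENV['WGIT_CONNECTION_STRING'] is set):
indexer = Wgit::Indexer.new
indexer.crawler # => the default Wgit::Crawler instance
```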
@@ -189,7 +68,8 @@ database capacity, exiting.")
 
       site_docs_count = 0
       ext_links = @crawler.crawl_site(url) do |doc|
-
+        unless doc.empty?
+          write_doc_to_db(doc)
           docs_count += 1
           site_docs_count += 1
         end
@@ -198,12 +78,9 @@ database capacity, exiting.")
         raise 'Error updating url' unless @db.update(url) == 1
 
         urls_count += write_urls_to_db(ext_links)
-
-        Wgit.logger.info("Crawled and saved #{site_docs_count} docs for the \
-site: #{url}")
       end
 
-      Wgit.logger.info("Crawled and
+      Wgit.logger.info("Crawled and indexed docs for #{docs_count} url(s) \
 overall for this iteration.")
       Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
 the next iteration.")
@@ -219,66 +96,91 @@ the next iteration.")
     # @param url [Wgit::Url] The base Url of the website to crawl.
     # @param insert_externals [Boolean] Whether or not to insert the website's
     #   external Url's into the database.
-    # @param
-    #
-    #
-    #
+    # @param follow [String] The xpath extracting links to be followed during
+    #   the crawl. This changes how a site is crawled. Only links pointing to
+    #   the site domain are allowed. The `:default` is any `<a>` href returning
+    #   HTML.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    #   selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
     # @yield [doc] Given the Wgit::Document of each crawled web page before
     #   it's inserted into the database allowing for prior manipulation. Return
     #   nil or false from the block to prevent the document from being saved
     #   into the database.
     # @return [Integer] The total number of webpages/documents indexed.
     def index_site(
-      url, insert_externals:
+      url, insert_externals: false, follow: :default,
+      allow_paths: nil, disallow_paths: nil
     )
-      crawl_opts = {
+      crawl_opts = {
+        follow: follow,
+        allow_paths: allow_paths,
+        disallow_paths: disallow_paths
+      }
       total_pages_indexed = 0
 
-      ext_urls = @crawler.crawl_site(url, crawl_opts) do |doc|
-        result = true
-        result = yield(doc) if block_given?
+      ext_urls = @crawler.crawl_site(url, **crawl_opts) do |doc|
+        result = block_given? ? yield(doc) : true
 
-        if result && !doc.empty?
+        if result && !doc.empty?
+          write_doc_to_db(doc)
           total_pages_indexed += 1
-          Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
         end
       end
 
-      @db.
+      @db.upsert(url)
 
       if insert_externals && ext_urls
        num_inserted_urls = write_urls_to_db(ext_urls)
        Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
      end
 
-      Wgit.logger.info("Crawled and
-site: #{url}")
+      Wgit.logger.info("Crawled and indexed #{total_pages_indexed} docs for \
+the site: #{url}")
 
       total_pages_indexed
     end
 
+    # Crawls one or more webpages and stores them into the database.
+    # There is no max download limit so be careful of large pages.
+    # Logs info on the crawl using Wgit.logger as it goes along.
+    #
+    # @param urls [*Wgit::Url] The webpage Url's to crawl.
+    # @param insert_externals [Boolean] Whether or not to insert the webpages
+    #   external Url's into the database.
+    # @yield [doc] Given the Wgit::Document of the crawled webpage,
+    #   before it's inserted into the database allowing for prior
+    #   manipulation. Return nil or false from the block to prevent the
+    #   document from being saved into the database.
+    # @raise [StandardError] if no urls are provided.
+    def index_urls(*urls, insert_externals: false, &block)
+      raise 'You must provide at least one Url' if urls.empty?
+
+      opts = { insert_externals: insert_externals }
+      Wgit::Utils.each(urls) { |url| index_url(url, **opts, &block) }
+
+      nil
+    end
+
     # Crawls a single webpage and stores it into the database.
     # There is no max download limit so be careful of large pages.
     # Logs info on the crawl using Wgit.logger as it goes along.
     #
     # @param url [Wgit::Url] The webpage Url to crawl.
-    # @param insert_externals [Boolean] Whether or not to insert the
+    # @param insert_externals [Boolean] Whether or not to insert the webpages
     #   external Url's into the database.
     # @yield [doc] Given the Wgit::Document of the crawled webpage,
     #   before it's inserted into the database allowing for prior
     #   manipulation. Return nil or false from the block to prevent the
     #   document from being saved into the database.
-    def
+    def index_url(url, insert_externals: false)
       document = @crawler.crawl_url(url) do |doc|
-        result = true
-
-
-        if result && !doc.empty? && write_doc_to_db(doc)
-          Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
-        end
+        result = block_given? ? yield(doc) : true
+        write_doc_to_db(doc) if result && !doc.empty?
       end
 
-      @db.
+      @db.upsert(url)
 
       ext_urls = document&.external_links
       if insert_externals && ext_urls
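The new `Indexer#index_urls` (aliased to `#index` further down) indexes an arbitrary list of individual pages by delegating to `#index_url` for each one. A usage sketch against the code added above; the URLs are placeholders and a reachable database is assumed:

```ruby
require 'wgit'

indexer = Wgit::Indexer.new(Wgit::Database.new) # DB from ENV['WGIT_CONNECTION_STRING']

indexer.index_urls(
  Wgit::Url.new('http://example.com'),
  Wgit::Url.new('http://example.com/about')
) do |doc|
  # Return nil/false to skip saving a page; anything truthy saves it.
  !doc.empty?
end
```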
@@ -311,23 +213,19 @@ site: #{url}")
     # collection deliberately prevents duplicate inserts.
     #
     # @param doc [Wgit::Document] The document to write to the DB.
-    # @return [Boolean] True if the write was successful, false otherwise.
     def write_doc_to_db(doc)
-      @db.
-
-
-
-
-      Wgit.logger.info("Document already exists: #{doc.url}")
-
-      false
+      if @db.upsert(doc)
+        Wgit.logger.info("Saved document for url: #{doc.url}")
+      else
+        Wgit.logger.info("Updated document for url: #{doc.url}")
+      end
     end
 
     # Write the urls to the DB. Note that the unique url index on the urls
     # collection deliberately prevents duplicate inserts.
     #
     # @param urls [Array<Wgit::Url>] The urls to write to the DB.
-    # @return [
+    # @return [Integer] The number of inserted urls.
     def write_urls_to_db(urls)
       count = 0
 
@@ -341,6 +239,7 @@ site: #{url}")
 
         @db.insert(url)
         count += 1
+
         Wgit.logger.info("Inserted external url: #{url}")
       rescue Mongo::Error::OperationFailure
         Wgit.logger.info("External url already exists: #{url}")
@@ -348,5 +247,9 @@ site: #{url}")
 
       count
     end
+
+    alias database db
+    alias index index_urls
+    alias index_r index_site
   end
 end