wgit 0.11.0 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +48 -0
- data/CODE_OF_CONDUCT.md +1 -1
- data/CONTRIBUTING.md +1 -1
- data/README.md +27 -24
- data/bin/wgit +72 -18
- data/lib/wgit/assertable.rb +33 -6
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +91 -20
- data/lib/wgit/database/adapters/in_memory.rb +204 -0
- data/lib/wgit/database/adapters/mongo_db.rb +627 -0
- data/lib/wgit/database/database.rb +18 -663
- data/lib/wgit/database/database_adapter.rb +147 -0
- data/lib/wgit/document.rb +187 -77
- data/lib/wgit/document_extractors.rb +15 -23
- data/lib/wgit/dsl.rb +64 -67
- data/lib/wgit/html_to_text.rb +277 -0
- data/lib/wgit/indexer.rb +29 -10
- data/lib/wgit/logger.rb +2 -2
- data/lib/wgit/model.rb +164 -0
- data/lib/wgit/response.rb +5 -8
- data/lib/wgit/robots_parser.rb +8 -8
- data/lib/wgit/url.rb +38 -38
- data/lib/wgit/utils.rb +124 -14
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +18 -14
- metadata +74 -30
- data/lib/wgit/database/model.rb +0 -60
data/lib/wgit/dsl.rb
CHANGED

@@ -44,14 +44,14 @@ the 'start' function".freeze
       Wgit::Document.define_extractor(var, xpath, opts, &block)
     end
 
-    #
-    #
-    # more details.
+    # Sets and returns the Wgit::Crawler used in subsequent crawls including
+    # indexing. Defaults to `Wgit::Crawler.new` if not given a param. See the
+    # Wgit::Crawler documentation for more details.
     #
-    # @yield [crawler]
-    # @return [Wgit::Crawler] The
-    def crawler
-      @dsl_crawler
+    # @yield [crawler] Given the DSL crawler; use the block to configure.
+    # @return [Wgit::Crawler] The crawler instance used by the DSL.
+    def use_crawler(crawler = nil)
+      @dsl_crawler = crawler || @dsl_crawler || Wgit::Crawler.new
       yield @dsl_crawler if block_given?
       @dsl_crawler
     end

@@ -66,7 +66,7 @@ the 'start' function".freeze
     # @yield [crawler] The crawler that'll be used in the subsequent
     # crawl/index; use the block to configure.
     def start(*urls, &block)
-
+      use_crawler(&block) if block_given?
      @dsl_start = urls
    end
 

@@ -101,7 +101,7 @@ the 'start' function".freeze
       raise DSL_ERROR__NO_START_URL if urls.empty?
 
       urls.map! { |url| Wgit::Url.parse(url) }
-
+      get_crawler.crawl_urls(*urls, follow_redirects:, &block)
     end
 
     # Crawls an entire site using `Wgit::Crawler#crawl_site` underneath. If no

@@ -138,42 +138,41 @@ the 'start' function".freeze
       opts = { follow: xpath, allow_paths:, disallow_paths: }
 
       urls.reduce([]) do |externals, url|
-        externals +
+        externals + get_crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)
       end
     end
 
-    # Returns the DSL's `
+    # Returns the DSL's `Wgit::Crawler#last_response`.
     #
     # @return [Wgit::Response] The response from the last URL crawled.
     def last_response
-
+      get_crawler.last_response
     end
 
     # Nilifies the DSL instance variables.
     def reset
-      @dsl_crawler
-      @dsl_start
-      @dsl_follow
-      @
+      @dsl_crawler = nil
+      @dsl_start = nil
+      @dsl_follow = nil
+      @dsl_db = nil
     end
 
     ### INDEXER METHODS ###
 
-    # Defines the
-    # method calls. This method is optional as
-    #
+    # Defines the connected database instance used in subsequent index and DB
+    # method calls. This method is optional however, as a new instance of the
+    # Wgit::Database.adapter_class will be initialised otherwise. Therefore
+    # if not calling this method, you should ensure
+    # ENV['WGIT_CONNECTION_STRING'] is set or the connection will fail.
     #
-    # @param
-    #
-    def
-      @
+    # @param db [Wgit::Database::DatabaseAdapter] The connected database
+    #   instance used in subsequent `index*` method calls.
+    def use_database(db)
+      @dsl_db = db
     end
 
     # Indexes the World Wide Web using `Wgit::Indexer#index_www` underneath.
     #
-    # @param connection_string [String] The database connection string. Set as
-    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
-    #   `connection_string`.
     # @param max_sites [Integer] The number of separate and whole
     #   websites to be crawled before the method exits. Defaults to -1 which
     #   means the crawl will occur until manually stopped (Ctrl+C etc).

@@ -181,11 +180,8 @@ the 'start' function".freeze
     #   scraped from the web (default is 1GB). Note, that this value is used to
     #   determine when to stop crawling; it's not a guarantee of the max data
     #   that will be obtained.
-    def index_www(
-
-    )
-      db = Wgit::Database.new(connection_string)
-      indexer = Wgit::Indexer.new(db, crawler)
+    def index_www(max_sites: -1, max_data: 1_048_576_000)
+      indexer = Wgit::Indexer.new(get_db, get_crawler)
 
       indexer.index_www(max_sites:, max_data:)
     end

@@ -194,9 +190,6 @@ the 'start' function".freeze
     #
     # @param urls [*String, *Wgit::Url] The base URL(s) of the website(s) to
     #   crawl. Can be set using `start`.
-    # @param connection_string [String] The database connection string. Set as
-    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
-    #   `connection_string`.
     # @param insert_externals [Boolean] Whether or not to insert the website's
     #   external URL's into the database.
     # @param follow [String] The xpath extracting links to be followed during

@@ -213,15 +206,13 @@ the 'start' function".freeze
     #   set.
     # @return [Integer] The total number of pages crawled within the website.
     def index_site(
-      *urls,
-      insert_externals: false, follow: @dsl_follow,
+      *urls, insert_externals: false, follow: @dsl_follow,
       allow_paths: nil, disallow_paths: nil, &block
     )
       urls = (@dsl_start || []) if urls.empty?
       raise DSL_ERROR__NO_START_URL if urls.empty?
 
-
-      indexer = Wgit::Indexer.new(db, crawler)
+      indexer = Wgit::Indexer.new(get_db, get_crawler)
       xpath = follow || :default
       crawl_opts = {
         insert_externals:, follow: xpath, allow_paths:, disallow_paths:

@@ -236,9 +227,6 @@ the 'start' function".freeze
     #
     # @param urls [*Wgit::Url] The webpage URL's to crawl. Defaults to the
     #   `start` URL(s).
-    # @param connection_string [String] The database connection string. Set as
-    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
-    #   `connection_string`.
     # @param insert_externals [Boolean] Whether or not to insert the website's
     #   external URL's into the database.
     # @yield [doc] Given the Wgit::Document of the crawled webpage,

@@ -247,15 +235,11 @@ the 'start' function".freeze
     #   document from being saved into the database.
     # @raise [StandardError] If no urls are provided and no `start` URL has
     #   been set.
-    def index(
-      *urls, connection_string: @dsl_conn_str,
-      insert_externals: false, &block
-    )
+    def index(*urls, insert_externals: false, &block)
       urls = (@dsl_start || []) if urls.empty?
       raise DSL_ERROR__NO_START_URL if urls.empty?
 
-
-      indexer = Wgit::Indexer.new(db, crawler)
+      indexer = Wgit::Indexer.new(get_db, get_crawler)
 
       urls.map! { |url| Wgit::Url.parse(url) }
       indexer.index_urls(*urls, insert_externals:, &block)

@@ -264,13 +248,11 @@ the 'start' function".freeze
     ### DATABASE METHODS ###
 
     # Performs a search of the database's indexed documents and pretty prints
-    # the results in a search engine-esque format. See
-    # and `Wgit::Document#search!`
+    # the results in a search engine-esque format. See
+    # `Wgit::Database::DatabaseAdapter#search` and `Wgit::Document#search!`
+    # for details of how the search methods work.
     #
     # @param query [String] The text query to search with.
-    # @param connection_string [String] The database connection string. Set as
-    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
-    #   `connection_string`.
     # @param stream [nil, #puts] Any object that respond_to?(:puts). It is used
     #   to output text somewhere e.g. a file or STDERR. Use nil for no output.
     # @param case_sensitive [Boolean] Whether character case must match.

@@ -284,38 +266,53 @@ the 'start' function".freeze
     #   database containing only its matching `#text`.
     # @return [Array<Wgit::Document>] The search results with matching text.
     def search(
-      query,
+      query, stream: $stdout,
+      top_result_only: true, include_score: false,
       case_sensitive: false, whole_sentence: true,
-      limit: 10, skip: 0, sentence_limit: 80
+      limit: 10, skip: 0, sentence_limit: 80
     )
       stream ||= File.open(File::NULL, 'w')
-      db = Wgit::Database.new(connection_string)
 
-      results =
-        query,
-        case_sensitive:, whole_sentence:,
-        limit:, skip:,
-        sentence_limit:, &block
-      )
+      results = get_db.search(
+        query, case_sensitive:, whole_sentence:, limit:, skip:)
 
-
+      results.each do |doc|
+        doc.search_text!(
+          query, case_sensitive:, whole_sentence:, sentence_limit:)
+        yield(doc) if block_given?
+      end
+
+      if top_result_only
+        Wgit::Utils.pprint_top_search_results(results, include_score:, stream:)
+      else
+        Wgit::Utils.pprint_all_search_results(results, include_score:, stream:)
+      end
 
       results
     end
 
     # Deletes everything in the urls and documents collections by calling
-    # `Wgit::Database#
-    # so yeah... be careful.
+    # `Wgit::Database::DatabaseAdapter#empty` underneath.
     #
     # @return [Integer] The number of deleted records.
-    def
-
-
+    def empty_db!
+      get_db.empty
+    end
+
+    private
+
+    def get_crawler
+      @dsl_crawler ||= Wgit::Crawler.new
+    end
+
+    def get_db
+      @dsl_db ||= Wgit::Database.new
     end
 
     alias_method :crawl_url, :crawl
     alias_method :crawl_r, :crawl_site
     alias_method :index_r, :index_site
+    alias_method :index_url, :index
     alias_method :start_urls, :start
   end
 end
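Taken together, the DSL changes above replace the old `crawler`/`connection_string` plumbing with `use_crawler`/`use_database` plus lazy `get_crawler`/`get_db` defaults. A minimal sketch of how a 0.12.x DSL script might be wired up, assuming `ENV['WGIT_CONNECTION_STRING']` points at a reachable database; the example.com URL and the `time_out` attribute are illustrative assumptions, not taken from this diff:

```ruby
require "wgit"

include Wgit::DSL

# Configure the crawler once; subsequent crawl/index calls reuse it.
use_crawler do |crawler|
  crawler.time_out = 10 # Assumed Wgit::Crawler attribute; see its docs.
end

start "https://example.com"

# Crawl the start URL(s) and print each page's title.
crawl { |doc| puts doc.title }

# Index the whole site, then search and pretty print all matches.
index_site
search "example", top_result_only: false
```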
data/lib/wgit/html_to_text.rb
ADDED

@@ -0,0 +1,277 @@
+require_relative "utils"
+require_relative "assertable"
+require "nokogiri"
+
+module Wgit
+  # Class used to extract the visible page text from a HTML string.
+  # This is in turn used to set the output of a Wgit::Document#text method.
+  class HTMLToText
+    include Assertable
+
+    # Set of text elements used to extract the visible text.
+    # The element's display (:inline or :block) is used to delimit sentences e.g.
+    # <div>foo</div><div>bar</div> will be extracted as ['foo', 'bar'] whereas
+    # <span>foo</span><span>bar</span> will be extracted as ['foobar'].
+    @text_elements = {
+      a: :inline,
+      abbr: :inline,
+      address: :block,
+      article: :block,
+      aside: :block,
+      b: :inline,
+      bdi: :inline,
+      bdo: :inline,
+      blockquote: :block,
+      br: :block,
+      button: :block, # Normally inline but Wgit treats as block.
+      caption: :block,
+      cite: :inline,
+      code: :inline,
+      data: :inline,
+      dd: :block,
+      del: :inline,
+      details: :block,
+      dfn: :inline,
+      div: :block,
+      dl: :block,
+      dt: :block,
+      em: :inline,
+      figcaption: :block,
+      figure: :block,
+      footer: :block,
+      h1: :block,
+      h2: :block,
+      h3: :block,
+      h4: :block,
+      h5: :block,
+      h6: :block,
+      header: :block,
+      hr: :block,
+      i: :inline,
+      input: :inline,
+      ins: :block,
+      kbd: :inline,
+      label: :inline,
+      legend: :block,
+      li: :block,
+      main: :block,
+      mark: :inline,
+      meter: :block,
+      ol: :block,
+      option: :block,
+      output: :block,
+      p: :block,
+      pre: :block,
+      q: :inline,
+      rb: :inline,
+      rt: :inline,
+      ruby: :inline,
+      s: :inline,
+      samp: :inline,
+      section: :block,
+      small: :inline,
+      span: :inline,
+      strong: :inline,
+      sub: :inline,
+      summary: :block,
+      sup: :inline,
+      td: :block,
+      textarea: :block,
+      th: :block,
+      time: :inline,
+      u: :inline,
+      ul: :block,
+      var: :inline,
+      wbr: :inline
+    }
+
+    class << self
+      # Set of HTML elements that make up the visible text on a page. These
+      # elements are used to initialize the Wgit::Document#text. See the
+      # README.md for how to add to this Hash dynamically.
+      attr_reader :text_elements
+    end
+
+    # The Nokogiri::HTML document object initialized from a HTML string.
+    attr_reader :parser
+
+    # Creates a new HTML to text extractor instance.
+    #
+    # @param parser [Nokogiri::HTML4::Document] The nokogiri parser object.
+    # @raise [StandardError] If the given parser is of an invalid type.
+    def initialize(parser)
+      assert_type(parser, Nokogiri::HTML4::Document)
+
+      @parser = parser
+    end
+
+    # Extracts and returns the text sentences from the @parser HTML.
+    #
+    # @return [Array<String>] An array of unique text sentences.
+    def extract_arr
+      return [] if @parser.to_s.empty?
+
+      text_str = extract_str
+
+      # Split the text_str into an Array of text sentences.
+      text_str
+        .split("\n")
+        .map(&:strip)
+        .reject(&:empty?)
+    end
+
+    # Extracts and returns a text string from the @parser HTML.
+    #
+    # @return [String] A string of text with \n delimiting sentences.
+    def extract_str
+      text_str = ""
+
+      iterate_child_nodes(@parser) do |node, display|
+        # Handle any special cases e.g. skip nodes we don't care about...
+        # <pre> nodes should have their contents displayed exactly as is.
+        if node_name(node) == :pre
+          text_str << "\n"
+          text_str << node.text
+          next
+        end
+
+        # Skip any child node of <pre> since they're handled as a special case above.
+        next if child_of?(:pre, node)
+
+        if node.text?
+          # Skip any text element that is purely whitespace.
+          next unless valid_text_content?(node.text)
+        else
+          # Skip a concrete node if it has other concrete child nodes as these
+          # will be iterated onto later.
+          #
+          # Process if node has no children or one child which is a valid text node.
+          next unless node.children.empty? || parent_of_text_node_only?(node)
+        end
+
+        # Apply display rules deciding if a new line is needed before node.text.
+        add_new_line = false
+        prev = prev_sibling_or_parent(node)
+
+        if node.text?
+          add_new_line = true unless prev && inline?(prev)
+        else
+          add_new_line = true if display == :block
+          add_new_line = true if prev && block?(prev)
+        end
+
+        text_str << "\n" if add_new_line
+        text_str << format_text(node.text)
+      end
+
+      text_str
+        .strip
+        .squeeze("\n")
+        .squeeze(" ")
+    end
+
+    private
+
+    def node_name(node)
+      node.name&.downcase&.to_sym
+    end
+
+    def display(node)
+      name = node_name(node)
+      Wgit::HTMLToText.text_elements[name]
+    end
+
+    def inline?(node)
+      display(node) == :inline
+    end
+
+    def block?(node)
+      display(node) == :block
+    end
+
+    # Returns the previous sibling of node or nil. Only valid text elements are
+    # returned i.e. non duplicates with valid text content.
+    def prev_sibling(node)
+      prev = node.previous
+
+      return nil unless prev
+      return prev unless prev.text?
+      return prev if valid_text_node?(prev) && !contains_new_line?(prev.text)
+      return prev if valid_text_node?(prev) && !format_text(prev.text).strip.empty?
+
+      prev.previous
+    end
+
+    # Returns node's previous sibling, parent or nil; in that order. Only valid
+    # text elements are returned i.e. non duplicates with valid text content.
+    def prev_sibling_or_parent(node)
+      prev = prev_sibling(node)
+      return prev if prev
+
+      node.parent
+    end
+
+    def child_of?(ancestor_name, node)
+      node.ancestors.any? { |ancestor| node_name(ancestor) == ancestor_name }
+    end
+
+    # Returns true if any of the child nodes contain a non empty :text node.
+    def parent_of_text_node?(node)
+      node.children.any? { |child| child.text? && valid_text_content?(child.text) }
+    end
+
+    def parent_of_text_node_only?(node)
+      node.children.size == 1 && parent_of_text_node?(node)
+    end
+
+    # Returns true if text is not empty having removed all new lines.
+    def valid_text_content?(text)
+      !format_text(text).empty?
+    end
+
+    # Returns true if node is a text node.
+    # Duplicate text nodes (that follow a concrete node) are omitted.
+    def valid_text_node?(node)
+      node.text? && node.text != node.parent.text
+    end
+
+    def contains_new_line?(text)
+      ["\n", '\\n'].any? { |new_line| text.include?(new_line) }
+    end
+
+    # Remove special characters including any new lines; as semantic HTML will
+    # typically use <br> and/or block elements to denote a line break.
+    def format_text(text)
+      text
+        .encode("UTF-8", undef: :replace, invalid: :replace)
+        .gsub("\n", "")
+        .gsub('\\n', "")
+        .gsub("\r", "")
+        .gsub('\\r', "")
+        .gsub("\f", "")
+        .gsub('\\f', "")
+        .gsub("\t", "")
+        .gsub('\\t', "")
+        .gsub("‌", "")
+        .gsub(" ", " ")
+        .gsub(" ", " ")
+        .gsub(" ", " ")
+        .gsub(" ", " ")
+        .gsub(" ", " ")
+        .gsub('\u00a0', " ")
+    end
+
+    # Iterate over node and it's child nodes, yielding each to &block.
+    # Only HTMLToText.text_elements or valid :text nodes will be yielded.
+    # Duplicate text nodes (that follow a concrete node) are omitted.
+    def iterate_child_nodes(node, &block)
+      display = display(node)
+      text_node = valid_text_node?(node)
+
+      yield(node, display) if display || text_node
+      node.children.each { |child| iterate_child_nodes(child, &block) }
+    end
+
+    alias_method :extract, :extract_arr
+  end
+end
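Since `HTMLToText` drives `Wgit::Document#text`, the block/inline display rules above are easiest to see in isolation. A small sketch; the HTML snippet is made up and the exact output is hand-derived from the rules above, so treat it as an assumption:

```ruby
require "nokogiri"
require "wgit"

html = "<html><body>" \
       "<div>foo</div><div>bar</div>" \
       "<p><span>baz</span><span>qux</span></p>" \
       "</body></html>"

# With a recent Nokogiri, Nokogiri::HTML returns the
# Nokogiri::HTML4::Document that the constructor asserts on.
parser = Nokogiri::HTML(html)
text   = Wgit::HTMLToText.new(parser)

text.extract_str # => "foo\nbar\nbazqux" - blocks delimit, inlines join.
text.extract_arr # => ["foo", "bar", "bazqux"]
```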
data/lib/wgit/indexer.rb
CHANGED

@@ -1,12 +1,23 @@
 # frozen_string_literal: true
 
+require_relative 'assertable'
 require_relative 'crawler'
-require_relative 'database/
+require_relative 'database/database_adapter'
 
 module Wgit
   # Class which crawls and saves the Documents to a database. Can be thought of
-  # as a combination of Wgit::Crawler and Wgit::Database.
+  # as a combination of Wgit::Crawler and Wgit::Database::DatabaseAdapter.
   class Indexer
+    include Assertable
+
+    # The ENV var used to omit and ignore robots.txt parsing during an index.
+    # Applies to all index_* methods if set in the ENV.
+    WGIT_IGNORE_ROBOTS_TXT = "WGIT_IGNORE_ROBOTS_TXT".freeze
+
+    # The block return value used to skip saving a crawled document to the
+    # database. Applies to all index_* methods that take a block.
+    SKIP_UPSERT = :skip.freeze
+
     # The crawler used to index the WWW.
     attr_reader :crawler
 

@@ -15,10 +26,13 @@ module Wgit
 
     # Initialize the Indexer.
     #
-    # @param database [Wgit::Database] The database instance
-    #   initialized and connected) used
-    # @param crawler [Wgit::Crawler] The crawler instance used
+    # @param database [Wgit::Database::DatabaseAdapter] The database instance
+    #   (already initialized and connected) used for indexing.
+    # @param crawler [Wgit::Crawler] The crawler instance used for indexing.
     def initialize(database = Wgit::Database.new, crawler = Wgit::Crawler.new)
+      assert_type(database, Wgit::Database::DatabaseAdapter)
+      assert_type(crawler, Wgit::Crawler)
+
       @db = database
       @crawler = crawler
     end

@@ -143,11 +157,10 @@ future iterations")
         next if no_index?(@crawler.last_response, doc)
 
         result = block_given? ? yield(doc) : true
+        next if doc.empty? || result == SKIP_UPSERT
 
-
-
-          total_pages_indexed += 1
-        end
+        upsert_doc(doc)
+        total_pages_indexed += 1
       end
 
       upsert_url_and_redirects(url)

@@ -207,7 +220,9 @@ for the site: #{url}")
        break if no_index?(@crawler.last_response, doc)
 
        result = block_given? ? yield(doc) : true
-
+       break if doc.empty? || result == SKIP_UPSERT
+
+       upsert_doc(doc)
      end
 
      upsert_url_and_redirects(url)

@@ -285,6 +300,8 @@ for the site: #{url}")
 
     # Crawls and parses robots.txt file (if found). Returns the parser or nil.
     def parse_robots_txt(url)
+      return nil if ENV[WGIT_IGNORE_ROBOTS_TXT]
+
       robots_url = url.to_origin.join('/robots.txt')
 
       Wgit.logger.info("Crawling for robots.txt: #{robots_url}")

@@ -328,6 +345,8 @@ for the site: #{url}")
 
     # Returns if the last_response or doc #no_index? is true or not.
     def no_index?(last_response, doc)
+      return false if ENV[WGIT_IGNORE_ROBOTS_TXT]
+
       url = last_response.url.to_s
       if last_response.no_index?
         Wgit.logger.info("Skipping page due to no-index response header: #{url}")
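Two new `Indexer` hooks are worth calling out: setting `ENV['WGIT_IGNORE_ROBOTS_TXT']` disables both the robots.txt crawl and the no-index checks, while returning `SKIP_UPSERT` (`:skip`) from an `index_*` block prevents the yielded document from being upserted. A hedged sketch; the `Wgit::Database::InMemory` class name is inferred from the adapter file listed above and the example.com URL is illustrative:

```ruby
require "wgit"

# Any non-nil value switches off robots.txt parsing and no-index checks.
ENV["WGIT_IGNORE_ROBOTS_TXT"] = "1"

db      = Wgit::Database::InMemory.new # Assumed in-memory adapter class.
indexer = Wgit::Indexer.new(db, Wgit::Crawler.new)

indexer.index_site(Wgit::Url.new("https://example.com")) do |doc|
  # Returning :skip (SKIP_UPSERT) stops this document being saved.
  next Wgit::Indexer::SKIP_UPSERT if doc.text.empty?

  doc
end
```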
data/lib/wgit/logger.rb
CHANGED

@@ -2,7 +2,7 @@
 
 # FYI: The default logger is set at the bottom of this file.
 
-require
+require "logger"
 
 module Wgit
   # The Logger instance used by Wgit. Set your own custom logger after

@@ -28,7 +28,7 @@ module Wgit
   #
   # @return [Logger] The default Logger instance.
   def self.default_logger
-    logger = Logger.new($stdout, progname:
+    logger = Logger.new($stdout, progname: "wgit", level: :info)
     logger.formatter = proc do |_severity, _datetime, progname, msg|
       "[#{progname}] #{msg}\n"
     end
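Because the default is a standard Ruby `Logger`, the `default_logger` shown above can be swapped out wholesale, per the "set your own custom logger" comment in the source. A small sketch, assuming `Wgit.logger=` remains the setter; the `crawl.log` path is illustrative:

```ruby
require "logger"
require "wgit"

# Route Wgit's log output to a file instead of $stdout.
Wgit.logger = Logger.new("crawl.log", progname: "wgit", level: :debug)

Wgit.logger.info("indexing started") # Written to ./crawl.log.
```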