RubyGems - wgit - Versions diffs - 0.8.0 → 0.9.0 - Mend

wgit 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

checksums.yaml +4 -4
data/.yardopts +1 -1
data/CHANGELOG.md +39 -0
data/LICENSE.txt +1 -1
data/README.md +118 -323
data/bin/wgit +9 -5
data/lib/wgit.rb +3 -1
data/lib/wgit/assertable.rb +3 -3
data/lib/wgit/base.rb +30 -0
data/lib/wgit/crawler.rb +206 -76
data/lib/wgit/database/database.rb +309 -134
data/lib/wgit/database/model.rb +10 -3
data/lib/wgit/document.rb +138 -95
data/lib/wgit/{document_extensions.rb → document_extractors.rb} +11 -11
data/lib/wgit/dsl.rb +324 -0
data/lib/wgit/indexer.rb +65 -162
data/lib/wgit/response.rb +5 -2
data/lib/wgit/url.rb +133 -31
data/lib/wgit/utils.rb +32 -20
data/lib/wgit/version.rb +2 -1
metadata +26 -14

data/lib/wgit/database/model.rb CHANGED

@@ -14,8 +14,7 @@ module Wgit
       raise 'url must respond_to? :to_h' unless url.respond_to?(:to_h)
       model = url.to_h
-      Wgit::Utils.remove_non_bson_types(model)
+      select_bson_types(model)
     end
     # The data model for a Wgit::Document collection object.
@@ -28,7 +27,7 @@ module Wgit
       model = doc.to_h(include_html: false, include_score: false)
       model['url'] = url(doc.url) # Expand Url String into full object.
-      Wgit::Utils.remove_non_bson_types(model)
+      select_bson_types(model)
     end
     # Common fields when inserting a record into the DB.
@@ -49,5 +48,13 @@ module Wgit
         date_modified: Wgit::Utils.time_stamp
       }
     end
+    # Returns the model having removed non bson types (for use with MongoDB).
+    #
+    # @param model_hash [Hash] The model Hash to sanitize.
+    # @return [Hash] The model Hash with non bson types removed.
+    def self.select_bson_types(model_hash)
+      model_hash.select { |_k, v| v.respond_to?(:bson_type) }
+    end
   end
 end

data/lib/wgit/document.rb CHANGED

@@ -6,19 +6,19 @@ require 'json'
 require 'set'
 module Wgit
-  # Class primarily modeling a HTML web document, although other MIME types
+  # Class modeling/serialising a HTML web document, although other MIME types
   # will work e.g. images etc. Also doubles as a search result when
   # loading Documents from the database via `Wgit::Database#search`.
   #
   # The initialize method dynamically initializes instance variables from the
   # Document HTML / Database object e.g. text. This bit is dynamic so that the
-  # Document class can be easily extended allowing you to pull out the bits of
-  # a webpage that are important to you. See `Wgit::Document.define_extension`.
+  # Document class can be easily extended allowing you to extract the bits of
+  # a webpage that are important to you. See `Wgit::Document.define_extractor`.
   class Document
     include Assertable
-    # Regex for the allowed var names when defining an extension.
-    REGEX_EXTENSION_NAME = /[a-z0-9_]+/.freeze
+    # Regex for the allowed var names when defining an extractor.
+    REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/.freeze
     # Set of text elements used to build Document#text.
     @text_elements = Set.new(%i[
@@ -29,8 +29,8 @@ module Wgit
       summary sup td textarea th time u ul var wbr
     ])
-    # Set of Symbols representing the defined Document extensions.
-    @extensions = Set.new
+    # Set of Symbols representing the defined Document extractors.
+    @extractors = Set.new
     class << self
       # Set of HTML elements that make up the visible text on a page. These
@@ -38,9 +38,9 @@ module Wgit
       # README.md for how to add to this Set dynamically.
       attr_reader :text_elements
-      # Set of Symbols representing the defined Document extensions. Is
-      # read-only. Use Wgit::Document.define_extension for a new extension.
-      attr_reader :extensions
+      # Set of Symbols representing the defined Document extractors. Is
+      # read-only. Use Wgit::Document.define_extractor for a new extractor.
+      attr_reader :extractors
     end
     # The URL of the webpage, an instance of Wgit::Url.
@@ -50,7 +50,7 @@ module Wgit
     attr_reader :html
     # The Nokogiri::HTML document object initialized from @html.
-    attr_reader :doc
+    attr_reader :parser
     # The score is only used following a `Database#search` and records matches.
     attr_reader :score
@@ -62,7 +62,7 @@ module Wgit
     #
     # During initialisation, the Document will call any private
     # `init_*_from_html` and `init_*_from_object` methods it can find. See the
-    # README.md and Wgit::Document.define_extension method for more details.
+    # Wgit::Document.define_extractor method for more details.
     #
     # @param url_or_obj [String, Wgit::Url, #fetch] Either a String
     #   representing a URL or a Hash-like object responding to :fetch. e.g. a
@@ -101,13 +101,16 @@ module Wgit
       xpath
     end
-    # Defines an extension, which is a way to serialise HTML elements into
-    # instance variables upon Document initialization. See the default
-    # extensions defined in 'document_extensions.rb' as examples.
+    # Defines a content extractor, which extracts HTML elements/content
+    # into instance variables upon Document initialization. See the default
+    # extractors defined in 'document_extractors.rb' as examples. Defining an
+    # extractor means that every subsequently crawled/initialized document
+    # will attempt to extract the xpath's content. Use `#xpath` for a one off
+    # content extraction.
     #
-    # Note that defined extensions work for both Documents initialized from
+    # Note that defined extractors work for both Documents initialized from
     # HTML (via Wgit::Crawler methods) and from database objects.
-    # An extension once defined, initializes a private instance variable with
+    # An extractor once defined, initializes a private instance variable with
     # the xpath or database object result(s).
     #
     # When initialising from HTML, a singleton value of true will only
@@ -118,15 +121,17 @@ module Wgit
     # object), then a default will be used. The default value is:
     # `singleton ? nil : []`.
     #
-    # @param var [Symbol] The name of the variable to be initialised.
+    # @param var [Symbol] The name of the variable to be initialised, that will
+    #   contain the extracted content. A getter and setter method is defined
+    #   for the initialised variable.
     # @param xpath [String, #call] The xpath used to find the element(s)
     #   of the webpage. Only used when initializing from HTML.
     #
     #   Pass a callable object (proc etc.) if you want the
     #   xpath value to be derived on Document initialisation (instead of when
-    #   the extension is defined). The call method must return a valid xpath
+    #   the extractor is defined). The call method must return a valid xpath
     #   String.
-    # @param opts [Hash] The options to define an extension with. The
+    # @param opts [Hash] The options to define an extractor with. The
     #   options are only used when initializing from HTML, not the database.
     # @option opts [Boolean] :singleton The singleton option determines
     #   whether or not the result(s) should be in an Array. If multiple
@@ -147,46 +152,50 @@ module Wgit
     #   value. Return the block's value param unchanged if you want to inspect.
     # @raise [StandardError] If the var param isn't valid.
     # @return [Symbol] The given var Symbol if successful.
-    def self.define_extension(var, xpath, opts = {}, &block)
+    def self.define_extractor(var, xpath, opts = {}, &block)
       var = var.to_sym
       defaults = { singleton: true, text_content_only: true }
       opts = defaults.merge(opts)
-      raise "var must match #{REGEX_EXTENSION_NAME}" unless \
-      var =~ REGEX_EXTENSION_NAME
+      raise "var must match #{REGEX_EXTRACTOR_NAME}" unless \
+      var =~ REGEX_EXTRACTOR_NAME
       # Define the private init_*_from_html method for HTML.
       # Gets the HTML's xpath value and creates a var for it.
       func_name = Document.send(:define_method, "init_#{var}_from_html") do
-        result = find_in_html(xpath, opts, &block)
+        result = extract_from_html(xpath, **opts, &block)
         init_var(var, result)
       end
       Document.send(:private, func_name)
       # Define the private init_*_from_object method for a Database object.
       # Gets the Object's 'key' value and creates a var for it.
-      func_name = Document.send(:define_method, "init_#{var}_from_object") do |obj|
-        result = find_in_object(obj, var.to_s, singleton: opts[:singleton], &block)
+      func_name = Document.send(
+        :define_method, "init_#{var}_from_object"
+      ) do |obj|
+        result = extract_from_object(
+          obj, var.to_s, singleton: opts[:singleton], &block
+        )
         init_var(var, result)
       end
       Document.send(:private, func_name)
-      @extensions << var
+      @extractors << var
       var
     end
-    # Removes the `init_*` methods created when an extension is defined.
-    # Therefore, this is the opposing method to `Document.define_extension`.
+    # Removes the `init_*` methods created when an extractor is defined.
+    # Therefore, this is the opposing method to `Document.define_extractor`.
     # Returns true if successful or false if the method(s) cannot be found.
     #
-    # @param var [Symbol] The extension variable already defined.
-    # @return [Boolean] True if the extension `var` was found and removed;
+    # @param var [Symbol] The extractor variable to remove.
+    # @return [Boolean] True if the extractor `var` was found and removed;
     #   otherwise false.
-    def self.remove_extension(var)
+    def self.remove_extractor(var)
       Document.send(:remove_method, "init_#{var}_from_html")
       Document.send(:remove_method, "init_#{var}_from_object")
-      @extensions.delete(var.to_sym)
+      @extractors.delete(var.to_sym)
       true
     rescue NameError
       false
@@ -215,9 +224,9 @@ module Wgit
     # Returns the base URL of this Wgit::Document. The base URL is either the
     # <base> element's href value or @url (if @base is nil). If @base is
-    # present and relative, then @url.to_base + @base is returned. This method
-    # should be used instead of `doc.url.to_base` etc. when manually building
-    # absolute links from relative links; or use `link.prefix_base(doc)`.
+    # present and relative, then @url.to_origin + @base is returned. This method
+    # should be used instead of `doc.url.to_origin` etc. when manually building
+    # absolute links from relative links; or use `link.make_absolute(doc)`.
     #
     # Provide the `link:` parameter to get the correct base URL for that type
     # of link. For example, a link of `#top` would always return @url because
@@ -236,12 +245,16 @@ module Wgit
     # @return [Wgit::Url] The base URL of this Document e.g.
     #   'http://example.com/public'.
     def base_url(link: nil)
-      raise "Document @url ('#{@url}') cannot be relative if <base> is nil" \
       if @url.relative? && @base.nil?
-      raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't be relative" \
+        raise "Document @url ('#{@url}') cannot be relative if <base> is nil"
+      end
       if @url.relative? && @base&.relative?
+        raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't \
+be relative"
+      end
-      get_base = -> { @base.relative? ? @url.to_base.concat(@base) : @base }
+      get_base = -> { @base.relative? ? @url.to_origin.concat(@base) : @base }
       if link
         link = Wgit::Url.new(link)
@@ -253,7 +266,7 @@ module Wgit
         end
       end
-      base_url = @base ? get_base.call : @url.to_base
+      base_url = @base ? get_base.call : @url.to_origin
       base_url.omit_fragment.omit_query
     end
@@ -267,7 +280,7 @@ module Wgit
     def to_h(include_html: false, include_score: true)
       ignore = include_html ? [] : ['@html']
       ignore << '@score' unless include_score
-      ignore << '@doc' # Always ignore Nokogiri @doc.
+      ignore << '@parser' # Always ignore the Nokogiri object.
       Wgit::Utils.to_h(self, ignore: ignore)
     end
@@ -284,7 +297,7 @@ module Wgit
     # Returns a Hash containing this Document's instance variables and
     # their #length (if they respond to it). Works dynamically so that any
-    # user defined extensions (and their created instance vars) will appear in
+    # user defined extractors (and their created instance vars) will appear in
     # the returned Hash as well. The number of text snippets as well as total
     # number of textual bytes are always included in the returned Hash.
     #
@@ -324,21 +337,39 @@ module Wgit
     end
     # Uses Nokogiri's xpath method to search the doc's html and return the
-    # results.
+    # results. Use `#at_xpath` for returning the first result only.
     #
     # @param xpath [String] The xpath to search the @html with.
     # @return [Nokogiri::XML::NodeSet] The result set of the xpath search.
     def xpath(xpath)
-      @doc.xpath(xpath)
+      @parser.xpath(xpath)
     end
-    # Uses Nokogiri's css method to search the doc's html and return the
-    # results.
+    # Uses Nokogiri's `at_xpath` method to search the doc's html and return the
+    # result. Use `#xpath` for returning several results.
+    #
+    # @param xpath [String] The xpath to search the @html with.
+    # @return [Nokogiri::XML::Element] The result of the xpath search.
+    def at_xpath(xpath)
+      @parser.at_xpath(xpath)
+    end
+    # Uses Nokogiri's `css` method to search the doc's html and return the
+    # results. Use `#at_css` for returning the first result only.
     #
     # @param selector [String] The CSS selector to search the @html with.
     # @return [Nokogiri::XML::NodeSet] The result set of the CSS search.
     def css(selector)
-      @doc.css(selector)
+      @parser.css(selector)
+    end
+    # Uses Nokogiri's `at_css` method to search the doc's html and return the
+    # result. Use `#css` for returning several results.
+    #
+    # @param selector [String] The CSS selector to search the @html with.
+    # @return [Nokogiri::XML::Element] The result of the CSS search.
+    def at_css(selector)
+      @parser.at_css(selector)
     end
     # Returns all unique internal links from this Document in relative form.
@@ -356,13 +387,13 @@ module Wgit
       return [] if @links.empty?
       links = @links
-              .select { |link| link.relative?(host: @url.to_base) }
+              .select { |link| link.relative?(host: @url.to_origin) }
               .map(&:omit_base)
               .map do |link| # Map @url.to_host into / as it's a duplicate.
         link.to_host == @url.to_host ? Wgit::Url.new('/') : link
       end
-      Wgit::Utils.process_arr(links)
+      Wgit::Utils.sanitize(links)
     end
     # Returns all unique internal links from this Document in absolute form by
@@ -371,7 +402,7 @@ module Wgit
     #
     # @return [Array<Wgit::Url>] Self's unique internal Url's in absolute form.
     def internal_absolute_links
-      internal_links.map { |link| link.prefix_base(self) }
+      internal_links.map { |link| link.make_absolute(self) }
     end
     # Returns all unique external links from this Document in absolute form.
@@ -382,10 +413,10 @@ module Wgit
       return [] if @links.empty?
       links = @links
-              .reject { |link| link.relative?(host: @url.to_base) }
+              .reject { |link| link.relative?(host: @url.to_origin) }
               .map(&:omit_trailing_slash)
-      Wgit::Utils.process_arr(links)
+      Wgit::Utils.sanitize(links)
     end
     # Searches the @text for the given query and returns the results.
@@ -400,8 +431,8 @@ module Wgit
     # original sentence, whichever is less. The algorithm obviously ensures
     # that the search query is visible somewhere in the sentence.
     #
-    # @param query [String, #to_s] The value to search the document's
-    #   @text for.
+    # @param query [Regexp, #to_s] The regex or text value to search the
+    #   document's @text for.
     # @param case_sensitive [Boolean] Whether character case must match.
     # @param whole_sentence [Boolean] Whether multiple words should be searched
     #   for separately.
@@ -411,12 +442,16 @@ module Wgit
     def search(
       query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
     )
-      query = query.to_s
-      raise 'A search query must be provided' if query.empty?
       raise 'The sentence_limit value must be even' if sentence_limit.odd?
-      query   = query.gsub(' ', '|') unless whole_sentence
-      regex   = Regexp.new(query, !case_sensitive)
+      if query.is_a?(Regexp)
+        regex = query
+      else # respond_to? #to_s == true
+        query = query.to_s
+        query = query.gsub(' ', '|') unless whole_sentence
+        regex = Regexp.new(query, !case_sensitive)
+      end
       results = {}
       @text.each do |sentence|
@@ -443,8 +478,8 @@ module Wgit
     # functionality. The original text is returned; no other reference to it
     # is kept thereafter.
     #
-    # @param query [String, #to_s] The value to search the document's
-    #   @text for.
+    # @param query [Regexp, #to_s] The regex or text value to search the
+    #   document's @text for.
     # @param case_sensitive [Boolean] Whether character case must match.
     # @param whole_sentence [Boolean] Whether multiple words should be searched
     #   for separately.
@@ -463,13 +498,31 @@ module Wgit
       orig_text
     end
+    # Extracts a value/object from this Document's @html using the given xpath
+    # parameter.
+    #
+    # @param xpath [String, #call] Used to find the value/object in @html.
+    # @param singleton [Boolean] singleton ? results.first (single Nokogiri
+    #   Object) : results (Array).
+    # @param text_content_only [Boolean] text_content_only ? result.content
+    #   (String) : result (Nokogiri Object).
+    # @return [String, Object] The value found in the html or the default value
+    #   (singleton ? nil : []).
+    def extract(xpath, singleton: true, text_content_only: true)
+      send(
+        :extract_from_html, xpath,
+        singleton: singleton, text_content_only: text_content_only
+      )
+    end
     protected
     # Initializes the nokogiri object using @html, which cannot be nil.
     # Override this method to custom configure the Nokogiri object returned.
     # Gets called from Wgit::Document.new upon initialization.
     #
-    # @yield [config] The given block is passed to Nokogiri::HTML for initialisation.
+    # @yield [config] The given block is passed to Nokogiri::HTML for
+    #   initialisation.
     # @raise [StandardError] If @html isn't set.
     # @return [Nokogiri::HTML] The initialised Nokogiri HTML object.
     def init_nokogiri(&block)
@@ -481,7 +534,7 @@ module Wgit
     # Extracts a value/object from this Document's @html using the given xpath
     # parameter.
     #
-    # @param xpath [String] Used to find the value/object in @html.
+    # @param xpath [String, #call] Used to find the value/object in @html.
     # @param singleton [Boolean] singleton ? results.first (single Nokogiri
     #   Object) : results (Array).
     # @param text_content_only [Boolean] text_content_only ? result.content
@@ -497,23 +550,16 @@ module Wgit
     #   the block's `value` param unchanged if you simply want to inspect it.
     # @return [String, Object] The value found in the html or the default value
     #   (singleton ? nil : []).
-    def find_in_html(xpath, singleton: true, text_content_only: true)
-      default = singleton ? nil : []
-      xpath   = xpath.call if xpath.respond_to?(:call)
-      results = @doc.xpath(xpath)
-      return default if results.nil? || results.empty?
+    def extract_from_html(xpath, singleton: true, text_content_only: true)
+      xpath  = xpath.call if xpath.respond_to?(:call)
+      result = singleton ? @parser.at_xpath(xpath) : @parser.xpath(xpath)
-      result = if singleton
-                 text_content_only ? results.first.content : results.first
-               else
-                 text_content_only ? results.map(&:content) : results
-               end
-      singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
+      if text_content_only
+        result = singleton ? result&.content : result.map(&:content)
+      end
+      Wgit::Utils.sanitize(result)
       result = yield(result, self, :document) if block_given?
       result
     end
@@ -533,16 +579,14 @@ module Wgit
     #   the block's `value` param unchanged if you simply want to inspect it.
     # @return [String, Object] The value found in the obj or the default value
     #   (singleton ? nil : []).
-    def find_in_object(obj, key, singleton: true)
+    def extract_from_object(obj, key, singleton: true)
       assert_respond_to(obj, :fetch)
       default = singleton ? nil : []
       result  = obj.fetch(key.to_s, default)
-      singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
+      Wgit::Utils.sanitize(result)
       result = yield(result, obj, :object) if block_given?
       result
     end
@@ -556,12 +600,12 @@ module Wgit
       url = Wgit::Url.parse(url)
       url.crawled = true unless url.crawled? # Avoid overriding date_crawled.
-      @url   = url
-      @html  = html || ''
-      @doc   = init_nokogiri
-      @score = 0.0
+      @url    = url
+      @html   = html || ''
+      @parser = init_nokogiri
+      @score  = 0.0
-      Wgit::Utils.process_str(@html, encode: encode)
+      Wgit::Utils.sanitize(@html, encode: encode)
       # Dynamically run the init_*_from_html methods.
       Document.private_instance_methods(false).each do |method|
@@ -577,12 +621,12 @@ module Wgit
     def init_from_object(obj, encode: true)
       assert_respond_to(obj, :fetch)
-      @url   = Wgit::Url.new(obj.fetch('url')) # Should always be present.
-      @html  = obj.fetch('html', '')
-      @doc   = init_nokogiri
-      @score = obj.fetch('score', 0.0)
+      @url    = Wgit::Url.new(obj.fetch('url')) # Should always be present.
+      @html   = obj.fetch('html', '')
+      @parser = init_nokogiri
+      @score  = obj.fetch('score', 0.0)
-      Wgit::Utils.process_str(@html, encode: encode)
+      Wgit::Utils.sanitize(@html, encode: encode)
       # Dynamically run the init_*_from_object methods.
       Document.private_instance_methods(false).each do |method|
@@ -593,11 +637,11 @@ module Wgit
       end
     end
-    # Initialises an instance variable and defines a getter method for it.
+    # Initialises an instance variable and defines an accessor method for it.
     #
     # @param var [Symbol] The name of the variable to be initialized.
     # @param value [Object] The newly initialized variable's value.
-    # @return [Symbol] The name of the newly created getter method.
+    # @return [Symbol] The name of the defined getter method.
     def init_var(var, value)
       # instance_var_name starts with @, var_name doesn't.
       var = var.to_s
@@ -605,10 +649,9 @@ module Wgit
       instance_var_name = "@#{var_name}".to_sym
       instance_variable_set(instance_var_name, value)
+      Wgit::Document.attr_accessor(var_name)
-      Document.send(:define_method, var_name) do
-        instance_variable_get(instance_var_name)
-      end
+      var_name
     end
     alias content                html