wgit 0.10.7 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +44 -1
- data/CONTRIBUTING.md +1 -1
- data/README.md +22 -2
- data/bin/wgit +3 -1
- data/lib/wgit/assertable.rb +2 -2
- data/lib/wgit/crawler.rb +56 -34
- data/lib/wgit/database/database.rb +64 -52
- data/lib/wgit/document.rb +67 -39
- data/lib/wgit/document_extractors.rb +15 -1
- data/lib/wgit/dsl.rb +16 -20
- data/lib/wgit/indexer.rb +157 -63
- data/lib/wgit/logger.rb +1 -1
- data/lib/wgit/response.rb +21 -6
- data/lib/wgit/robots_parser.rb +193 -0
- data/lib/wgit/url.rb +118 -51
- data/lib/wgit/utils.rb +81 -28
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +1 -0
- metadata +33 -38
data/lib/wgit/document.rb
CHANGED
@@ -3,7 +3,6 @@ require_relative 'utils'
 require_relative 'assertable'
 require 'nokogiri'
 require 'json'
-require 'set'
 
 module Wgit
   # Class modeling/serialising a HTML web document, although other MIME types
@@ -18,9 +17,9 @@ module Wgit
     include Assertable
 
     # Regex for the allowed var names when defining an extractor.
-    REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/.freeze
+    REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/
 
-    # Set of text elements used to build Document#text.
+    # Set of text elements used to build the xpath for Document#text.
     @text_elements = Set.new(%i[
       a abbr address article aside b bdi bdo blockquote button caption cite
       code data dd del details dfn div dl dt em figcaption figure footer h1 h2
@@ -29,6 +28,13 @@ module Wgit
       summary sup td textarea th time u ul var wbr
     ])
 
+    # Instance vars to be ignored by Document#to_h and in turn Model.document.
+    @to_h_ignore_vars = [
+      '@parser', # Always ignore the Nokogiri object.
+      '@meta_robots', # Used by #no_index?, ignore.
+      '@meta_wgit' # Used by #no_index?, ignore.
+    ]
+
     # Set of Symbols representing the defined Document extractors.
     @extractors = Set.new
 
@@ -38,6 +44,12 @@ module Wgit
       # README.md for how to add to this Set dynamically.
       attr_reader :text_elements
 
+      # Array of instance vars to ignore when Document#to_h and in turn
+      # Model.document methods are called. Append your own defined extractor
+      # vars to omit them from the model (database object) when indexing.
+      # Each var should be a String starting with an '@' char e.g. "@data" etc.
+      attr_reader :to_h_ignore_vars
+
       # Set of Symbols representing the defined Document extractors. Is
       # read-only. Use Wgit::Document.define_extractor for a new extractor.
       attr_reader :extractors
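
As the comment above describes, user-defined extractor vars can be appended to this array so they never reach the database model. A minimal sketch (the :price extractor and its xpath are hypothetical):

  require 'wgit'

  # Hypothetical extractor whose value we don't want persisted when indexing.
  Wgit::Document.define_extractor(:price, '//meta[@name="price"]/@content')

  # Omit @price from Document#to_h (and therefore from Model.document).
  Wgit::Document.to_h_ignore_vars << '@price'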
@@ -76,9 +88,9 @@ module Wgit
     # false if the Document content is an image etc.
     def initialize(url_or_obj, html = '', encode: true)
       if url_or_obj.is_a?(String)
-        init_from_strings(url_or_obj, html, encode: encode)
+        init_from_strings(url_or_obj, html, encode:)
       else
-        init_from_object(url_or_obj, encode: encode)
+        init_from_object(url_or_obj, encode:)
       end
     end
 
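
Most of the one-line call-site changes in this release are the Ruby 3.1 keyword-argument shorthand: `encode:` with no value passes the local variable (or method) named `encode`. A standalone illustration, unrelated to the wgit API:

  def greet(name:, shout: false)
    msg = "Hello, #{name}"
    shout ? msg.upcase : msg
  end

  name  = 'wgit'
  shout = true

  greet(name: name, shout: shout) # explicit style, as removed above
  greet(name:, shout:)            # Ruby 3.1+ shorthand, as added above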
@@ -89,9 +101,9 @@ module Wgit
     #
     # @return [String] An xpath String to obtain a webpage's text elements.
     def self.text_elements_xpath
-      @text_elements.each_with_index.inject('') do |xpath, (el, i)|
+      @text_elements.each_with_index.reduce('') do |xpath, (el, i)|
         xpath += ' | ' unless i.zero?
-        xpath += format('//%s/text()', el)
+        xpath + format('//%s/text()', el)
       end
     end
 
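
The reducer joins every entry of Document.text_elements into one union selector. A quick way to inspect the generated xpath:

  require 'wgit'

  # Prints something like "//a/text() | //abbr/text() | //address/text() | ..."
  # covering each element in Wgit::Document.text_elements.
  puts Wgit::Document.text_elements_xpath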
@@ -192,13 +204,27 @@ module Wgit
       Document.send(:remove_method, "init_#{var}_from_object")
 
       @extractors.delete(var.to_sym)
+
       true
     rescue NameError
       false
     end
 
+    # Removes all default and defined extractors by calling
+    # `Document.remove_extractor` underneath. See its documentation.
+    def self.remove_extractors
+      @extractors.each { |var| remove_extractor(var) }
+    end
+
     ### Document Instance Methods ###
 
+    # Overrides String#inspect to shorten the printed output of a Document.
+    #
+    # @return [String] A short textual representation of this Document.
+    def inspect
+      "#<Wgit::Document url=\"#{@url}\" html_size=#{size}>"
+    end
+
     # Determines if both the url and html match. Use
     # doc.object_id == other.object_id for exact object comparison.
     #
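
A short sketch of the two new methods above, assuming the default extractors are still loaded:

  require 'wgit'

  doc = Wgit::Document.new('http://example.com', '<html><p>Hi</p></html>')
  puts doc.inspect # => #<Wgit::Document url="http://example.com" html_size=...>

  # Drop every default and user-defined extractor, e.g. in a test teardown.
  Wgit::Document.remove_extractors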
@@ -227,10 +253,10 @@ module Wgit
     # Provide the `link:` parameter to get the correct base URL for that type
     # of link. For example, a link of `#top` would always return @url because
     # it applies to that page, not a different one. Query strings work in the
-    # same way. Use this parameter if manually concatting Url's e.g.
+    # same way. Use this parameter if manually joining Url's e.g.
     #
     #   relative_link = Wgit::Url.new('?q=hello')
-    #   absolute_link = doc.base_url(link: relative_link).concat(relative_link)
+    #   absolute_link = doc.base_url(link: relative_link).join(relative_link)
     #
     # This is similar to how Wgit::Document#internal_absolute_links works.
     #
@@ -250,7 +276,7 @@ module Wgit
 be relative"
       end
 
-      get_base = -> { @base.relative? ? @url.to_origin.concat(@base) : @base }
+      get_base = -> { @base.relative? ? @url.to_origin.join(@base) : @base }
 
       if link
         link = Wgit::Url.new(link)
@@ -274,11 +300,11 @@ be relative"
     # returned Hash.
     # @return [Hash] Containing self's instance vars.
     def to_h(include_html: false, include_score: true)
-      ignore = include_html ? [] : ['@html']
+      ignore = Wgit::Document.to_h_ignore_vars.dup
+      ignore << '@html' unless include_html
       ignore << '@score' unless include_score
-      ignore << '@parser' # Always ignore the Nokogiri object.
 
-      Wgit::Utils.to_h(self, ignore: ignore)
+      Wgit::Utils.to_h(self, ignore:)
     end
 
     # Converts this Document's #to_h return value to a JSON String.
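
With the new ignore list, the serialised Hash now leaves out @parser, @meta_robots and @meta_wgit as well as (by default) @html. A rough sketch, assuming the default extractors:

  doc = Wgit::Document.new('http://example.com', '<html><title>Hi</title></html>')

  doc.to_h.keys                     # no html key by default (and never @parser etc.)
  doc.to_h(include_html: true).keys # the html key is included this time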
@@ -287,7 +313,7 @@ be relative"
     # returned JSON String.
     # @return [String] This Document represented as a JSON String.
     def to_json(include_html: false)
-      h = to_h(include_html: include_html)
+      h = to_h(include_html:)
       JSON.generate(h)
     end
 
@@ -309,7 +335,7 @@ be relative"
         else
           next unless instance_variable_get(var).respond_to?(:length)
 
-          hash[var[1..-1].to_sym] = instance_variable_get(var).send(:length)
+          hash[var[1..].to_sym] = instance_variable_get(var).send(:length)
         end
       end
 
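
The `var[1..]` changes here and in #init_var are just the endless-range spelling of stripping the leading '@' character:

  var = '@title'
  var[1..-1] # => "title" (removed form)
  var[1..]   # => "title" (added form, Ruby 2.6+ endless range)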
@@ -417,7 +443,6 @@ be relative"
                 end
               end
               .reject { |link| link.relative?(host: @url.to_origin) }
-              .map(&:omit_trailing_slash)
 
       Wgit::Utils.sanitize(links)
     end
@@ -493,10 +518,7 @@ be relative"
         query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
       )
       orig_text = @text
-      @text = search(
-        query, case_sensitive: case_sensitive,
-        whole_sentence: whole_sentence, sentence_limit: sentence_limit
-      )
+      @text = search(query, case_sensitive:, whole_sentence:, sentence_limit:)
 
       orig_text
     end
@@ -519,11 +541,17 @@ be relative"
     # @return [String, Object] The value found in the html or the default value
     # (singleton ? nil : []).
     def extract(xpath, singleton: true, text_content_only: true, &block)
-      send(
-        :extract_from_html, xpath,
-        singleton: singleton, text_content_only: text_content_only,
-        &block
-      )
+      send(:extract_from_html, xpath, singleton:, text_content_only:, &block)
+    end
+
+    # Works with the default extractors to extract and check the HTML meta tags
+    # instructing Wgit not to index this document (save it to a Database). If
+    # the default extractors are removed, this method will always return false.
+    #
+    # @return [Boolean] True if this document shouldn't be saved to a Database,
+    # false otherwise.
+    def no_index?
+      [@meta_robots, @meta_wgit].include?('noindex')
     end
 
     protected
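
A minimal sketch of the new #no_index? behaviour, assuming the default extractors are loaded so the robots meta tag gets picked up:

  require 'wgit'

  html = <<~HTML
    <html>
      <head><meta name="robots" content="noindex"></head>
      <body>Not for the database</body>
    </html>
  HTML

  doc = Wgit::Document.new('http://example.com/private', html)
  doc.no_index? # => true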
@@ -567,7 +595,7 @@ be relative"
         result = singleton ? result.content : result.map(&:content)
       end
 
-      Wgit::Utils.sanitize(result)
+      result = Wgit::Utils.sanitize(result)
       result = yield(result, self, :document) if block_given?
       result
     end
@@ -594,7 +622,7 @@ be relative"
       default = singleton ? nil : []
       result = obj.fetch(key.to_s, default)
 
-      Wgit::Utils.sanitize(result)
+      result = Wgit::Utils.sanitize(result)
       result = yield(result, obj, :object) if block_given?
       result
     end
@@ -614,13 +642,13 @@ be relative"
       @parser = init_nokogiri
       @score = 0.0
 
-      Wgit::Utils.sanitize(@html, encode: encode)
+      @html = Wgit::Utils.sanitize(@html, encode:)
 
       # Dynamically run the init_*_from_html methods.
       Document.private_instance_methods(false).each do |method|
         if method.to_s.start_with?('init_') &&
-           method.to_s.end_with?('_from_html')
-          send(method)
+           method.to_s.end_with?('_from_html') && method != __method__
+          send(method)
         end
       end
     end
@@ -635,13 +663,13 @@ be relative"
       @parser = init_nokogiri
       @score = obj.fetch('score', 0.0)
 
-      Wgit::Utils.sanitize(@html, encode: encode)
+      @html = Wgit::Utils.sanitize(@html, encode:)
 
       # Dynamically run the init_*_from_object methods.
       Document.private_instance_methods(false).each do |method|
         if method.to_s.start_with?('init_') &&
-           method.to_s.end_with?('_from_object')
-          send(method, obj)
+           method.to_s.end_with?('_from_object') && method != __method__
+          send(method, obj)
         end
       end
     end
@@ -654,7 +682,7 @@ be relative"
     def init_var(var, value)
       # instance_var_name starts with @, var_name doesn't.
       var = var.to_s
-      var_name = (var.start_with?('@') ? var[1..-1] : var).to_sym
+      var_name = (var.start_with?('@') ? var[1..] : var).to_sym
       instance_var_name = "@#{var_name}".to_sym
 
       instance_variable_set(instance_var_name, value)
@@ -663,10 +691,10 @@ be relative"
       var_name
     end
 
-    alias content html
-    alias statistics stats
-    alias internal_urls internal_links
-    alias internal_absolute_urls internal_absolute_links
-    alias external_urls external_links
+    alias_method :content, :html
+    alias_method :statistics, :stats
+    alias_method :internal_urls, :internal_links
+    alias_method :internal_absolute_urls, :internal_absolute_links
+    alias_method :external_urls, :external_links
   end
 end
data/lib/wgit/document_extractors.rb
CHANGED
@@ -2,6 +2,20 @@
 
 ### Default Document Extractors ###
 
+# No index.
+Wgit::Document.define_extractor(
+  :meta_robots,
+  '//meta[@name="robots"]/@content',
+  singleton: true,
+  text_content_only: true
+)
+Wgit::Document.define_extractor(
+  :meta_wgit,
+  '//meta[@name="wgit"]/@content',
+  singleton: true,
+  text_content_only: true
+)
+
 # Base.
 Wgit::Document.define_extractor(
   :base,
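
The two new extractors also define reader methods on the document, which is what #no_index? checks. A sketch using the wgit-specific meta tag:

  doc = Wgit::Document.new(
    'http://example.com',
    '<html><head><meta name="wgit" content="noindex"></head></html>'
  )

  doc.meta_wgit   # => "noindex"
  doc.meta_robots # => nil (no robots meta tag in this page)
  doc.no_index?   # => true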
@@ -45,7 +59,7 @@ Wgit::Document.define_extractor(
 ) do |keywords, _source, type|
   if keywords && (type == :document)
     keywords = keywords.split(',')
-    Wgit::Utils.sanitize(keywords)
+    keywords = Wgit::Utils.sanitize(keywords)
   end
   keywords
 end
data/lib/wgit/dsl.rb
CHANGED
@@ -101,7 +101,7 @@ the 'start' function".freeze
      raise DSL_ERROR__NO_START_URL if urls.empty?
 
      urls.map! { |url| Wgit::Url.parse(url) }
-      crawler.crawl_urls(*urls, follow_redirects: follow_redirects, &block)
+      crawler.crawl_urls(*urls, follow_redirects:, &block)
    end
 
    # Crawls an entire site using `Wgit::Crawler#crawl_site` underneath. If no
@@ -135,9 +135,7 @@ the 'start' function".freeze
      raise DSL_ERROR__NO_START_URL if urls.empty?
 
      xpath = follow || :default
-      opts = {
-        follow: xpath, allow_paths: allow_paths, disallow_paths: disallow_paths
-      }
+      opts = { follow: xpath, allow_paths:, disallow_paths: }
 
      urls.reduce([]) do |externals, url|
        externals + crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)
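
For context, the DSL passes these keyword arguments straight through to Wgit::Crawler#crawl_site. A rough usage sketch (the URL and path pattern are made up):

  require 'wgit'

  include Wgit::DSL

  start 'http://example.com'

  # Only follow links whose path matches the allow pattern.
  crawl_site(allow_paths: 'blog/*') do |doc|
    puts doc.title
  end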
@@ -189,7 +187,7 @@ the 'start' function".freeze
      db = Wgit::Database.new(connection_string)
      indexer = Wgit::Indexer.new(db, crawler)
 
-      indexer.index_www(max_sites: max_sites, max_data: max_data)
+      indexer.index_www(max_sites:, max_data:)
    end
 
    # Indexes a single website using `Wgit::Indexer#index_site` underneath.
@@ -226,8 +224,7 @@ the 'start' function".freeze
      indexer = Wgit::Indexer.new(db, crawler)
      xpath = follow || :default
      crawl_opts = {
-        insert_externals: insert_externals, follow: xpath,
-        allow_paths: allow_paths, disallow_paths: disallow_paths
+        insert_externals:, follow: xpath, allow_paths:, disallow_paths:
      }
 
      urls.reduce(0) do |total, url|
@@ -261,9 +258,11 @@ the 'start' function".freeze
      indexer = Wgit::Indexer.new(db, crawler)
 
      urls.map! { |url| Wgit::Url.parse(url) }
-      indexer.index_urls(*urls, insert_externals: insert_externals, &block)
+      indexer.index_urls(*urls, insert_externals:, &block)
    end
 
+    ### DATABASE METHODS ###
+
    # Performs a search of the database's indexed documents and pretty prints
    # the results in a search engine-esque format. See `Wgit::Database#search!`
    # and `Wgit::Document#search!` for details of how the search works.
@@ -285,7 +284,7 @@ the 'start' function".freeze
    # database containing only its matching `#text`.
    # @return [Array<Wgit::Document>] The search results with matching text.
    def search(
-      query, connection_string: @dsl_conn_str, stream: STDOUT,
+      query, connection_string: @dsl_conn_str, stream: $stdout,
      case_sensitive: false, whole_sentence: true,
      limit: 10, skip: 0, sentence_limit: 80, &block
    )
@@ -294,15 +293,12 @@ the 'start' function".freeze
 
      results = db.search!(
        query,
-        case_sensitive: case_sensitive,
-        whole_sentence: whole_sentence,
-        limit: limit,
-        skip: skip,
-        sentence_limit: sentence_limit,
-        &block
+        case_sensitive:, whole_sentence:,
+        limit:, skip:,
+        sentence_limit:, &block
      )
 
-      Wgit::Utils.printf_search_results(results, stream: stream)
+      Wgit::Utils.pprint_search_results(results, stream:)
 
      results
    end
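
A rough sketch of the DSL search call after these changes; the connection string is a placeholder and the MongoDB instance must already contain indexed documents:

  require 'wgit'

  include Wgit::DSL

  results = search(
    'ruby web crawler',
    connection_string: 'mongodb://localhost:27017', # hypothetical
    limit: 5
  )

  results.each { |doc| puts doc.url }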
@@ -317,9 +313,9 @@ the 'start' function".freeze
      db.clear_db
    end
 
-    alias crawl_url crawl
-    alias crawl_r crawl_site
-    alias index_r index_site
-    alias start_urls start
+    alias_method :crawl_url, :crawl
+    alias_method :crawl_r, :crawl_site
+    alias_method :index_r, :index_site
+    alias_method :start_urls, :start
  end
end
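
The new aliases can be used interchangeably with the methods they point at; a short sketch:

  require 'wgit'

  include Wgit::DSL

  start 'http://example.com'

  # crawl_r is the alias for crawl_site (recursive crawl of the start URL).
  crawl_r { |doc| puts doc.url }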