wgit 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 6956381fcc74e20521f0e219cbfaaa74da79de5bdb24349c2fdf4643ca384a31
- data.tar.gz: a544446aa9333d2001119df37ca929cdf2585f89ed084071e077c460b4ff24c9
+ metadata.gz: d9d045d2dd7f570db1811bafab1ac244103cc359033efd9279323c795a67bb9f
+ data.tar.gz: 996801763a6576ede812e2edd7d201ceb34b2135548a365b748f953e7df40db9
  SHA512:
- metadata.gz: 517665017a25419d9213df10347cd704a98ee0061243ebcd8d482465461a16d5b8319971321703b663ec8d6ef8f453d60d771d2122590b1655a6fc08be461026
- data.tar.gz: 760e1c8b1b5cf385dfb1d0418c3b416cdef7a9e02595b1f729a30179848145cdc3c4fa25e2bacf073779baba9909b20ef9f2c5038c8b9df1437f0ade81e05990
+ metadata.gz: e0dfe907c599c320377464aec927b24700d0e9e17d7bb37b4903715af63cbf031dc5983cd6749b1d90353cbcffc0d71e76eb2a0f8c0ba77b3b03f2d51ca9634f
+ data.tar.gz: bade693ab5b32bf8a16747233356307fe489798855133b8e16b3a907d38f8fd9ecfadadab0273d1c0767106bcf85027c64ebb6f86dc38660240200d5fef07377
data/lib/wgit/crawler.rb CHANGED
@@ -5,6 +5,7 @@ require_relative 'document'
  require_relative 'utils'
  require_relative 'assertable'
  require 'net/http' # Requires 'uri'.
+ require 'benchmark'

  module Wgit
  # The Crawler class provides a means of crawling web based HTTP Wgit::Url's,
@@ -105,7 +106,7 @@ module Wgit
  # Crawl the url returning the response Wgit::Document or nil if an error
  # occurs.
  #
- # @param url [Wgit::Url] The Url to crawl.
+ # @param url [Wgit::Url] The Url to crawl; which will likely be modified.
  # @param follow_external_redirects [Boolean] Whether or not to follow
  # an external redirect. External meaning to a different host. False will
  # return nil for such a crawl. If false, you must also provide a `host:`
@@ -123,16 +124,14 @@ module Wgit
  # A String url isn't allowed because it's passed by value not reference,
  # meaning a redirect isn't reflected; A Wgit::Url is passed by reference.
  assert_type(url, Wgit::Url)
+ raise 'host cannot be nil if follow_external_redirects is false' \
  if !follow_external_redirects && host.nil?
- raise 'host cannot be nil if follow_external_redirects is false'
- end

  html = fetch(
  url,
  follow_external_redirects: follow_external_redirects,
  host: host
  )
- url.crawled = true

  doc = Wgit::Document.new(url, html)
  yield(doc) if block_given?
@@ -147,7 +146,8 @@ module Wgit
  # HTTP response that doesn't return a HTML body will be ignored and nil
  # will be returned; otherwise, the HTML String is returned.
  #
- # @param url [Wgit::Url] The URL to fetch the HTML for.
+ # @param url [Wgit::Url] The URL to fetch the HTML for. This Url object
+ # will likely be modified as a result of the fetch/crawl.
  # @param follow_external_redirects [Boolean] Whether or not to follow
  # an external redirect. False will return nil for such a crawl. If false,
  # you must also provide a `host:` parameter.
@@ -159,19 +159,26 @@ module Wgit
  # @return [String, nil] The crawled HTML or nil if the crawl was
  # unsuccessful.
  def fetch(url, follow_external_redirects: true, host: nil)
- response = resolve(
- url,
- follow_external_redirects: follow_external_redirects,
- host: host
- )
- @last_response = response
+ crawl_duration = nil
+ response = nil
+
+ crawl_duration = Benchmark.measure do
+ response = resolve(
+ url,
+ follow_external_redirects: follow_external_redirects,
+ host: host
+ )
+ end.real

  response.body.empty? ? nil : response.body
  rescue StandardError => e
  Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e.message}")
- @last_response = nil

  nil
+ ensure
+ url.crawled = true # Also sets date_crawled underneath.
+ url.crawl_duration = crawl_duration
+ @last_response = response
  end

  # The resolve method performs a HTTP GET to obtain the HTML response. The
@@ -204,10 +211,9 @@ module Wgit

  yield(url, response, location) if block_given?

+ raise "External redirect not allowed - Redirected to: \
+ '#{location}', which is outside of host: '#{host}'" \
  if !follow_external_redirects && !location.is_relative?(host: host)
- raise "External redirect not allowed - Redirected to: \
- '#{location}', which is outside of host: '#{host}'"
- end

  raise "Too many redirects: #{redirect_count}" \
  if redirect_count >= @redirect_limit
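
Taken together, the crawler changes wrap the resolve call in Benchmark.measure and record the crawl state on the Url from fetch's ensure block, so crawled, date_crawled and crawl_duration are set even when the request raises. A minimal usage sketch; the URL is illustrative and `crawl_url` is assumed to be the public crawl method (its `def` line isn't part of this diff):

    require 'wgit'

    crawler = Wgit::Crawler.new
    url     = Wgit::Url.new('https://example.com') # Illustrative URL.

    # Assumed public entry point; it calls fetch, whose ensure block records
    # the crawl state even if an error occurred.
    doc = crawler.crawl_url(url)

    url.crawled         # => true
    url.date_crawled    # => Time stamp, set by the custom crawled= writer.
    url.crawl_duration  # => Float seconds, from Benchmark.measure { ... }.real.
    doc&.crawl_duration # => Same value, delegated to the Url by the Document.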
@@ -235,7 +241,7 @@ module Wgit
  .uniq
  .reject do |link|
  ext = link.to_extension
- ext ? !%w[htm html].include?(ext) : false
+ ext ? !%w[htm html].include?(ext.downcase) : false
  end
  end

@@ -64,7 +64,9 @@ module Wgit
  # Wgit::Model.document.
  # @raise [StandardError] If data isn't valid.
  def insert(data)
+ data = data.dup # Avoid modifying by reference.
  type = data.is_a?(Enumerable) ? data.first : data
+
  case type
  when Wgit::Url
  insert_urls(data)
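
Both insert and update (below) now dup their argument first; per the inline comment, this avoids modifying the caller's object by reference. A hedged sketch, assuming a reachable database behind the connection string:

    require 'wgit'

    db  = Wgit::Database.new(ENV['WGIT_CONNECTION_STRING'])
    url = Wgit::Url.new('https://example.com', crawled: true) # Illustrative URL.

    db.insert(url) # Operates on a dup, so the url object passed in is left untouched.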
@@ -146,7 +148,7 @@ module Wgit
  def search(
  query, case_sensitive: false, whole_sentence: false, limit: 10, skip: 0
  )
- query.strip!
+ query = query.to_s.strip
  query.replace('"' + query + '"') if whole_sentence

  # Sort based on the most search hits (aka "textScore").
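
search now normalises a copy of the query with `query.to_s.strip` instead of mutating the caller's string via `strip!`, so frozen string literals (and non-String queries) are handled safely. A hedged usage sketch, again assuming a reachable database:

    require 'wgit'

    db = Wgit::Database.new(ENV['WGIT_CONNECTION_STRING'])

    # A frozen literal is fine now that search no longer calls strip! on it.
    results = db.search('  ruby web crawler  ', whole_sentence: true, limit: 5)
    results.each { |doc| puts doc.title }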
@@ -232,6 +234,8 @@ module Wgit
  # @param data [Wgit::Url, Wgit::Document] The data to update.
  # @raise [StandardError] If the data is not valid.
  def update(data)
+ data = data.dup # Avoid modifying by reference.
+
  case data
  when Wgit::Url
  update_url(data)
@@ -3,43 +3,47 @@
  require_relative '../utils'

  module Wgit
- # Module used to build the database collection objects.
+ # Module used to build the database collection objects, forming a data model.
  module Model
- # The data model for a Wgit::Url.
+ # The data model for a Wgit::Url collection object and for an embedded
+ # 'url' inside a Wgit::Document collection object.
  #
- # @param url [Wgit::Url] The Url DB record.
+ # @param url [Wgit::Url] The Url data object.
  # @return [Hash] The URL model ready for DB insertion.
  def self.url(url)
  raise 'url must respond_to? :to_h' unless url.respond_to?(:to_h)

  model = url.to_h
+
  Wgit::Utils.remove_non_bson_types(model)
  end

- # The data model for a Wgit::Document.
+ # The data model for a Wgit::Document collection object.
  #
- # @param doc [Wgit::Document] The Document DB record.
+ # @param doc [Wgit::Document] The Document data object.
  # @return [Hash] The Document model ready for DB insertion.
  def self.document(doc)
  raise 'doc must respond_to? :to_h' unless doc.respond_to?(:to_h)

- model = doc.to_h(include_html: false)
+ model = doc.to_h(include_html: false, include_score: false)
+ model['url'] = self.url(doc.url) # Expand Url String into full object.
+
  Wgit::Utils.remove_non_bson_types(model)
  end

  # Common fields when inserting a record into the DB.
  #
- # @return [Hash] Containing common insertion fields for all models.
+ # @return [Hash] Insertion fields common to all models.
  def self.common_insert_data
  {
- date_added: Wgit::Utils.time_stamp,
+ date_added: Wgit::Utils.time_stamp,
  date_modified: Wgit::Utils.time_stamp
  }
  end

  # Common fields when updating a record in the DB.
  #
- # @return [Hash] Containing common update fields for all models.
+ # @return [Hash] Update fields common to all models.
  def self.common_update_data
  {
  date_modified: Wgit::Utils.time_stamp
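
Wgit::Model.document now drops the transient score and embeds a full url sub-model (built by Wgit::Model.url, so it carries crawled, date_crawled and crawl_duration) instead of just the URL String. A rough sketch with illustrative values:

    require 'wgit'

    url = Wgit::Url.new('https://example.com', crawled: true, crawl_duration: 0.3)
    doc = Wgit::Document.new(url, '<html><p>Hello world</p></html>')

    model = Wgit::Model.document(doc)
    model['url'] # => Hash produced by Wgit::Model.url, not just the URL String.
    # The model contains neither the raw HTML nor the volatile @score value.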
data/lib/wgit/document.rb CHANGED
@@ -87,7 +87,7 @@ module Wgit
  xpath
  end

- # Defines an extension, which is a way to extract HTML elements into
+ # Defines an extension, which is a way to serialise HTML elements into
  # instance variables upon Document initialization. See the default
  # extensions defined in 'document_extensions.rb' as examples.
  #
@@ -189,13 +189,20 @@ module Wgit
  @html[range]
  end

- # Returns the timestamp of when this Wgit::Document was crawled.
+ # Returns the timestamp of when this Document was crawled.
  #
- # @return [Time] Time of when this Wgit::Document was crawled.
+ # @return [Time] Time of when this Document was crawled.
  def date_crawled
  @url.date_crawled
  end

+ # Returns the duration of the crawl for this Document (in seconds).
+ #
+ # @return [Float] The duration of the crawl for this Document.
+ def crawl_duration
+ @url.crawl_duration
+ end
+
  # Returns the base URL of this Wgit::Document. The base URL is either the
  # <base> element's href value or @url (if @base is nil). If @base is
  # present and relative, then @url.to_base + @base is returned. This method
@@ -240,8 +247,9 @@ module Wgit
  # @param include_html [Boolean] Whether or not to include @html in the
  # returned Hash.
  # @return [Hash] Containing self's instance vars.
- def to_h(include_html: false)
+ def to_h(include_html: false, include_score: true)
  ignore = include_html ? [] : ['@html']
+ ignore << '@score' unless include_score
  ignore << '@doc' # Always ignore Nokogiri @doc.

  Wgit::Utils.to_h(self, ignore: ignore)
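
to_h gains an include_score flag defaulting to true, so existing callers keep the old behaviour while the DB model can opt out of persisting the volatile search score. For example, with `doc` being any Wgit::Document:

    doc.to_h                        # Ignores @html, keeps @score (old behaviour).
    doc.to_h(include_html: true)    # Serialises the HTML too.
    doc.to_h(include_score: false)  # Drops @score, as Wgit::Model.document now does.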
@@ -525,7 +533,10 @@ module Wgit
  assert_types(html, [String, NilClass])

  # We already know url.is_a?(String) so parse into Url unless already so.
- @url = Wgit::Url.parse(url)
+ url = Wgit::Url.parse(url)
+ url.crawled = true unless url.crawled # Avoid overriding date_crawled.
+
+ @url = url
  @html = html || ''
  @doc = init_nokogiri
  @score = 0.0
@@ -49,7 +49,7 @@ Wgit::Document.define_extension(
  singleton: false,
  text_content_only: true
  ) do |links|
- links&.map! { |link| Wgit::Url.new(link) }
+ links.map! { |link| Wgit::Url.new(link) }
  end

  # Text.
data/lib/wgit/indexer.rb CHANGED
@@ -12,6 +12,8 @@ module Wgit
  # external url's to be crawled later on. Logs info on the crawl
  # using Wgit.logger as it goes along.
  #
+ # @param connection_string [String] The database connection string. Set as
+ # nil to use ENV['WGIT_CONNECTION_STRING'].
  # @param max_sites [Integer] The number of separate and whole
  # websites to be crawled before the method exits. Defaults to -1 which
  # means the crawl will occur until manually stopped (Ctrl+C etc).
@@ -19,8 +21,10 @@ module Wgit
  # scraped from the web (default is 1GB). Note, that this value is used to
  # determine when to stop crawling; it's not a guarantee of the max data
  # that will be obtained.
- def self.index_www(max_sites: -1, max_data: 1_048_576_000)
- db = Wgit::Database.new
+ def self.index_www(
+ connection_string: nil, max_sites: -1, max_data: 1_048_576_000
+ )
+ db = Wgit::Database.new(connection_string)
  indexer = Wgit::Indexer.new(db)
  indexer.index_www(max_sites: max_sites, max_data: max_data)
  end
@@ -32,14 +36,18 @@ module Wgit
  # There is no max download limit so be careful which sites you index.
  #
  # @param url [Wgit::Url, String] The base Url of the website to crawl.
+ # @param connection_string [String] The database connection string. Set as
+ # nil to use ENV['WGIT_CONNECTION_STRING'].
  # @param insert_externals [Boolean] Whether or not to insert the website's
  # external Url's into the database.
  # @yield [doc] Given the Wgit::Document of each crawled webpage, before it's
  # inserted into the database allowing for prior manipulation.
  # @return [Integer] The total number of pages crawled within the website.
- def self.index_site(url, insert_externals: true, &block)
+ def self.index_site(
+ url, connection_string: nil, insert_externals: true, &block
+ )
  url = Wgit::Url.parse(url)
- db = Wgit::Database.new
+ db = Wgit::Database.new(connection_string)
  indexer = Wgit::Indexer.new(db)
  indexer.index_site(url, insert_externals: insert_externals, &block)
  end
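
Each of these module-level index helpers now accepts a connection_string: keyword rather than relying solely on the environment variable. A hedged example; the site and connection string are placeholders:

    require 'wgit'

    # Passing nil (the default) falls back to ENV['WGIT_CONNECTION_STRING'].
    Wgit.index_site(
      'https://example.com',                               # Illustrative site.
      connection_string: 'mongodb://localhost:27017/wgit', # Placeholder.
      insert_externals: false
    ) do |doc|
      puts doc.title # Inspect each page before it's inserted into the database.
    end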
@@ -51,13 +59,17 @@ module Wgit
  # There is no max download limit so be careful of large pages.
  #
  # @param url [Wgit::Url, String] The Url of the webpage to crawl.
+ # @param connection_string [String] The database connection string. Set as
+ # nil to use ENV['WGIT_CONNECTION_STRING'].
  # @param insert_externals [Boolean] Whether or not to insert the website's
  # external Url's into the database.
  # @yield [doc] Given the Wgit::Document of the crawled webpage, before it's
  # inserted into the database allowing for prior manipulation.
- def self.index_page(url, insert_externals: true, &block)
+ def self.index_page(
+ url, connection_string: nil, insert_externals: true, &block
+ )
  url = Wgit::Url.parse(url)
- db = Wgit::Database.new
+ db = Wgit::Database.new(connection_string)
  indexer = Wgit::Indexer.new(db)
  indexer.index_page(url, insert_externals: insert_externals, &block)
  end
@@ -67,6 +79,8 @@ module Wgit
  # details of how the search works.
  #
  # @param query [String] The text query to search with.
+ # @param connection_string [String] The database connection string. Set as
+ # nil to use ENV['WGIT_CONNECTION_STRING'].
  # @param case_sensitive [Boolean] Whether character case must match.
  # @param whole_sentence [Boolean] Whether multiple words should be searched
  # for separately.
@@ -76,11 +90,20 @@ module Wgit
  # snippet.
  # @yield [doc] Given each search result (Wgit::Document) returned from the
  # database.
- def self.indexed_search(query, case_sensitive: false, whole_sentence: false,
- limit: 10, skip: 0, sentence_limit: 80, &block)
- results = Wgit::Database.new.search(
- query, case_sensitive: case_sensitive, whole_sentence: whole_sentence,
- limit: limit, skip: skip, &block
+ def self.indexed_search(
+ query, connection_string: nil,
+ case_sensitive: false, whole_sentence: false,
+ limit: 10, skip: 0, sentence_limit: 80, &block
+ )
+ db = Wgit::Database.new(connection_string)
+
+ results = db.search(
+ query,
+ case_sensitive: case_sensitive,
+ whole_sentence: whole_sentence,
+ limit: limit,
+ skip: skip,
+ &block
  )

  results.each do |doc|
@@ -88,7 +111,8 @@ module Wgit
  query,
  case_sensitive: case_sensitive,
  whole_sentence: whole_sentence,
- sentence_limit: sentence_limit)
+ sentence_limit: sentence_limit
+ )
  end

  Wgit::Utils.printf_search_results(results)
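
indexed_search now follows the same pattern: build a Database from the (optional) connection string, run the search, post-process each result against the query, then print everything via Wgit::Utils.printf_search_results. A small sketch; the query is illustrative and a reachable database is assumed:

    require 'wgit'

    # A nil connection_string (the default) falls back to ENV['WGIT_CONNECTION_STRING'].
    Wgit.indexed_search('ruby crawler', whole_sentence: true, limit: 5, sentence_limit: 100)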
@@ -137,6 +161,7 @@ runs out of urls to crawl (which might be never).")

  if uncrawled_urls.empty?
  Wgit.logger.info('No urls to crawl, exiting.')
+
  return
  end
  Wgit.logger.info("Starting crawl loop for: #{uncrawled_urls}")
@@ -148,6 +173,7 @@ runs out of urls to crawl (which might be never).")
  unless keep_crawling?(site_count, max_sites, max_data)
  Wgit.logger.info("Reached max number of sites to crawl or \
  database capacity, exiting.")
+
  return
  end
  site_count += 1
data/lib/wgit/url.rb CHANGED
@@ -15,12 +15,15 @@ module Wgit
  include Assertable

  # Whether or not the Url has been crawled or not. A custom crawled= method
- # is also provided by this class.
- attr_reader :crawled
+ # is provided by this class, overridding the default one.
+ attr_accessor :crawled

- # The Time which the Url was crawled.
+ # The Time stamp of when this Url was crawled.
  attr_accessor :date_crawled

+ # The duration of the crawl for this Url (in seconds).
+ attr_accessor :crawl_duration
+
  # Initializes a new instance of Wgit::Url which represents a web based
  # HTTP URL.
  #
@@ -32,8 +35,12 @@ module Wgit
  # @param date_crawled [Time] Should only be provided if crawled is true. A
  # suitable object can be returned from Wgit::Utils.time_stamp. Only used
  # if url_or_obj is a String.
+ # @param crawl_duration [Float] Should only be provided if crawled is true.
+ # The duration of the crawl for this Url (in seconds).
  # @raise [StandardError] If url_or_obj is an Object with missing methods.
- def initialize(url_or_obj, crawled: false, date_crawled: nil)
+ def initialize(
+ url_or_obj, crawled: false, date_crawled: nil, crawl_duration: nil
+ )
  # Init from a URL String.
  if url_or_obj.is_a?(String)
  url = url_or_obj.to_s
@@ -42,14 +49,16 @@ module Wgit
  obj = url_or_obj
  assert_respond_to(obj, :fetch)

- url = obj.fetch('url') # Should always be present.
- crawled = obj.fetch('crawled', false)
- date_crawled = obj.fetch('date_crawled', nil)
+ url = obj.fetch('url') # Should always be present.
+ crawled = obj.fetch('crawled', false)
+ date_crawled = obj.fetch('date_crawled', nil)
+ crawl_duration = obj.fetch('crawl_duration', nil)
  end

- @uri = Addressable::URI.parse(url)
- @crawled = crawled
- @date_crawled = date_crawled
+ @uri = Addressable::URI.parse(url)
+ @crawled = crawled
+ @date_crawled = date_crawled
+ @crawl_duration = crawl_duration

  super(url)
  end
@@ -77,14 +86,16 @@ module Wgit
  obj.is_a?(Wgit::Url) ? obj : new(obj)
  end

- # Sets the @crawled instance var, also setting @date_crawled to the
- # current time or nil (depending on the bool value).
+ # Sets the @crawled instance var, also setting @date_crawled for
+ # convenience.
  #
- # @param bool [Boolean] True if self has been crawled, false otherwise.
- # @return [Time, NilClass] Returns the date crawled, if set.
+ # @param bool [Boolean] True if this Url has been crawled, false otherwise.
+ # @return [Boolean] The value of bool having been set.
  def crawled=(bool)
- @crawled = bool
+ @crawled = bool
  @date_crawled = bool ? Wgit::Utils.time_stamp : nil
+
+ bool
  end

  # Overrides String#replace setting the new_url @uri and String value.
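
A Url now carries a crawl_duration alongside crawled and date_crawled, settable via keyword args or from a DB-style object (obj.fetch('crawl_duration', nil)). A small sketch with illustrative values:

    require 'wgit'

    url = Wgit::Url.new(
      'https://example.com',                # Illustrative URL.
      crawled: true,
      date_crawled: Wgit::Utils.time_stamp,
      crawl_duration: 1.5                   # Seconds.
    )

    url.crawl_duration  # => 1.5
    url.crawled = false # Custom writer: also resets @date_crawled to nil.
    url.date_crawled    # => nil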
data/lib/wgit/utils.rb CHANGED
@@ -120,20 +120,19 @@ module Wgit
  # outputted to the stream.
  # @param stream [#puts] Any object that respond_to?(:puts). It is used
  # to output text somewhere e.g. a file or STDOUT.
- # @return [NilClass] Returns nil.
  def self.printf_search_results(results, keyword_limit: 5, stream: STDOUT)
  raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)

  results.each do |doc|
- title = (doc.title || '<no title>')
- missing_keywords = (doc.keywords.nil? || doc.keywords.empty?)
- keywords = missing_keywords ? nil : doc.keywords.take(keyword_limit)
+ title = (doc.title || '<no title>')
+ keywords = doc.keywords&.take(keyword_limit)&.join(', ')
  sentence = doc.text.first
+ url = doc.url

  stream.puts title
- stream.puts keywords.join(', ') if keywords
- stream.puts sentence if sentence
- stream.puts doc.url
+ stream.puts keywords if keywords
+ stream.puts sentence
+ stream.puts url
  stream.puts
  end

data/lib/wgit/version.rb CHANGED
@@ -5,5 +5,5 @@
  # @author Michael Telford
  module Wgit
  # The current gem version of Wgit.
- VERSION = '0.2.0'
+ VERSION = '0.3.0'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: wgit
  version: !ruby/object:Gem::Version
- version: 0.2.0
+ version: 0.3.0
  platform: ruby
  authors:
  - Michael Telford
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2019-09-22 00:00:00.000000000 Z
+ date: 2019-10-08 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: addressable