RubyGems - wgit - Versions diffs - 0.0.17 → 0.0.18 - Mend

wgit 0.0.17 → 0.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +61 -0
data/LICENSE.txt +21 -0
data/README.md +16 -7
data/TODO.txt +34 -0
data/lib/wgit.rb +3 -1
data/lib/wgit/assertable.rb +35 -29
data/lib/wgit/core_ext.rb +5 -3
data/lib/wgit/crawler.rb +96 -58
data/lib/wgit/database/connection_details.rb +4 -2
data/lib/wgit/database/database.rb +84 -46
data/lib/wgit/database/model.rb +12 -10
data/lib/wgit/document.rb +100 -72
data/lib/wgit/document_extensions.rb +11 -9
data/lib/wgit/indexer.rb +34 -24
data/lib/wgit/logger.rb +4 -2
data/lib/wgit/url.rb +94 -59
data/lib/wgit/utils.rb +13 -11
data/lib/wgit/version.rb +3 -1
metadata +41 -38

data/lib/wgit/document_extensions.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 ### Default Document Extensions ###
 # Base.
@@ -5,9 +7,9 @@ Wgit::Document.define_extension(
   :base,
   '//base/@href',
   singleton: true,
-  text_content_only: true,
+  text_content_only: true
 ) do |base|
-  base = Wgit::Url.new(base) if base
+  Wgit::Url.new(base) if base
 end
 # Title.
@@ -15,7 +17,7 @@ Wgit::Document.define_extension(
   :title,
   '//title',
   singleton: true,
-  text_content_only: true,
+  text_content_only: true
 )
 # Author.
@@ -23,7 +25,7 @@ Wgit::Document.define_extension(
   :author,
   '//meta[@name="author"]/@content',
   singleton: true,
-  text_content_only: true,
+  text_content_only: true
 )
 # Keywords.
@@ -31,9 +33,9 @@ Wgit::Document.define_extension(
   :keywords,
   '//meta[@name="keywords"]/@content',
   singleton: true,
-  text_content_only: true,
+  text_content_only: true
 ) do |keywords, source|
-  if keywords and source == :html
+  if keywords && (source == :html)
     keywords = keywords.split(',')
     Wgit::Utils.process_arr(keywords)
   end
@@ -45,9 +47,9 @@ Wgit::Document.define_extension(
   :links,
   '//a/@href',
   singleton: false,
-  text_content_only: true,
+  text_content_only: true
 ) do |links|
-  links.map! { |link| Wgit::Url.new(link) } if links
+  links&.map! { |link| Wgit::Url.new(link) }
 end
 # Text.
@@ -55,5 +57,5 @@ Wgit::Document.define_extension(
   :text,
   proc { Wgit::Document.text_elements_xpath },
   singleton: false,
-  text_content_only: true,
+  text_content_only: true
 )

data/lib/wgit/indexer.rb CHANGED Viewed

@@ -1,8 +1,9 @@
+# frozen_string_literal: true
 require_relative 'crawler'
 require_relative 'database/database'
 module Wgit
   # Convenience method to index the World Wide Web using
   # Wgit::Indexer#index_the_web.
   #
@@ -18,7 +19,7 @@ module Wgit
   #   scraped from the web (default is 1GB). Note that this value is used to
   #   determine when to stop crawling; it's not a guarantee of the max data
   #   that will be obtained.
-  def self.index_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
+  def self.index_the_web(max_sites_to_crawl = -1, max_data_size = 1_048_576_000)
     db = Wgit::Database.new
     indexer = Wgit::Indexer.new(db)
     indexer.index_the_web(max_sites_to_crawl, max_data_size)
@@ -81,7 +82,6 @@ module Wgit
   # Class which sets up a crawler and saves the indexed docs to a database.
   class Indexer
     # The crawler used to scrape the WWW.
     attr_reader :crawler
@@ -109,19 +109,19 @@ module Wgit
     #   scraped from the web (default is 1GB). Note that this value is used to
     #   determine when to stop crawling; it's not a guarantee of the max data
     #   that will be obtained.
-    def index_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
+    def index_the_web(max_sites_to_crawl = -1, max_data_size = 1_048_576_000)
       if max_sites_to_crawl < 0
         Wgit.logger.info("Indexing until the database has been filled or it runs out of \
 urls to crawl (which might be never).")
       end
       site_count = 0
-      while keep_crawling?(site_count, max_sites_to_crawl, max_data_size) do
+      while keep_crawling?(site_count, max_sites_to_crawl, max_data_size)
         Wgit.logger.info("Current database size: #{@db.size}")
         @crawler.urls = @db.uncrawled_urls
         if @crawler.urls.empty?
-          Wgit.logger.info("No urls to crawl, exiting.")
+          Wgit.logger.info('No urls to crawl, exiting.')
           return
         end
         Wgit.logger.info("Starting crawl loop for: #{@crawler.urls}")
@@ -181,9 +181,7 @@ iteration.")
       ext_urls = @crawler.crawl_site(url) do |doc|
         result = true
-        if block_given?
-          result = yield(doc)
-        end
+        result = yield(doc) if block_given?
         if result
           if write_doc_to_db(doc)
@@ -221,9 +219,7 @@ site: #{url}")
     def index_this_page(url, insert_externals = true)
       document = @crawler.crawl_page(url) do |doc|
         result = true
-        if block_given?
-          result = yield(doc)
-        end
+        result = yield(doc) if block_given?
         if result
           if write_doc_to_db(doc)
@@ -244,11 +240,20 @@ site: #{url}")
       nil
     end
-  private
+    protected
-    # Keep crawling or not based on DB size and current loop iteration.
+    # Returns whether or not to keep crawling based on the DB size and current
+    # loop iteration.
+    #
+    # @param site_count [Integer] The current number of crawled sites.
+    # @param max_sites_to_crawl [Integer] The maximum number of sites to crawl
+    #   before stopping.
+    # @param max_data_size [Integer] The maximum amount of data to crawl before
+    #   stopping.
+    # @return [Boolean] True if the crawl should continue, false otherwise.
     def keep_crawling?(site_count, max_sites_to_crawl, max_data_size)
       return false if @db.size >= max_data_size
       # If max_sites_to_crawl is -1 for example then crawl away.
       if max_sites_to_crawl < 0
         true
@@ -257,8 +262,11 @@ site: #{url}")
       end
     end
-    # The unique url index on the documents collection prevents duplicate
-    # inserts.
+    # Write the doc to the DB. Note that the unique url index on the documents
+    # collection deliberately prevents duplicate inserts.
+    #
+    # @param doc [Wgit::Document] The document to write to the DB.
+    # @return [Boolean] True if the write was successful, false otherwise.
     def write_doc_to_db(doc)
       @db.insert(doc)
       Wgit.logger.info("Saved document for url: #{doc.url}")
@@ -268,18 +276,20 @@ site: #{url}")
       false
     end
-    # The unique url index on the urls collection prevents duplicate inserts.
+    # Write the urls to the DB. Note that the unique url index on the urls
+    # collection deliberately prevents duplicate inserts.
+    #
+    # @param urls [Array<Wgit::Url>] The urls to write to the DB.
+    # @return [Boolean] True if the write was successful, false otherwise.
     def write_urls_to_db(urls)
       count = 0
       if urls.respond_to?(:each)
         urls.each do |url|
-          begin
-            @db.insert(url)
-            count += 1
-            Wgit.logger.info("Inserted url: #{url}")
-          rescue Mongo::Error::OperationFailure
-            Wgit.logger.info("Url already exists: #{url}")
-          end
+          @db.insert(url)
+          count += 1
+          Wgit.logger.info("Inserted url: #{url}")
+        rescue Mongo::Error::OperationFailure
+          Wgit.logger.info("Url already exists: #{url}")
         end
       end
       count

data/lib/wgit/logger.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 # FYI: The default logger is set at the bottom of this file.
 require 'logger'
@@ -24,7 +26,7 @@ module Wgit
   # @return [Logger] The default Logger instance.
   def self.default_logger
     logger = Logger.new(STDOUT, progname: 'wgit', level: :info)
-    logger.formatter = proc do |severity, datetime, progname, msg|
+    logger.formatter = proc do |_severity, _datetime, progname, msg|
       "[#{progname}] #{msg}\n"
     end
     logger
@@ -33,7 +35,7 @@ module Wgit
   # Sets the default Logger instance to be used by Wgit.
   # @return [Logger] The default Logger instance.
   def self.use_default_logger
-    @logger = self.default_logger
+    @logger = default_logger
   end
 end

data/lib/wgit/url.rb CHANGED Viewed

@@ -1,10 +1,11 @@
+# frozen_string_literal: true
 require_relative 'utils'
 require_relative 'assertable'
 require 'uri'
 require 'addressable/uri'
 module Wgit
   # Class modeling a web based URL.
   # Can be an internal/relative link e.g. "about.html" or a full URL
   # e.g. "http://www.google.co.uk". Is a subclass of String and uses
@@ -12,8 +13,9 @@ module Wgit
   class Url < String
     include Assertable
-    # Whether or not the Url has been crawled or not.
-    attr_accessor :crawled
+    # Whether or not the Url has been crawled. A custom crawled= method
+    # is also provided by this class.
+    attr_reader :crawled
     # The date which the Url was crawled.
     attr_accessor :date_crawled
@@ -39,9 +41,9 @@ module Wgit
         obj = url_or_obj
         assert_respond_to(obj, [:fetch, :[]])
-        url = obj.fetch("url") # Should always be present.
-        crawled = obj.fetch("crawled", false)
-        date_crawled = obj["date_crawled"]
+        url = obj.fetch('url') # Should always be present.
+        crawled = obj.fetch('crawled', false)
+        date_crawled = obj['date_crawled']
       end
       @uri = Addressable::URI.parse(url)
@@ -56,7 +58,7 @@ module Wgit
     # @param str [String] The URL string to parse.
     # @return [Wgit::Url] The parsed Url object.
     def self.parse(str)
-      self.new(str)
+      new(str)
     end
     # Raises an exception if url is not a valid HTTP URL.
@@ -65,13 +67,11 @@ module Wgit
     # @raise [RuntimeError] If url is invalid.
     def self.validate(url)
       url = Wgit::Url.new(url)
-      if url.relative_link?
-        raise "Invalid url (or a relative link): #{url}"
-      end
-      unless url.start_with?("http://") or url.start_with?("https://")
+      raise "Invalid url (or a relative link): #{url}" if url.relative_link?
+      unless url.start_with?('http://') || url.start_with?('https://')
         raise "Invalid url (missing protocol prefix): #{url}"
       end
-      if URI.regexp.match(url.normalise).nil?
+      if URI::DEFAULT_PARSER.make_regexp.match(url.normalise).nil?
         raise "Invalid url: #{url}"
       end
     end
@@ -83,7 +83,7 @@ module Wgit
     def self.valid?(url)
       Wgit::Url.validate(url)
       true
-    rescue
+    rescue StandardError
       false
     end
@@ -95,7 +95,7 @@ module Wgit
     # @param https [Boolean] Whether the protocol prefix is https or http.
     # @return [Wgit::Url] The url with a protocol prefix.
     def self.prefix_protocol(url, https = false)
-      unless url.start_with?("http://") or url.start_with?("https://")
+      unless url.start_with?('http://') || url.start_with?('https://')
         if https
           url.replace("https://#{url}")
         else
@@ -113,7 +113,7 @@ module Wgit
     def self.concat(host, link)
       host = Wgit::Url.new(host).without_trailing_slash
       link = Wgit::Url.new(link).without_leading_slash
-      separator = (link.start_with?('#') or link.start_with?('?')) ? '' : '/'
+      separator = (link.start_with?('#') || link.start_with?('?')) ? '' : '/'
       Wgit::Url.new(host + separator + link)
     end
@@ -126,26 +126,35 @@ module Wgit
       super(new_url)
     end
-    # Returns true if self is a relative Url.
+    # Returns true if self is a relative Url; false if absolute.
     #
     # All external links in a page are expected to have a protocol prefix e.g.
     # "http://", otherwise the link is treated as an internal link (regardless
-    # of whether it's valid or not). The only exception is if host or domain is
-    # provided and self is a page belonging to that host/domain; then the link
-    # is relative.
+    # of whether it's valid or not). The only exception is if an opts arg is
+    # provided and self is a page belonging to that arg type e.g. domain; then
+    # the link is relative.
     #
-    # @param host [Wgit::Url, String] The Url host e.g.
-    #   http://www.google.com/how which gives a host of www.google.com.
+    # @param opts [Hash] The options with which to check relativity.
+    # @option opts [Wgit::Url, String] :host The Url host e.g.
+    #   http://www.google.com/how which gives a host of 'www.google.com'.
     #   The host must be absolute and prefixed with a protocol.
-    # @param domain [Wgit::Url, String] The Url domain e.g.
-    #   http://www.google.com/how which gives a domain of google.com. The
+    # @option opts [Wgit::Url, String] :domain The Url domain e.g.
+    #   http://www.google.com/how which gives a domain of 'google.com'. The
     #   domain must be absolute and prefixed with a protocol.
-    # @return [Boolean] True if relative, false if absolute.
+    # @option opts [Wgit::Url, String] :brand The Url brand e.g.
+    #   http://www.google.com/how which gives a brand of 'google'. The
+    #   brand must be absolute and prefixed with a protocol.
     # @raise [RuntimeError] If self is invalid e.g. empty.
-    def is_relative?(host: nil, domain: nil)
-      raise "Invalid link: #{self}" if nil? or empty?
-      raise "Provide host or domain, not both" if host and domain
+    # @return [Boolean] True if relative, false if absolute.
+    def is_relative?(opts = {})
+      opts = { host: nil, domain: nil, brand: nil }.merge(opts)
+      raise "Invalid link: '#{self}'" if empty?
+      if opts.values.count(nil) < (opts.length - 1)
+        raise "Provide only one of: #{opts.keys}"
+      end
+      host = opts[:host]
       if host
         host = Wgit::Url.new(host)
         if host.to_base.nil?
@@ -153,6 +162,7 @@ module Wgit
         end
       end
+      domain = opts[:domain]
       if domain
         domain = Wgit::Url.new(domain)
         if domain.to_base.nil?
@@ -160,11 +170,22 @@ module Wgit
         end
       end
+      brand = opts[:brand]
+      if brand
+        brand = Wgit::Url.new(brand)
+        if brand.to_base.nil?
+          raise "Invalid brand, must be absolute and contain protocol: #{brand}"
+        end
+      end
       if @uri.relative?
         true
       else
         return host   ? to_host   == host.to_host     : false if host
         return domain ? to_domain == domain.to_domain : false if domain
+        return brand  ? to_brand  == brand.to_brand   : false if brand
+        false
       end
     end
@@ -240,11 +261,21 @@ module Wgit
       domain ? Wgit::Url.new(domain) : nil
     end
+    # Returns a new Wgit::Url containing just the brand of this URL e.g.
+    # given http://www.google.co.uk/about.html, 'google' is returned.
+    #
+    # @return [Wgit::Url, nil] Containing just the brand or nil.
+    def to_brand
+      domain = to_domain
+      domain ? Wgit::Url.new(domain.split('.').first) : nil
+    end
     # Returns only the base of this URL e.g. the protocol and host combined.
     #
     # @return [Wgit::Url, nil] Base of self e.g. http://www.google.co.uk or nil.
     def to_base
-      return nil if @uri.scheme.nil? or @uri.host.nil?
+      return nil if @uri.scheme.nil? || @uri.host.nil?
       base = "#{@uri.scheme}://#{@uri.host}"
       Wgit::Url.new(base)
     end
@@ -257,8 +288,9 @@ module Wgit
     # @return [Wgit::Url, nil] Path of self e.g. about.html or nil.
     def to_path
       path = @uri.path
-      return nil if path.nil? or path.empty?
+      return nil if path.nil? || path.empty?
       return Wgit::Url.new('/') if path == '/'
       Wgit::Url.new(path).without_slashes
     end
@@ -300,6 +332,7 @@ module Wgit
     def to_extension
       path = to_path
       return nil unless path
       segs = path.split('.')
       segs.length > 1 ? Wgit::Url.new(segs.last) : nil
     end
@@ -344,6 +377,7 @@ module Wgit
       without_base = base_url ? gsub(base_url, '') : self
       return self if ['', '/'].include?(without_base)
       Wgit::Url.new(without_base).without_slashes
     end
@@ -395,36 +429,37 @@ module Wgit
     #
     # @return [Hash] self's instance vars as a Hash.
     def to_h
-      ignore = ["@uri"]
+      ignore = ['@uri']
       h = Wgit::Utils.to_h(self, ignore)
-      Hash[h.to_a.insert(0, ["url", self])] # Insert url at position 0.
-    end
-    alias :uri :to_uri
-    alias :url :to_url
-    alias :scheme :to_scheme
-    alias :to_protocol :to_scheme
-    alias :protocol :to_scheme
-    alias :host :to_host
-    alias :domain :to_domain
-    alias :base :to_base
-    alias :path :to_path
-    alias :endpoint :to_endpoint
-    alias :query_string :to_query_string
-    alias :query :to_query_string
-    alias :anchor :to_anchor
-    alias :to_fragment :to_anchor
-    alias :fragment :to_anchor
-    alias :extension :to_extension
-    alias :without_query :without_query_string
-    alias :without_fragment :without_anchor
-    alias :is_query? :is_query_string?
-    alias :is_fragment? :is_anchor?
-    alias :relative_link? :is_relative?
-    alias :internal_link? :is_relative?
-    alias :is_internal? :is_relative?
-    alias :relative? :is_relative?
-    alias :crawled? :crawled
-    alias :normalize :normalise
+      Hash[h.to_a.insert(0, ['url', self])] # Insert url at position 0.
+    end
+    alias uri to_uri
+    alias url to_url
+    alias scheme to_scheme
+    alias to_protocol to_scheme
+    alias protocol to_scheme
+    alias host to_host
+    alias domain to_domain
+    alias brand to_brand
+    alias base to_base
+    alias path to_path
+    alias endpoint to_endpoint
+    alias query_string to_query_string
+    alias query to_query_string
+    alias anchor to_anchor
+    alias to_fragment to_anchor
+    alias fragment to_anchor
+    alias extension to_extension
+    alias without_query without_query_string
+    alias without_fragment without_anchor
+    alias is_query? is_query_string?
+    alias is_fragment? is_anchor?
+    alias relative_link? is_relative?
+    alias internal_link? is_relative?
+    alias is_internal? is_relative?
+    alias relative? is_relative?
+    alias crawled? crawled
+    alias normalize normalise
   end
 end