RubyGems - wgit - Versions diffs - 0.0.18 → 0.2.0 - Mend

wgit 0.0.18 → 0.2.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (19)

checksums.yaml +4 -4
data/lib/wgit.rb +0 -1
data/lib/wgit/assertable.rb +20 -23
data/lib/wgit/core_ext.rb +6 -14
data/lib/wgit/crawler.rb +94 -183
data/lib/wgit/database/database.rb +209 -185
data/lib/wgit/database/model.rb +7 -7
data/lib/wgit/document.rb +281 -241
data/lib/wgit/indexer.rb +99 -92
data/lib/wgit/logger.rb +5 -1
data/lib/wgit/url.rb +171 -185
data/lib/wgit/utils.rb +57 -68
data/lib/wgit/version.rb +1 -1
metadata +86 -60
data/CHANGELOG.md +0 -61
data/LICENSE.txt +0 -21
data/README.md +0 -361
data/TODO.txt +0 -34
data/lib/wgit/database/connection_details.rb +0 -41

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 26e6a29fbf72b0ecbbc487c8aba9ec243a260b4761805c6c7923f2af82fa94f5
-  data.tar.gz: 9e15ad14991418fc3b4b2c0dafacac617b32197e825ad72887d91182c8ddf652
+  metadata.gz: 6956381fcc74e20521f0e219cbfaaa74da79de5bdb24349c2fdf4643ca384a31
+  data.tar.gz: a544446aa9333d2001119df37ca929cdf2585f89ed084071e077c460b4ff24c9
 SHA512:
-  metadata.gz: 4b17b8467abf13b186e88fb63fe8630163612bc685d7d521122fdc4c693e7d9229c59888afa1191189b3838317fa29e028c90757b880177c1e7a8f81a0a38047
-  data.tar.gz: 6fb7bb518ca3b9e520e1edbf25b4c265018686b5c61e134d623c38efa1bdf5073affb5205e47aee3a32a4502b56205080ded00a79bd6f138cf9178b019a2b32d
+  metadata.gz: 517665017a25419d9213df10347cd704a98ee0061243ebcd8d482465461a16d5b8319971321703b663ec8d6ef8f453d60d771d2122590b1655a6fc08be461026
+  data.tar.gz: 760e1c8b1b5cf385dfb1d0418c3b416cdef7a9e02595b1f729a30179848145cdc3c4fa25e2bacf073779baba9909b20ef9f2c5038c8b9df1437f0ade81e05990

data/lib/wgit.rb CHANGED

@@ -8,7 +8,6 @@ require_relative 'wgit/url'
 require_relative 'wgit/document'
 require_relative 'wgit/document_extensions'
 require_relative 'wgit/crawler'
-require_relative 'wgit/database/connection_details'
 require_relative 'wgit/database/model'
 require_relative 'wgit/database/database'
 require_relative 'wgit/indexer'

data/lib/wgit/assertable.rb CHANGED

@@ -1,8 +1,7 @@
 # frozen_string_literal: true
 module Wgit
-  # Module containing assert methods including type checking which can be used
-  # for asserting the integrity of method definitions etc.
+  # Module containing assertion methods including type checking and duck typing.
   module Assertable
     # Default type fail message.
     DEFAULT_TYPE_FAIL_MSG = 'Expected: %s, Actual: %s'
@@ -11,21 +10,23 @@ module Wgit
     # Default duck fail message.
     DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
     # Default required keys message.
-    DEFAULT_REQUIRED_KEYS_MSG = 'Some or all of the required keys are not present: %s'
+    DEFAULT_REQUIRED_KEYS_MSG = "Some or all of the required keys are not \
+present: %s"
-    # Tests if the obj is of a given type.
+    # Tests if the obj is_a? given type; raises an Exception if not.
     #
     # @param obj [Object] The Object to test.
     # @param type_or_types [Type, Array<Type>] The type/types that obj must
     #     belong to or an exception is thrown.
-    # @param msg [String] The raised RuntimeError message, if provided.
+    # @param msg [String] The raised StandardError message, if provided.
+    # @raise [StandardError] If the assertion fails.
     # @return [Object] The given obj on successful assertion.
     def assert_types(obj, type_or_types, msg = nil)
       msg ||= format(DEFAULT_TYPE_FAIL_MSG, type_or_types, obj.class)
       match = if type_or_types.respond_to?(:any?)
-                type_or_types.any? { |type| obj.instance_of?(type) }
+                type_or_types.any? { |type| obj.is_a?(type) }
               else
-                obj.instance_of?(type_or_types)
+                obj.is_a?(type_or_types)
               end
       raise msg unless match
@@ -33,36 +34,36 @@ module Wgit
     end
     # Each object within arr must match one of the types listed in
-    # type_or_types or an exception is raised using msg, if provided.
+    # type_or_types; or an exception is raised using msg, if provided.
     #
     # @param arr [Enumerable#each] Enumerable of objects to type check.
     # @param type_or_types [Type, Array<Type>] The allowed type(s).
-    # @param msg [String] The raised RuntimeError message, if provided.
+    # @param msg [String] The raised StandardError message, if provided.
+    # @raise [StandardError] If the assertion fails.
     # @return [Object] The given arr on successful assertion.
     def assert_arr_types(arr, type_or_types, msg = nil)
       raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
-      arr.each do |obj|
-        assert_types(obj, type_or_types, msg)
-      end
+      arr.each { |obj| assert_types(obj, type_or_types, msg) }
     end
     # The obj_or_objs must respond_to? all of the given methods or an
     # Exception is raised using msg, if provided.
     #
-    # @param obj_or_objs [Object, Enumerable#each] The objects to duck check.
+    # @param obj_or_objs [Object, Enumerable#each] The object(s) to duck check.
     # @param methods [Array<Symbol>] The methods to :respond_to?.
-    # @param msg [String] The raised RuntimeError message, if provided.
+    # @param msg [String] The raised StandardError message, if provided.
+    # @raise [StandardError] If the assertion fails.
     # @return [Object] The given obj_or_objs on successful assertion.
     def assert_respond_to(obj_or_objs, methods, msg = nil)
       methods = [methods] unless methods.respond_to?(:all?)
       if obj_or_objs.respond_to?(:each)
-        obj_or_objs.each do |obj|
-          _assert_respond_to(obj, methods, msg)
-        end
+        obj_or_objs.each { |obj| _assert_respond_to(obj, methods, msg) }
       else
         _assert_respond_to(obj_or_objs, methods, msg)
       end
       obj_or_objs
     end
@@ -71,6 +72,7 @@ module Wgit
     # @param hash [Hash] The hash which should include the required keys.
     # @param keys [Array<String, Symbol>] The keys whose presence to assert.
     # @param msg [String] The raised KeyError message, if provided.
+    # @raise [KeyError] If the assertion fails.
     # @return [Hash] The given hash on successful assertion.
     def assert_required_keys(hash, keys, msg = nil)
       msg ||= format(DEFAULT_REQUIRED_KEYS_MSG, keys.join(', '))
@@ -93,12 +95,7 @@ module Wgit
       obj
     end
-    alias assert_type assert_types
-    alias type assert_types
-    alias types assert_types
+    alias assert_type     assert_types
     alias assert_arr_type assert_arr_types
-    alias arr_type assert_arr_types
-    alias arr_types assert_arr_types
-    alias respond_to assert_respond_to
   end
 end

data/lib/wgit/core_ext.rb CHANGED

@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 # Script which extends Ruby's core functionality when parsed.
-# Needs to be required separately using `require 'wgit/core_ext'`.
+# Needs to be required separately to 'wgit' using `require 'wgit/core_ext'`.
 require_relative 'url'
@@ -22,19 +22,15 @@ module Enumerable
   #
   # @return [Array<Wgit::Url>] The converted URL's.
   def to_urls
-    map do |element|
-      process_url_element(element)
-    end
+    map { |element| process_url_element(element) }
   end
-  # Converts each String instance into a Wgit::Url object and returns the
-  # updated array. Modifies the receiver.
+  # Converts each String instance into a Wgit::Url object and returns self
+  # having modified the receiver.
   #
   # @return [Array<Wgit::Url>] Self containing the converted URL's.
   def to_urls!
-    map! do |element|
-      process_url_element(element)
-    end
+    map! { |element| process_url_element(element) }
   end
 end
@@ -42,9 +38,5 @@ private
 # Converts the element to a Wgit::Url if the element is a String.
 def process_url_element(element)
-  if element.is_a? String
-    element.to_url
-  else
-    element
-  end
+  element.is_a?(String) ? element.to_url : element
 end

data/lib/wgit/crawler.rb CHANGED

@@ -7,142 +7,24 @@ require_relative 'assertable'
 require 'net/http' # Requires 'uri'.
 module Wgit
-  # The Crawler class provides a means of crawling web based Wgit::Url's, turning
-  # their HTML into Wgit::Document instances.
+  # The Crawler class provides a means of crawling web based HTTP Wgit::Url's,
+  # serialising their HTML into Wgit::Document instances.
   class Crawler
     include Assertable
-    # The default maximum amount of allowed URL redirects.
-    @default_redirect_limit = 5
-    class << self
-      # Class level instance accessor methods for @default_redirect_limit.
-      # Call using Wgit::Crawler.default_redirect_limit etc.
-      attr_accessor :default_redirect_limit
-    end
-    # The urls to crawl.
-    attr_reader :urls
-    # The docs of the crawled @urls.
-    attr_reader :docs
+    # The amount of allowed redirects before raising an error. Set to 0 to
+    # disable redirects completely.
+    attr_accessor :redirect_limit
     # The Net::HTTPResponse of the most recently crawled URL or nil.
     attr_reader :last_response
-    # Initializes the Crawler and sets the @urls and @docs.
-    #
-    # @param urls [*Wgit::Url] The URL's to crawl in the future using either
-    #   Crawler#crawl_url or Crawler#crawl_site. Note that the urls passed here
-    #   will NOT update if they happen to redirect when crawled. If in doubt,
-    #   pass the url(s) directly to the crawl_* method instead of to the new
-    #   method.
-    def initialize(*urls)
-      self.[](*urls)
-      @docs = []
-    end
-    # Sets this Crawler's @urls.
-    #
-    # @param urls [*Wgit::Url] The URL's to crawl in the future using either
-    #   crawl_url or crawl_site. Note that the urls passed here will NOT update
-    #   if they happen to redirect when crawled. If in doubt, pass the url(s)
-    #   directly to the crawl_* method instead of to the new method.
-    def urls=(urls)
-      @urls = []
-      Wgit::Utils.each(urls) { |url| add_url(url) }
-    end
-    # Sets this Crawler's @urls.
-    #
-    # @param urls [*Wgit::Url] The URL's to crawl in the future using either
-    #   crawl_url or crawl_site. Note that the urls passed here will NOT update
-    #   if they happen to redirect when crawled. If in doubt, pass the url(s)
-    #   directly to the crawl_* method instead of to the new method.
-    def [](*urls)
-      # If urls is nil then add_url (when called later) will set @urls = []
-      # so we do nothing here.
-      unless urls.nil?
-        # Due to *urls you can end up with [[url1,url2,url3]] etc. where the
-        # outer array is bogus so we use the inner one only.
-        if  urls.is_a?(Enumerable) &&
-            urls.length == 1 &&
-            urls.first.is_a?(Enumerable)
-          urls = urls.first
-        end
-        # Here we call urls= method using self because the param name is also
-        # urls which conflicts.
-        self.urls = urls
-      end
-    end
-    # Adds the url to this Crawler's @urls.
+    # Initializes and returns a Wgit::Crawler instance.
     #
-    # @param url [Wgit::Url] A URL to crawl later by calling a crawl_* method.
-    #   Note that the url added here will NOT update if it happens to
-    #   redirect when crawled. If in doubt, pass the url directly to the
-    #   crawl_* method instead of to the new method.
-    def <<(url)
-      add_url(url)
-    end
-    # Crawls one or more individual urls using Wgit::Crawler#crawl_url
-    # underneath. See Wgit::Crawler#crawl_site for crawling entire sites. Note
-    # that any external redirects are followed. Use Wgit::Crawler#crawl_url if
-    # this isn't desirable.
-    #
-    # @param urls [Array<Wgit::Url>] The URLs to crawl.
-    # @yield [Wgit::Document] If provided, the block is given each crawled
-    #   Document. Otherwise each doc is added to @docs which can be accessed
-    #   by Crawler#docs after this method returns.
-    # @return [Wgit::Document] The last Document crawled.
-    def crawl_urls(urls = @urls, &block)
-      raise 'No urls to crawl' unless urls
-      @docs = []
-      doc = nil
-      Wgit::Utils.each(urls) { |url| doc = handle_crawl_block(url, &block) }
-      doc || @docs.last
-    end
-    # Crawl the url returning the response Wgit::Document or nil if an error
-    # occurs.
-    #
-    # @param url [Wgit::Url] The URL to crawl.
-    # @param follow_external_redirects [Boolean] Whether or not to follow
-    #   an external redirect. False will return nil for such a crawl. If false,
-    #   you must also provide a `host:` parameter.
-    # @param host [Wgit::Url, String] Specify the host by which
-    #   an absolute redirect is determined to be internal or not. Must be
-    #   absolute and contain a protocol prefix. For example, a `host:` of
-    #   'http://www.example.com' will only allow redirects for Urls with a
-    #   `to_host` value of 'www.example.com'.
-    # @yield [Wgit::Document] The crawled HTML Document regardless if the
-    #   crawl was successful or not. Therefore, the Document#url can be used.
-    # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
-    #   crawl was unsuccessful.
-    def crawl_url(
-      url = @urls.first,
-      follow_external_redirects: true,
-      host: nil
-    )
-      assert_type(url, Wgit::Url)
-      if !follow_external_redirects && host.nil?
-        raise 'host cannot be nil if follow_external_redirects is false'
-      end
-      html = fetch(
-        url,
-        follow_external_redirects: follow_external_redirects,
-        host: host
-      )
-      url.crawled = true
-      doc = Wgit::Document.new(url, html)
-      yield(doc) if block_given?
-      doc.empty? ? nil : doc
+    # @param redirect_limit [Integer] The amount of allowed redirects before
+    #   raising an error. Set to 0 to disable redirects completely.
+    def initialize(redirect_limit: 5)
+      @redirect_limit = redirect_limit
     end
     # Crawls an entire website's HTML pages by recursively going through
@@ -159,18 +41,16 @@ module Wgit
     # @param url [Wgit::Url] The base URL of the website to be crawled.
     #   It is recommended that this URL be the index page of the site to give a
     #   greater chance of finding all pages within that site/host.
-    # @yield [Wgit::Document] Given each crawled Document/page of the site.
+    # @yield [doc] Given each crawled page (Wgit::Document) of the site.
     #   A block is the only way to interact with each crawled Document.
     # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
     #   from all of the site's pages or nil if the url could not be
     #   crawled successfully.
-    def crawl_site(url = @urls.first, &block)
-      assert_type(url, Wgit::Url)
+    def crawl_site(url, &block)
       doc = crawl_url(url, &block)
       return nil if doc.nil?
-      host      = url.to_base
+      opts      = { follow_external_redirects: false, host: url.to_base }
       alt_url   = url.end_with?('/') ? url.chop : url + '/'
       crawled   = [url, alt_url]
       externals = doc.external_links
@@ -187,9 +67,7 @@ module Wgit
         links.each do |link|
           orig_link = link.dup
-          doc = crawl_url(
-            link, follow_external_redirects: false, host: host, &block
-          )
+          doc = crawl_url(link, opts, &block)
           crawled.push(orig_link, link) # Push both in case of redirects.
           next if doc.nil?
@@ -202,6 +80,66 @@ module Wgit
       externals.uniq
     end
+    # Crawls one or more individual urls using Wgit::Crawler#crawl_url
+    # underneath. See Wgit::Crawler#crawl_site for crawling entire sites.
+    #
+    # @param urls [*Wgit::Url] The Url's to crawl.
+    # @yield [doc] Given each crawled page (Wgit::Document); this is the only
+    #   way to interact with them.
+    # @raise [StandardError] If no urls are provided.
+    # @return [Wgit::Document] The last Document crawled.
+    def crawl_urls(*urls, follow_external_redirects: true, host: nil, &block)
+      raise 'You must provide at least one Url' if urls.empty?
+      opts = {
+        follow_external_redirects: follow_external_redirects,
+        host: host
+      }
+      doc = nil
+      Wgit::Utils.each(urls) { |url| doc = crawl_url(url, opts, &block) }
+      doc
+    end
+    # Crawl the url returning the response Wgit::Document or nil if an error
+    # occurs.
+    #
+    # @param url [Wgit::Url] The Url to crawl.
+    # @param follow_external_redirects [Boolean] Whether or not to follow
+    #   an external redirect. External meaning to a different host. False will
+    #   return nil for such a crawl. If false, you must also provide a `host:`
+    #   parameter.
+    # @param host [Wgit::Url, String] Specify the host by which
+    #   an absolute redirect is determined to be internal or not. Must be
+    #   absolute and contain a protocol prefix. For example, a `host:` of
+    #   'http://www.example.com' will only allow redirects for Url's with a
+    #   `to_host` value of 'www.example.com'.
+    # @yield [doc] The crawled HTML page (Wgit::Document) regardless if the
+    #   crawl was successful or not. Therefore, Document#url etc. can be used.
+    # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
+    #   crawl was unsuccessful.
+    def crawl_url(url, follow_external_redirects: true, host: nil)
+      # A String url isn't allowed because it's passed by value not reference,
+      # meaning a redirect isn't reflected; A Wgit::Url is passed by reference.
+      assert_type(url, Wgit::Url)
+      if !follow_external_redirects && host.nil?
+        raise 'host cannot be nil if follow_external_redirects is false'
+      end
+      html = fetch(
+        url,
+        follow_external_redirects: follow_external_redirects,
+        host: host
+      )
+      url.crawled = true
+      doc = Wgit::Document.new(url, html)
+      yield(doc) if block_given?
+      doc.empty? ? nil : doc
+    end
     protected
     # This method calls Wgit::Crawler#resolve to obtain the page HTML, handling
@@ -227,22 +165,19 @@ module Wgit
         host: host
       )
       @last_response = response
       response.body.empty? ? nil : response.body
     rescue StandardError => e
-      Wgit.logger.debug(
-        "Wgit::Crawler#fetch('#{url}') exception: #{e.message}"
-      )
+      Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e.message}")
       @last_response = nil
       nil
     end
     # The resolve method performs a HTTP GET to obtain the HTML response. The
-    # Net::HTTPResponse will be returned or an error raised. Redirects can be
-    # disabled by setting `redirect_limit: 0`.
+    # Net::HTTPResponse will be returned or an error raised.
     #
     # @param url [Wgit::Url] The URL to fetch the HTML from.
-    # @param redirect_limit [Integer] The number of redirect hops to allow
-    #   before raising an error.
     # @param follow_external_redirects [Boolean] Whether or not to follow
     #   an external redirect. If false, you must also provide a `host:`
     #   parameter.
@@ -254,12 +189,7 @@ module Wgit
     # @raise [StandardError] If !url.respond_to? :to_uri or a redirect isn't
     #   allowed.
     # @return [Net::HTTPResponse] The HTTP response of the GET request.
-    def resolve(
-      url,
-      redirect_limit: Wgit::Crawler.default_redirect_limit,
-      follow_external_redirects: true,
-      host: nil
-    )
+    def resolve(url, follow_external_redirects: true, host: nil)
       raise 'url must respond to :to_uri' unless url.respond_to?(:to_uri)
       redirect_count = 0
@@ -267,25 +197,25 @@ module Wgit
       loop do
         response = Net::HTTP.get_response(url.to_uri)
+        break unless response.is_a?(Net::HTTPRedirection)
         location = Wgit::Url.new(response.fetch('location', ''))
+        raise 'Encountered redirect without Location header' if location.empty?
-        break unless response.is_a?(Net::HTTPRedirection)
         yield(url, response, location) if block_given?
-        unless location.empty?
-          if  !follow_external_redirects &&
-              !location.is_relative?(host: host)
-            raise "External redirect not allowed - Redirected to: \
+        if !follow_external_redirects && !location.is_relative?(host: host)
+          raise "External redirect not allowed - Redirected to: \
 '#{location}', which is outside of host: '#{host}'"
-          end
+        end
-          raise 'Too many redirects' if redirect_count >= redirect_limit
+        raise "Too many redirects: #{redirect_count}" \
+        if redirect_count >= @redirect_limit
-          redirect_count += 1
+        redirect_count += 1
-          location = url.to_base.concat(location) if location.is_relative?
-          url.replace(location)
-        end
+        location = url.to_base.concat(location) if location.is_relative?
+        url.replace(location) # Update the url on redirect.
       end
       response
@@ -300,7 +230,7 @@ module Wgit
     #   internal page links.
     # @return [Array<Wgit::Url>] The internal page links from doc.
     def get_internal_links(doc)
-      doc.internal_full_links
+      doc.internal_absolute_links
          .map(&:without_anchor) # Because anchors don't change page content.
          .uniq
          .reject do |link|
@@ -309,28 +239,9 @@ module Wgit
       end
     end
-    private
-    # Add the document to the @docs array for later processing or let the block
-    # process it here and now.
-    def handle_crawl_block(url, &block)
-      if block_given?
-        crawl_url(url, &block)
-      else
-        @docs << crawl_url(url)
-        nil
-      end
-    end
-    # Add the url to @urls ensuring it is cast to a Wgit::Url if necessary.
-    def add_url(url)
-      @urls = [] if @urls.nil?
-      @urls << Wgit::Url.new(url)
-    end
-    alias crawl crawl_urls
+    alias crawl       crawl_urls
     alias crawl_pages crawl_urls
-    alias crawl_page crawl_url
-    alias crawl_r crawl_site
+    alias crawl_page  crawl_url
+    alias crawl_r     crawl_site
   end
 end