RubyGems - wgit - Versions diffs - 0.0.16 → 0.0.17 - Mend

wgit 0.0.16 → 0.0.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

checksums.yaml +4 -4
data/README.md +9 -9
data/lib/wgit/crawler.rb +111 -48
data/lib/wgit/document.rb +46 -32
data/lib/wgit/document_extensions.rb +11 -9
data/lib/wgit/indexer.rb +2 -2
data/lib/wgit/logger.rb +5 -1
data/lib/wgit/url.rb +79 -17
data/lib/wgit/version.rb +1 -1
metadata +1 -15

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 269236ab66e07aaabe01d61f765689e3d997628ad76d5f61a9c477e35d67880b
-  data.tar.gz: 5fd11a994c23cd9569099109f8e2236873cf2d6267ea38bd661329620ece50b0
+  metadata.gz: a805551a72869241a425dc4d0f88ed6f740c75b95db6e3acf2564393b79708d9
+  data.tar.gz: 5425e8bb21c7822b5ac93afbe6c7d90777a649808702c3e6012c0ec19cbe1dfb
 SHA512:
-  metadata.gz: 1c97aab9a225690205fcf10a99d2f632c45c08e7c3c5a543a0d374eb0595a6953baf77acd16eebf032c741d671d9e0fece030b01578af14fabc2acfa446734aa
-  data.tar.gz: eefb60a4462142fce4643dc12edac1fa11951c32f7e2d72f3295369fc8db83b3b126ea5e51410bc9ca9955cc1d7c386ac3c0aac77b1c6eaf9bc89ffc517f44ee
+  metadata.gz: fd5dcc1b4e9706326810b3fdbdf1df285ec1a98788aac9521fbcd52ad4132c039ab2a2b2d2e574af115845d1968c0eb1bc8d487dbbec4ee9a3427597bb99b09f
+  data.tar.gz: 3e947536f694ea74460f919cab1ec8e42274eb6bd0f856ac900c6b2e4f31da22ddc920afbaaa3a4b80abe3d9729fdcf00964e794363f3d76e1f35dc33a05224a

data/README.md CHANGED Viewed

@@ -57,13 +57,13 @@ doc.stats # => {
 # doc responds to the following methods:
 Wgit::Document.instance_methods(false).sort # => [
-# :==, :[], :author, :css, :date_crawled, :doc, :empty?, :external_links,
-# :external_urls, :html, :internal_full_links, :internal_links,
-# :internal_links_without_anchors, :keywords, :links, :relative_full_links,
-# :relative_full_urls, :relative_links, :relative_urls, :score, :search,
-# :search!, :size, :stats, :text, :title, :to_h, :to_json, :url,
-# :xpath
-#]
+#   :==, :[], :author, :base, :base_url, :css, :date_crawled, :doc, :empty?,
+#   :external_links, :external_urls, :html, :internal_absolute_links,
+#   :internal_full_links, :internal_links, :keywords, :links,
+#   :relative_absolute_links, :relative_absolute_urls, :relative_full_links,
+#   :relative_full_urls, :relative_links, :relative_urls, :score, :search,
+#   :search!, :size, :stats, :text, :title, :to_h, :to_json, :url, :xpath
+# ]
 results = doc.search "corruption"
 results.first # => "ial materials involving war, spying and corruption.
@@ -325,7 +325,7 @@ Currently there is no executable provided with Wgit, however...
 In future versions of Wgit, an executable will be packaged with the gem. The executable will provide a `pry` console with the `wgit` gem already loaded. Using the console, you'll easily be able to index and search the web without having to write your own scripts.
-This executable will be very similar in nature to `./bin/console` which is currently used only for development and isn't packaged as part of the `wgit` gem.
+This executable will be similar in nature to `./bin/console` which is currently used for development and isn't packaged as part of the `wgit` gem.
 ## Change Log
@@ -345,7 +345,7 @@ The current road map is rudimentally listed in the [TODO.txt](https://github.com
 For a full list of available Rake tasks, run `bundle exec rake help`. The most commonly used tasks are listed below...
-After checking out the repo, run `./bin/setup` to install dependencies (requires `bundler`). Then, run `bundle exec rake test` to run the tests. You can also run `./bin/console` for an interactive (`pry`) REPL that will allow you to experiment with the code.
+After checking out the repo, run `bundle exec rake setup` to install the dependencies (requires `bundler`). Then, run `bundle exec rake test` to run the tests. You can also run `bundle exec rake console` for an interactive (`pry`) REPL that will allow you to experiment with the code.
 To generate code documentation run `bundle exec yard doc`. To browse the generated documentation run `bundle exec yard server -r`.

data/lib/wgit/crawler.rb CHANGED Viewed

@@ -6,8 +6,8 @@ require 'net/http' # Requires 'uri'.
 module Wgit
-  # The Crawler class provides a means of crawling web based URL's, turning
-  # their HTML into Wgit::Document's.
+  # The Crawler class provides a means of crawling web based Wgit::Url's, turning
+  # their HTML into Wgit::Document instances.
   class Crawler
     include Assertable
@@ -29,9 +29,13 @@ module Wgit
     # The Net::HTTPResponse of the most recently crawled URL or nil.
     attr_reader :last_response
-    # Initializes the Crawler by setting the @urls and @docs.
+    # Initializes the Crawler and sets the @urls and @docs.
     #
-    # @param urls [*Wgit::Url] The URLs to crawl.
+    # @param urls [*Wgit::Url] The URL's to crawl in the future using either
+    #   Crawler#crawl_url or Crawler#crawl_site. Note that the urls passed here
+    #   will NOT update if they happen to redirect when crawled. If in doubt,
+    #   pass the url(s) directly to the crawl_* method instead of to the new
+    #   method.
     def initialize(*urls)
       self.[](*urls)
       @docs = []
@@ -39,7 +43,10 @@ module Wgit
     # Sets this Crawler's @urls.
     #
-    # @param urls [Array<Wgit::Url>] The URLs to crawl.
+    # @param urls [*Wgit::Url] The URL's to crawl in the future using either
+    #   crawl_url or crawl_site. Note that the urls passed here will NOT update
+    #   if they happen to redirect when crawled. If in doubt, pass the url(s)
+    #   directly to the crawl_* method instead of to the new method.
     def urls=(urls)
       @urls = []
       Wgit::Utils.each(urls) { |url| add_url(url) }
@@ -47,7 +54,10 @@ module Wgit
     # Sets this Crawler's @urls.
     #
-    # @param urls [*Wgit::Url] The URLs to crawl.
+    # @param urls [*Wgit::Url] The URL's to crawl in the future using either
+    #   crawl_url or crawl_site. Note that the urls passed here will NOT update
+    #   if they happen to redirect when crawled. If in doubt, pass the url(s)
+    #   directly to the crawl_* method instead of to the new method.
     def [](*urls)
       # If urls is nil then add_url (when called later) will set @urls = []
       # so we do nothing here.
@@ -68,12 +78,18 @@ module Wgit
     # Adds the url to this Crawler's @urls.
     #
-    # @param url [Wgit::Url] A URL to crawl.
+    # @param url [Wgit::Url] A URL to crawl later by calling a crawl_* method.
+    #   Note that the url added here will NOT update if it happens to
+    #   redirect when crawled. If in doubt, pass the url directly to the
+    #   crawl_* method instead of to the new method.
     def <<(url)
       add_url(url)
     end
-    # Crawls individual urls, not entire sites.
+    # Crawls one or more individual urls using Wgit::Crawler#crawl_url
+    # underneath. See Wgit::Crawler#crawl_site for crawling entire sites. Note
+    # that any external redirects are followed. Use Wgit::Crawler#crawl_url if
+    # this isn't desirable.
     #
     # @param urls [Array<Wgit::Url>] The URLs to crawl.
     # @yield [Wgit::Document] If provided, the block is given each crawled
@@ -88,68 +104,100 @@ module Wgit
       doc ? doc : @docs.last
     end
-    # Crawl the url and return the response document or nil.
+    # Crawl the url returning the response Wgit::Document or nil if an error
+    # occurs.
     #
-    # @param url [Wgit::Document] The URL to crawl.
+    # @param url [Wgit::Url] The URL to crawl.
     # @param follow_external_redirects [Boolean] Whether or not to follow
-    #   external redirects. False will return nil for such a crawl.
+    #   an external redirect. False will return nil for such a crawl. If false,
+    #   you must also provide a `host:` parameter.
+    # @param host [Wgit::Url, String] Specify the host by which
+    #   an absolute redirect is determined to be internal or not. Must be
+    #   absolute and contain a protocol prefix. For example, a `host:` of
+    #   'http://www.example.com' will only allow redirects for Urls with a
+    #   `to_host` value of 'www.example.com'.
     # @yield [Wgit::Document] The crawled HTML Document regardless if the
     #   crawl was successful or not. Therefore, the Document#url can be used.
     # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
     #   crawl was unsuccessful.
-    def crawl_url(url = @urls.first, follow_external_redirects: true)
+    def crawl_url(
+        url = @urls.first,
+        follow_external_redirects: true,
+        host: nil
+      )
       assert_type(url, Wgit::Url)
-      markup = fetch(url, follow_external_redirects: follow_external_redirects)
+      if !follow_external_redirects and host.nil?
+        raise 'host cannot be nil if follow_external_redirects is false'
+      end
+      html = fetch(
+        url,
+        follow_external_redirects: follow_external_redirects,
+        host: host
+      )
       url.crawled = true
-      doc = Wgit::Document.new(url, markup)
+      doc = Wgit::Document.new(url, html)
       yield(doc) if block_given?
       doc.empty? ? nil : doc
     end
     # Crawls an entire website's HTML pages by recursively going through
-    # its internal links. Each crawled web Document is yielded to a block.
+    # its internal links. Each crawled Document is yielded to a block.
     #
-    # @param base_url [Wgit::Url] The base URL of the website to be crawled.
+    # Only redirects to the same host are followed. For example, the Url
+    # 'http://www.example.co.uk/how' has a host of 'www.example.co.uk' meaning
+    # a link which redirects to 'https://ftp.example.co.uk' or
+    # 'https://www.example.com' will not be followed. The only exception to
+    # this is the initially crawled url which is allowed to redirect anywhere;
+    # its host is then used for other link redirections on the site, as
+    # described above.
+    #
+    # @param url [Wgit::Url] The base URL of the website to be crawled.
+    #   It is recommended that this URL be the index page of the site to give a
+    #   greater chance of finding all pages within that site/host.
     # @yield [Wgit::Document] Given each crawled Document/page of the site.
     #   A block is the only way to interact with each crawled Document.
     # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
-    #   from all of the site's pages or nil if the base_url could not be
+    #   from all of the site's pages or nil if the url could not be
     #   crawled successfully.
-    def crawl_site(base_url = @urls.first, &block)
-      assert_type(base_url, Wgit::Url)
+    def crawl_site(url = @urls.first, &block)
+      assert_type(url, Wgit::Url)
-      doc = crawl_url(base_url, follow_external_redirects: false, &block)
+      doc = crawl_url(url, &block)
       return nil if doc.nil?
-      path = base_url.path.nil? ? '/' : base_url.path
-      crawled_urls  = [path]
-      external_urls = doc.external_links
-      internal_urls = get_internal_links(doc)
+      host      = url.to_base
+      alt_url   = url.end_with?('/') ? url.chop : url + '/'
+      crawled   = [url, alt_url]
+      externals = doc.external_links
+      internals = get_internal_links(doc)
-      return doc.external_links.uniq if internal_urls.empty?
+      return doc.external_links.uniq if internals.empty?
       loop do
-        internal_urls.uniq!
+        crawled.uniq!
+        internals.uniq!
-        links = internal_urls - crawled_urls
+        links = internals - crawled
         break if links.empty?
         links.each do |link|
+          orig_link = link.dup
           doc = crawl_url(
-            Wgit::Url.concat(base_url.to_base, link),
-            follow_external_redirects: false,
-            &block
+            link, follow_external_redirects: false, host: host, &block
           )
-          crawled_urls << link
+          crawled.push(orig_link, link) # Push both in case of redirects.
           next if doc.nil?
-          internal_urls.concat(get_internal_links(doc))
-          external_urls.concat(doc.external_links)
+          internals.concat(get_internal_links(doc))
+          externals.concat(doc.external_links)
         end
       end
-      external_urls.uniq
+      externals.uniq
     end
   private
@@ -168,8 +216,13 @@ module Wgit
     # The fetch method performs a HTTP GET to obtain the HTML document.
     # Invalid urls or any HTTP response that doesn't return a HTML body will be
     # ignored and nil will be returned. Otherwise, the HTML is returned.
-    def fetch(url, follow_external_redirects: true)
-      response = resolve(url, follow_external_redirects: follow_external_redirects)
+    # External redirects are followed by default but can be disabled.
+    def fetch(url, follow_external_redirects: true, host: nil)
+      response = resolve(
+        url,
+        follow_external_redirects: follow_external_redirects,
+        host: host
+      )
       @last_response = response
       response.body.empty? ? nil : response.body
     rescue Exception => ex
@@ -183,28 +236,35 @@ module Wgit
     # The resolve method performs a HTTP GET to obtain the HTML document.
     # A certain amount of redirects will be followed by default before raising
     # an exception. Redirects can be disabled by setting `redirect_limit: 0`.
+    # External redirects are followed by default but can be disabled.
     # The Net::HTTPResponse will be returned.
     def resolve(
         url,
         redirect_limit: Wgit::Crawler.default_redirect_limit,
-        follow_external_redirects: true
+        follow_external_redirects: true,
+        host: nil
       )
-      raise 'url must respond to :normalise' unless url.respond_to?(:normalise)
-      redirect_count = -1
+      raise 'url must respond to :to_uri' unless url.respond_to?(:to_uri)
+      redirect_count = 0
       begin
-        raise 'Too many redirects' if redirect_count >= redirect_limit
-        redirect_count += 1
         response = Net::HTTP.get_response(url.to_uri)
         location = Wgit::Url.new(response.fetch('location', ''))
+        yield(url, response, location) if block_given?
         if not location.empty?
-          if !follow_external_redirects and !location.is_relative?
-            raise 'External redirect encountered but not allowed'
+          if  !follow_external_redirects and
+              !location.is_relative?(host: host)
+            raise "External redirect not allowed - Redirected to: \
+'#{location}', which is outside of host: '#{host}'"
           end
-          url = location.is_relative? ? url.to_base.concat(location) : location
+          raise 'Too many redirects' if redirect_count >= redirect_limit
+          redirect_count += 1
+          location = url.to_base.concat(location) if location.is_relative?
+          url.replace(location)
         end
       end while response.is_a?(Net::HTTPRedirection)
@@ -217,10 +277,13 @@ module Wgit
       @urls << Wgit::Url.new(url)
     end
-    # Pull out the doc's internal HTML page links for crawling.
+    # Returns doc's internal HTML page links in absolute form for crawling.
+    # We remove anchors because they are client side and don't change the
+    # resulting page's HTML; unlike query strings for example, which do.
     def get_internal_links(doc)
-      doc.
-        internal_links_without_anchors.
+      doc.internal_full_links.
+        map(&:without_anchor).
+        uniq.
         reject do |link|
           ext = link.to_extension
           ext ? !['htm', 'html'].include?(ext) : false

data/lib/wgit/document.rb CHANGED Viewed

@@ -126,6 +126,38 @@ module Wgit
       @url.date_crawled
     end
+    # Returns the base URL of this Wgit::Document. The base URL is either the
+    # <base> element's href value or @url (if @base is nil). If @base is
+    # present and relative, then @url.to_base + @base is returned. This method
+    # should be used instead of `doc.url.to_base` etc. if manually building
+    # absolute links.
+    #
+    # Provide the `link:` parameter to get the correct base URL for that type
+    # of link. For example, a link of `#top` would always return @url because
+    # it applies to that page, not a different one. Query strings work in the
+    # same way. Use this parameter if manually concatting links e.g.
+    # `absolute_link = doc.base_url(link: link).concat(link)` etc.
+    #
+    # @param link [Wgit::Url] The link to obtain the correct base URL for.
+    # @return [Wgit::Url] The base URL of this Document e.g.
+    #   'http://example.com/public'.
+    def base_url(link: nil)
+      get_base = -> { @base.is_relative? ? @url.to_base.concat(@base) : @base }
+      if link
+        assert_type(link, Wgit::Url)
+        raise "link must be relative: #{link}" unless link.is_relative?
+        if link.is_anchor? or link.is_query_string?
+          base_url = @base ? get_base.call : @url
+          return base_url.without_anchor.without_query_string
+        end
+      end
+      base_url = @base ? get_base.call : @url.base
+      base_url.without_anchor.without_query_string
+    end
     # Returns a Hash containing this Document's instance vars.
     # Used when storing the Document in a Database e.g. MongoDB etc.
     # By default the @html var is excluded from the returned Hash.
@@ -209,23 +241,19 @@ module Wgit
     end
     # Get all the internal links of this Document in relative form. Internal
-    # meaning a link to another document on this domain. This Document's domain
+    # meaning a link to another document on the same host. This Document's host
     # is used to determine if an absolute URL is actually a relative link e.g.
-    # For a Document representing http://server.com/about, an absolute link of
-    # <a href='http://server.com/search'> will be recognized and returned as an
-    # internal link because both Documents live on the same domain. Also see
-    # Wgit::Document#internal_full_links.
+    # For a Document representing http://www.server.com/about, an absolute link
+    # of <a href='http://www.server.com/search'> will be recognized and
+    # returned as an internal link because both Documents live on the same
+    # host. Also see Wgit::Document#internal_full_links.
     #
     # @return [Array<Wgit::Url>] self's internal/relative URL's.
     def internal_links
       return [] if @links.empty?
       links = @links.
-        reject do |link|
-          not link.relative_link?(base: @url.to_base)
-        rescue
-          true
-        end.
+        reject { |link| !link.is_relative?(host: @url.to_base) }.
         map(&:without_base).
         map do |link| # We map @url.to_host into / because it's a duplicate.
           link.to_host == @url.to_host ? Wgit::Url.new('/') : link
@@ -234,19 +262,6 @@ module Wgit
       Wgit::Utils.process_arr(links)
     end
-    # Get all the internal links of this Document with their anchors removed
-    # (if present). Also see Wgit::Document#internal_links.
-    #
-    # @return [Array<Wgit::Url>] self's internal/relative URL's with their
-    #   anchors removed.
-    def internal_links_without_anchors
-      in_links = internal_links
-      return [] if in_links.empty?
-      in_links.
-        map(&:without_anchor).
-        reject(&:empty?)
-    end
     # Get all the internal links of this Document and append them to this
     # Document's base URL making them absolute. Also see
     # Wgit::Document#internal_links.
@@ -254,24 +269,20 @@ module Wgit
     # @return [Array<Wgit::Url>] self's internal/relative URL's in absolute
     #   form.
     def internal_full_links
-      in_links = internal_links
-      return [] if in_links.empty?
-      in_links.map { |link| @url.to_base.concat(link) }
+      links = internal_links
+      return [] if links.empty?
+      links.map { |link| base_url(link: link).concat(link) }
     end
     # Get all the external links of this Document. External meaning a link to
-    # a different domain.
+    # a different host.
     #
     # @return [Array<Wgit::Url>] self's external/absolute URL's.
     def external_links
       return [] if @links.empty?
       links = @links.
-        reject do |link|
-          link.relative_link?(base: @url.to_base)
-        rescue
-          true
-        end.
+        reject { |link| link.relative_link?(host: @url.to_base) }.
         map(&:without_trailing_slash)
       Wgit::Utils.process_arr(links)
@@ -506,6 +517,9 @@ module Wgit
     alias :relative_urls :internal_links
     alias :relative_full_links :internal_full_links
     alias :relative_full_urls :internal_full_links
+    alias :internal_absolute_links :internal_full_links
+    alias :relative_absolute_links :internal_full_links
+    alias :relative_absolute_urls :internal_full_links
     alias :external_urls :external_links
   end
 end

data/lib/wgit/document_extensions.rb CHANGED Viewed

@@ -1,5 +1,15 @@
 ### Default Document Extensions ###
+# Base.
+Wgit::Document.define_extension(
+  :base,
+  '//base/@href',
+  singleton: true,
+  text_content_only: true,
+) do |base|
+  base = Wgit::Url.new(base) if base
+end
 # Title.
 Wgit::Document.define_extension(
   :title,
@@ -37,15 +47,7 @@ Wgit::Document.define_extension(
   singleton: false,
   text_content_only: true,
 ) do |links|
-  if links
-    links.map! do |link|
-      Wgit::Url.new(link)
-    rescue
-      nil
-    end
-    links.compact!
-  end
-  links
+  links.map! { |link| Wgit::Url.new(link) } if links
 end
 # Text.

data/lib/wgit/indexer.rb CHANGED Viewed

@@ -219,7 +219,7 @@ site: #{url}")
     #   manipulation. Return nil or false from the block to prevent the
     #   document from being saved into the database.
     def index_this_page(url, insert_externals = true)
-      doc = @crawler.crawl_page(url) do |doc|
+      document = @crawler.crawl_page(url) do |doc|
         result = true
         if block_given?
           result = yield(doc)
@@ -236,7 +236,7 @@ site: #{url}")
       @db.url?(url) ? @db.update(url) : @db.insert(url)
       if insert_externals
-        ext_urls = doc.external_links
+        ext_urls = document.external_links
         write_urls_to_db(ext_urls)
         Wgit.logger.info("Found and saved #{ext_urls.length} external url(s)")
       end

data/lib/wgit/logger.rb CHANGED Viewed

@@ -23,7 +23,11 @@ module Wgit
   # Returns the default Logger instance.
   # @return [Logger] The default Logger instance.
   def self.default_logger
-    Logger.new(STDOUT, progname: 'wgit', level: :info)
+    logger = Logger.new(STDOUT, progname: 'wgit', level: :info)
+    logger.formatter = proc do |severity, datetime, progname, msg|
+      "[#{progname}] #{msg}\n"
+    end
+    logger
   end
   # Sets the default Logger instance to be used by Wgit.

data/lib/wgit/url.rb CHANGED Viewed

@@ -117,30 +117,54 @@ module Wgit
       Wgit::Url.new(host + separator + link)
     end
+    # Overrides String#replace setting the new_url @uri and String value.
+    #
+    # @param new_url [Wgit::Url, String] The new URL value.
+    # @return [String] The new URL value once set.
+    def replace(new_url)
+      @uri = Addressable::URI.parse(new_url)
+      super(new_url)
+    end
     # Returns true if self is a relative Url.
     #
     # All external links in a page are expected to have a protocol prefix e.g.
     # "http://", otherwise the link is treated as an internal link (regardless
-    # of whether it's valid or not). The only exception is if base is provided
-    # and self is a page within that site; then the link is relative.
+    # of whether it's valid or not). The only exception is if host or domain is
+    # provided and self is a page belonging to that host/domain; then the link
+    # is relative.
     #
-    # @param base [Wgit::Url, String] The Url base e.g. http://www.google.com.
+    # @param host [Wgit::Url, String] The Url host e.g.
+    #   http://www.google.com/how which gives a host of www.google.com.
+    #   The host must be absolute and prefixed with a protocol.
+    # @param domain [Wgit::Url, String] The Url domain e.g.
+    #   http://www.google.com/how which gives a domain of google.com. The
+    #   domain must be absolute and prefixed with a protocol.
     # @return [Boolean] True if relative, false if absolute.
     # @raise [RuntimeError] If self is invalid e.g. empty.
-    def is_relative?(base: nil)
+    def is_relative?(host: nil, domain: nil)
       raise "Invalid link: #{self}" if nil? or empty?
+      raise "Provide host or domain, not both" if host and domain
+      if host
+        host = Wgit::Url.new(host)
+        if host.to_base.nil?
+          raise "Invalid host, must be absolute and contain protocol: #{host}"
+        end
+      end
-      if base
-        base = Wgit::Url.new(base)
-        if base.to_scheme.nil?
-          raise "Invalid base, must contain protocol prefix: #{base}"
+      if domain
+        domain = Wgit::Url.new(domain)
+        if domain.to_base.nil?
+          raise "Invalid domain, must be absolute and contain protocol: #{domain}"
         end
       end
       if @uri.relative?
         true
       else
-        base ? to_host == base.to_host : false
+        return host   ? to_host   == host.to_host     : false if host
+        return domain ? to_domain == domain.to_domain : false if domain
       end
     end
@@ -207,6 +231,15 @@ module Wgit
       host ? Wgit::Url.new(host) : nil
     end
+    # Returns a new Wgit::Url containing just the domain of this URL e.g.
+    # Given http://www.google.co.uk/about.html, google.co.uk is returned.
+    #
+    # @return [Wgit::Url, nil] Containing just the domain or nil.
+    def to_domain
+      domain = @uri.domain
+      domain ? Wgit::Url.new(domain) : nil
+    end
     # Returns only the base of this URL e.g. the protocol and host combined.
     #
     # @return [Wgit::Url, nil] Base of self e.g. http://www.google.co.uk or nil.
@@ -226,9 +259,7 @@ module Wgit
       path = @uri.path
       return nil if path.nil? or path.empty?
       return Wgit::Url.new('/') if path == '/'
-      Wgit::Url.new(path).
-        without_leading_slash.
-        without_trailing_slash
+      Wgit::Url.new(path).without_slashes
     end
     # Returns the endpoint of this URL e.g. the bit after the host with any
@@ -245,12 +276,12 @@ module Wgit
     end
     # Returns a new Wgit::Url containing just the query string of this URL
-    # e.g. Given http://google.com?q=ruby, 'ruby' is returned.
+    # e.g. Given http://google.com?q=ruby, '?q=ruby' is returned.
     #
     # @return [Wgit::Url, nil] Containing just the query string or nil.
     def to_query_string
       query = @uri.query
-      query ? Wgit::Url.new(query) : nil
+      query ? Wgit::Url.new("?#{query}") : nil
     end
     # Returns a new Wgit::Url containing just the anchor string of this URL
@@ -313,9 +344,21 @@ module Wgit
       without_base = base_url ? gsub(base_url, '') : self
       return self if ['', '/'].include?(without_base)
-      Wgit::Url.new(without_base).
-        without_leading_slash.
-        without_trailing_slash
+      Wgit::Url.new(without_base).without_slashes
+    end
+    # Returns a new Wgit::Url with the query string portion removed e.g. Given
+    # http://google.com/search?q=hello, http://google.com/search is
+    # returned. Self is returned as is if no query string is present. A URL
+    # consisting of only a query string e.g. '?q=hello' will return an empty
+    # URL.
+    #
+    # @return [Wgit::Url] Self with the query string portion removed.
+    def without_query_string
+      query = to_query_string
+      without_query_string = query ? gsub(query, '') : self
+      Wgit::Url.new(without_query_string)
     end
     # Returns a new Wgit::Url with the anchor portion removed e.g. Given
@@ -333,6 +376,20 @@ module Wgit
       Wgit::Url.new(without_anchor)
     end
+    # Returns true if self is a URL query string e.g. ?q=hello etc.
+    #
+    # @return [Boolean] True if self is a query string, false otherwise.
+    def is_query_string?
+      start_with?('?')
+    end
+    # Returns true if self is a URL anchor/fragment e.g. #top etc.
+    #
+    # @return [Boolean] True if self is an anchor/fragment, false otherwise.
+    def is_anchor?
+      start_with?('#')
+    end
     # Returns a Hash containing this Url's instance vars excluding @uri.
     # Used when storing the URL in a Database e.g. MongoDB etc.
     #
@@ -349,6 +406,7 @@ module Wgit
     alias :to_protocol :to_scheme
     alias :protocol :to_scheme
     alias :host :to_host
+    alias :domain :to_domain
     alias :base :to_base
     alias :path :to_path
     alias :endpoint :to_endpoint
@@ -358,10 +416,14 @@ module Wgit
     alias :to_fragment :to_anchor
     alias :fragment :to_anchor
     alias :extension :to_extension
+    alias :without_query :without_query_string
     alias :without_fragment :without_anchor
+    alias :is_query? :is_query_string?
+    alias :is_fragment? :is_anchor?
     alias :relative_link? :is_relative?
     alias :internal_link? :is_relative?
     alias :is_internal? :is_relative?
+    alias :relative? :is_relative?
     alias :crawled? :crawled
     alias :normalize :normalise
   end

data/lib/wgit/version.rb CHANGED Viewed

@@ -3,5 +3,5 @@
 # @author Michael Telford
 module Wgit
   # The current gem version of Wgit.
-  VERSION = "0.0.16".freeze
+  VERSION = "0.0.17".freeze
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wgit
 version: !ruby/object:Gem::Version
-  version: 0.0.16
+  version: 0.0.17
 platform: ruby
 authors:
 - Michael Telford
@@ -128,20 +128,6 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '3.6'
-- !ruby/object:Gem::Dependency
-  name: rack
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '2.0'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '2.0'
 - !ruby/object:Gem::Dependency
   name: addressable
   requirement: !ruby/object:Gem::Requirement