wgit 0.8.0 → 0.10.2
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/CHANGELOG.md +68 -2
- data/LICENSE.txt +1 -1
- data/README.md +114 -326
- data/bin/wgit +9 -5
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +39 -0
- data/lib/wgit/crawler.rb +206 -76
- data/lib/wgit/database/database.rb +309 -134
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +145 -95
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +11 -11
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +66 -163
- data/lib/wgit/response.rb +5 -2
- data/lib/wgit/url.rb +177 -63
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +2 -1
- data/lib/wgit.rb +3 -1
- metadata +34 -19
data/lib/wgit/crawler.rb
CHANGED
@@ -6,12 +6,14 @@ require_relative 'utils'
 require_relative 'assertable'
 require_relative 'response'
 require 'set'
+require 'benchmark'
 require 'typhoeus'
+require 'ferrum'
 
 module Wgit
-  # The Crawler class provides a means of crawling web based HTTP Wgit::Url
-  # serialising their HTML into Wgit::Document instances. This is the
-  # class
+  # The Crawler class provides a means of crawling web based HTTP `Wgit::Url`s,
+  # and serialising their HTML into `Wgit::Document` instances. This is the
+  # only Wgit class containing network logic (HTTP request/response handling).
   class Crawler
     include Assertable
 
@@ -38,12 +40,21 @@ module Wgit
 
     # The maximum amount of time (in seconds) a crawl request has to complete
     # before raising an error. Set to 0 to disable time outs completely.
-    attr_accessor :
+    attr_accessor :timeout
 
     # Whether or not to UTF-8 encode the response body once crawled. Set to
     # false if crawling more than just HTML e.g. images.
     attr_accessor :encode
 
+    # Whether or not to parse the Javascript of the crawled document.
+    # Parsing requires Chrome/Chromium to be installed and in $PATH.
+    attr_accessor :parse_javascript
+
+    # The delay between checks in a page's HTML size. When the page has stopped
+    # "growing", the Javascript has finished dynamically updating the DOM.
+    # The value should balance between a good UX and enough JS parse time.
+    attr_accessor :parse_javascript_delay
+
     # The Wgit::Response of the most recently crawled URL.
     attr_reader :last_response
 
@@ -51,20 +62,27 @@ module Wgit
     #
     # @param redirect_limit [Integer] The amount of allowed redirects before
     # raising an error. Set to 0 to disable redirects completely.
-    # @param
+    # @param timeout [Integer, Float] The maximum amount of time (in seconds)
     # a crawl request has to complete before raising an error. Set to 0 to
     # disable time outs completely.
     # @param encode [Boolean] Whether or not to UTF-8 encode the response body
     # once crawled. Set to false if crawling more than just HTML e.g. images.
-
-
-
-
+    # @param parse_javascript [Boolean] Whether or not to parse the Javascript
+    # of the crawled document. Parsing requires Chrome/Chromium to be
+    # installed and in $PATH.
+    def initialize(redirect_limit: 5, timeout: 5, encode: true,
+                   parse_javascript: false, parse_javascript_delay: 1)
+      @redirect_limit = redirect_limit
+      @timeout = timeout
+      @encode = encode
+      @parse_javascript = parse_javascript
+      @parse_javascript_delay = parse_javascript_delay
     end
 
     # Crawls an entire website's HTML pages by recursively going through
-    # its internal `<a>` links
-    #
+    # its internal `<a>` links; this can be overridden with `follow: xpath`.
+    # Each crawled Document is yielded to a block. Use `doc.empty?` to
+    # determine if the crawled link was successful / is valid.
     #
     # Use the allow and disallow paths params to partially and selectively
     # crawl a site; the glob syntax is fully supported e.g. `'wiki/\*'` etc.
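For reference, a minimal usage sketch of the constructor options shown above; the values and URL are illustrative only, and JS parsing assumes Chrome/Chromium is installed and on $PATH:

require 'wgit'

# parse_javascript and parse_javascript_delay come from the initialize
# signature above; 2 seconds is an arbitrary example delay.
crawler = Wgit::Crawler.new(parse_javascript: true, parse_javascript_delay: 2)
crawler.timeout = 10 # Seconds, via the attr_accessor added in this release.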
@@ -82,26 +100,36 @@ module Wgit
     # @param url [Wgit::Url] The base URL of the website to be crawled.
     # It is recommended that this URL be the index page of the site to give a
     # greater chance of finding all pages within that site/host.
-    # @param
-    #
-    #
-    #
+    # @param follow [String] The xpath extracting links to be followed during
+    # the crawl. This changes how a site is crawled. Only links pointing to
+    # the site domain are allowed. The `:default` is any `<a>` href returning
+    # HTML.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    # selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    # by rejecting them if their path `File.fnmatch?` one of disallow_paths.
     # @yield [doc] Given each crawled page (Wgit::Document) of the site.
     # A block is the only way to interact with each crawled Document.
     # Use `doc.empty?` to determine if the page is valid.
     # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
     # from all of the site's pages or nil if the given url could not be
     # crawled successfully.
-    def crawl_site(
+    def crawl_site(
+      url, follow: :default, allow_paths: nil, disallow_paths: nil, &block
+    )
       doc = crawl_url(url, &block)
       return nil if doc.nil?
 
-
+      link_opts = {
+        xpath: follow,
+        allow_paths: allow_paths,
+        disallow_paths: disallow_paths
+      }
       alt_url = url.end_with?('/') ? url.chop : url + '/'
 
       crawled = Set.new([url, alt_url])
       externals = Set.new(doc.external_links)
-      internals = Set.new(
+      internals = Set.new(next_internal_links(doc, **link_opts))
 
       return externals.to_a if internals.empty?
 
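A hedged sketch of the reworked crawl_site call documented above; the site URL and glob are hypothetical:

require 'wgit'

crawler = Wgit::Crawler.new
url     = Wgit::Url.new('https://en.wikipedia.org/wiki/Ruby')

# Only crawl pages whose path matches the 'wiki/*' glob; each page is
# yielded as a Wgit::Document and the external links are returned.
externals = crawler.crawl_site(url, allow_paths: 'wiki/*') do |doc|
  puts doc.url unless doc.empty?
end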
@@ -116,7 +144,7 @@ module Wgit
           crawled += [orig_link, link] # Push both links in case of redirects.
           next if doc.nil?
 
-          internals +=
+          internals += next_internal_links(doc, **link_opts)
           externals += doc.external_links
         end
       end
@@ -131,10 +159,11 @@ module Wgit
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     # redirects. Pass a Symbol to limit where the redirect is allowed to go
     # e.g. :host only allows redirects within the same host. Choose from
-    # :
+    # :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # This value will be used for all urls crawled.
     # @yield [doc] Given each crawled page (Wgit::Document); this is the only
-    # way to interact with them.
+    # way to interact with them. Use `doc.empty?` to determine if the page
+    # is valid.
     # @raise [StandardError] If no urls are provided.
     # @return [Wgit::Document] The last Document crawled.
     def crawl_urls(*urls, follow_redirects: true, &block)
@@ -143,7 +172,7 @@ module Wgit
       opts = { follow_redirects: follow_redirects }
       doc = nil
 
-      Wgit::Utils.each(urls) { |url| doc = crawl_url(url, opts, &block) }
+      Wgit::Utils.each(urls) { |url| doc = crawl_url(url, **opts, &block) }
 
       doc
     end
@@ -151,13 +180,15 @@ module Wgit
     # Crawl the url returning the response Wgit::Document or nil, if an error
     # occurs.
     #
-    # @param url [Wgit::Url] The Url to crawl; which will
+    # @param url [Wgit::Url] The Url to crawl; which will be modified in the
+    # event of a redirect.
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     # redirects. Pass a Symbol to limit where the redirect is allowed to go
     # e.g. :host only allows redirects within the same host. Choose from
-    # :
+    # :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @yield [doc] The crawled HTML page (Wgit::Document) regardless if the
     # crawl was successful or not. Therefore, Document#url etc. can be used.
+    # Use `doc.empty?` to determine if the page is valid.
     # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
     # crawl was unsuccessful.
     def crawl_url(url, follow_redirects: true)
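A small sketch of the follow_redirects: Symbol form described above (the URL is made up); a redirect that leaves the host raises internally and the crawl is reported as unsuccessful:

crawler = Wgit::Crawler.new

# Only follow redirects that stay within the same host.
doc = crawler.crawl_url(Wgit::Url.new('http://example.com'), follow_redirects: :host)
puts doc.title unless doc.nil? || doc.empty?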
@@ -175,16 +206,19 @@ module Wgit
 
     protected
 
-    # Returns the
+    # Returns the URL's HTML String or nil. Handles any errors that arise
     # and sets the @last_response. Errors or any HTTP response that doesn't
     # return a HTML body will be ignored, returning nil.
     #
+    # If @parse_javascript is true, then the final resolved URL will be browsed
+    # to and Javascript parsed allowing for dynamic HTML generation.
+    #
     # @param url [Wgit::Url] The URL to fetch. This Url object is passed by
     # reference and gets modified as a result of the fetch/crawl.
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     # redirects. Pass a Symbol to limit where the redirect is allowed to go
     # e.g. :host only allows redirects within the same host. Choose from
-    # :
+    # :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @raise [StandardError] If url isn't valid and absolute.
     # @return [String, nil] The crawled HTML or nil if the crawl was
     # unsuccessful.
@@ -193,6 +227,8 @@ module Wgit
       raise "Invalid url: #{url}" if url.invalid?
 
       resolve(url, response, follow_redirects: follow_redirects)
+      get_browser_response(url, response) if @parse_javascript
+
       response.body_or_nil
     rescue StandardError => e
       Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e}")
@@ -214,14 +250,14 @@ module Wgit
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     # redirects. Pass a Symbol to limit where the redirect is allowed to go
     # e.g. :host only allows redirects within the same host. Choose from
-    # :
+    # :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @raise [StandardError] If a redirect isn't allowed etc.
     def resolve(url, response, follow_redirects: true)
-
+      origin = url.to_url.to_origin # Recorded before any redirects.
      follow_redirects, within = redirect?(follow_redirects)
 
       loop do
-
+        get_http_response(url, response)
         break unless response.redirect?
 
         # Handle response 'Location' header.
@@ -233,7 +269,7 @@ module Wgit
         # Validate if the redirect is allowed.
         raise "Redirect not allowed: #{location}" unless follow_redirects
 
-        if within && !location.relative?(within =>
+        if within && !location.relative?(within => origin)
           raise "Redirect (outside of #{within}) is not allowed: '#{location}'"
         end
 
@@ -241,7 +277,7 @@ module Wgit
         if response.redirect_count >= @redirect_limit
 
         # Process the location to be crawled next.
-        location = url.
+        location = url.to_origin.concat(location) if location.relative?
         response.redirections[url.to_s] = location.to_s
         url.replace(location) # Update the url on redirect.
       end
@@ -254,7 +290,7 @@ module Wgit
     # reference.
     # @raise [StandardError] If a response can't be obtained.
     # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
-    def
+    def get_http_response(url, response)
       # Perform a HTTP GET request.
       orig_url = url.to_s
       url = url.normalize if url.respond_to?(:normalize)
@@ -271,10 +307,40 @@ module Wgit
       response.add_total_time(http_response.total_time)
 
       # Log the request/response details.
-
+      log_net(:http, response, http_response.total_time)
+
+      # Handle a failed response.
+      raise "No response (within timeout: #{@timeout} second(s))" \
+      if response.failure?
+    end
+
+    # Makes a browser request and enriches the given Wgit::Response from it.
+    #
+    # @param url [String] The url to browse to. Will call url#normalize if
+    # possible.
+    # @param response [Wgit::Response] The response to enrich. Modifies by
+    # reference.
+    # @raise [StandardError] If a response can't be obtained.
+    # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
+    def get_browser_response(url, response)
+      url = url.normalize if url.respond_to?(:normalize)
+      browser = nil
+
+      crawl_time = Benchmark.measure { browser = browser_get(url) }.real
+      yield browser if block_given?
+
+      # Enrich the given Wgit::Response object (on top of Typhoeus response).
+      response.adapter_response = browser.network.response
+      response.status = browser.network.response.status
+      response.headers = browser.network.response.headers
+      response.body = browser.body
+      response.add_total_time(crawl_time)
+
+      # Log the request/response details.
+      log_net(:browser, response, crawl_time)
 
       # Handle a failed response.
-      raise "No response (within timeout: #{@
+      raise "No browser response (within timeout: #{@timeout} second(s))" \
       if response.failure?
     end
 
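Whichever client (HTTP or browser) produced it, the enriched response is exposed via Crawler#last_response (the attr_reader shown earlier); a quick illustration with a made-up URL:

crawler = Wgit::Crawler.new
crawler.crawl_url(Wgit::Url.new('http://example.com'))

response = crawler.last_response
puts response.status # HTTP status code, set during the crawl above.
puts response.size   # Response body size in bytes, as logged by log_net.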
@@ -285,7 +351,7 @@ module Wgit
     def http_get(url)
       opts = {
         followlocation: false,
-        timeout: @
+        timeout: @timeout,
         accept_encoding: 'gzip',
         headers: {
           'User-Agent' => "wgit/#{Wgit::VERSION}",
@@ -294,37 +360,58 @@ module Wgit
       }
 
       # See https://rubydoc.info/gems/typhoeus for more info.
-      Typhoeus.get(url, opts)
+      Typhoeus.get(url, **opts)
+    end
+
+    # Performs a HTTP GET request in a web browser and parses the response JS
+    # before returning the HTML body of the fully rendered webpage. This allows
+    # Javascript (SPA apps etc.) to generate HTML dynamically.
+    #
+    # @param url [String] The url to browse to.
+    # @return [Ferrum::Browser] The browser response object.
+    def browser_get(url)
+      @browser ||= Ferrum::Browser.new(timeout: @timeout, process_timeout: 10)
+      @browser.goto(url)
+
+      # Wait for the page's JS to finish dynamically manipulating the DOM.
+      html = @browser.body
+      loop do
+        sleep @parse_javascript_delay
+        break if html.size == @browser.body.size
+
+        html = @browser.body
+      end
+
+      @browser
     end
 
     # Returns a doc's internal HTML page links in absolute form; used when
-    # crawling a site.
-    #
-    # `'wiki/\*'` etc. Note that each path should NOT start with a slash.
+    # crawling a site. By default, any `<a>` href returning HTML is returned;
+    # override this with `xpath:` if desired.
     #
-    #
-    #
-    #
-    # files containing `<a>` links keep the crawl going beyond the base URL.
+    # Use the allow and disallow paths params to partially and selectively
+    # crawl a site; the glob syntax is supported e.g. `'wiki/\*'` etc. Note
+    # that each path should NOT start with a slash.
     #
     # @param doc [Wgit::Document] The document from which to extract it's
     # internal (absolute) page links.
+    # @param xpath [String] The xpath selecting links to be returned. Only
+    # links pointing to the doc.url domain are allowed. The :default is any
+    # <a> href returning HTML. The allow/disallow paths will be applied to
+    # the returned value.
     # @param allow_paths [String, Array<String>] Filters links by selecting
     # them if their path `File.fnmatch?` one of allow_paths.
     # @param disallow_paths [String, Array<String>] Filters links by rejecting
     # them if their path `File.fnmatch?` one of disallow_paths.
     # @return [Array<Wgit::Url>] The internal page links from doc.
-    def
-
-
-
-
-
-
-      Wgit::Crawler.supported_file_extensions.include?(ext.downcase) :
-      true # URLs without an extension are assumed HTML.
-    end
+    def next_internal_links(
+      doc, xpath: :default, allow_paths: nil, disallow_paths: nil
+    )
+      links = if xpath && xpath != :default
+                follow_xpath(doc, xpath)
+              else
+                follow_default(doc)
+              end
 
       return links if allow_paths.nil? && disallow_paths.nil?
 
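To steer a site crawl with the xpath behaviour above, the follow: param on crawl_site is the public entry point; a hedged sketch with a hypothetical selector (links outside the site's domain raise an error, as noted below):

crawler = Wgit::Crawler.new

# Follow only the <a> links inside a (hypothetical) navigation element.
crawler.crawl_site(Wgit::Url.new('http://example.com'), follow: "//nav//a/@href") do |doc|
  puts doc.url unless doc.empty?
end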
@@ -333,29 +420,40 @@ module Wgit
 
     private
 
-    # Returns
-    #
-
-
+    # Returns the next links used to continue crawling a site. The xpath value
+    # is used to obtain the links. Any valid URL Strings will be converted into
+    # absolute Wgit::Urls. Invalid URLs will be silently dropped. Any link not
+    # pointing to the site domain will raise an error.
+    def follow_xpath(doc, xpath)
+      links = doc.send(:extract_from_html, xpath, singleton: false) do |urls|
+        urls
+          .map { |url| Wgit::Url.parse?(url)&.make_absolute(doc) }
+          .compact
+      end
 
-
-      raise
-      #{follow_redirects}"
+      if links.any? { |link| link.to_domain != doc.url.to_domain }
+        raise 'The links to follow must be within the site domain'
       end
 
-
+      links
     end
 
-    #
-
-
-
-
-
-
-
-
+    # Returns the default set of links used to continue crawling a site.
+    # By default, any <a> href returning HTML and pointing to the same domain
+    # will get returned.
+    def follow_default(doc)
+      doc
+        .internal_absolute_links
+        .map(&:omit_fragment) # Because fragments don't alter content.
+        .uniq
+        .select do |link| # Whitelist only HTML content.
+          ext = link.to_extension
+          if ext
+            Wgit::Crawler.supported_file_extensions.include?(ext.downcase)
+          else
+            true # URLs without an extension are assumed HTML.
+          end
+        end
     end
 
     # Validate and filter by the given URL paths.
@@ -375,14 +473,17 @@ module Wgit
 
     # Validate the paths are suitable for filtering.
     def validate_paths(paths)
-      paths =
+      paths = *paths
       raise 'The provided paths must all be Strings' \
       unless paths.all? { |path| path.is_a?(String) }
 
-      Wgit::Utils.
+      Wgit::Utils.sanitize(paths, encode: false)
       raise 'The provided paths cannot be empty' if paths.empty?
 
-      paths
+      paths.map do |path|
+        path = Wgit::Url.parse(path)
+        path.index? ? path : path.omit_slashes
+      end
     end
 
     # Filters links by selecting/rejecting them based on their path.
@@ -390,7 +491,7 @@ module Wgit
     def filter_links(links, filter_method, paths)
       links.send(filter_method) do |link|
         # Turn http://example.com into / meaning index.
-        link = link.to_endpoint
+        link = link.to_endpoint.index? ? '/' : link.omit_base
 
         match = false
         paths.each do |pattern|
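The allow/disallow filtering above delegates to Ruby's File.fnmatch? glob matching against each link's path; a few illustrative checks and a hypothetical disallow example:

File.fnmatch?('wiki/*', 'wiki/Ruby') # => true
File.fnmatch?('wiki/*', 'about')     # => false

# Reject any login/* paths while crawling (site and paths are made up).
Wgit::Crawler.new.crawl_site(Wgit::Url.new('http://example.com'), disallow_paths: 'login/*') do |doc|
  # ...
end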
@@ -402,6 +503,35 @@ module Wgit
       end
     end
 
+    # Returns whether or not to follow redirects, and within what context e.g.
+    # :host, :domain etc.
+    def redirect?(follow_redirects)
+      return [true, follow_redirects] if follow_redirects.is_a?(Symbol)
+
+      unless [true, false].include?(follow_redirects)
+        raise "follow_redirects: must be a Boolean or Symbol, not: \
+#{follow_redirects}"
+      end
+
+      [follow_redirects, nil]
+    end
+
+    # Log (at debug level) the network request/response details.
+    def log_net(client, response, duration)
+      resp_template = "[#{client}] Response: %s (%s bytes in %s seconds)"
+      log_status = (response.status || 0)
+      log_total_time = (duration || 0.0).truncate(3)
+
+      # The browsers request URL is the same so ignore it.
+      if client.to_sym == :http
+        Wgit.logger.debug("[#{client}] Request: #{response.url}")
+      end
+
+      Wgit.logger.debug(
+        format(resp_template, log_status, response.size, log_total_time)
+      )
+    end
+
     alias crawl crawl_urls
     alias crawl_pages crawl_urls
     alias crawl_page crawl_url
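Finally, the redirect? helper above simply maps the public follow_redirects value onto a [follow, within] pair; shown via send purely for illustration, since it is a private method:

crawler = Wgit::Crawler.new
crawler.send(:redirect?, :host) # => [true, :host]  follow, but only within the host
crawler.send(:redirect?, true)  # => [true, nil]    follow anywhere
crawler.send(:redirect?, false) # => [false, nil]   don't follow at all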