wgit 0.10.8 → 0.12.0

data/lib/wgit/crawler.rb CHANGED
@@ -5,7 +5,6 @@ require_relative 'document'
  require_relative 'utils'
  require_relative 'assertable'
  require_relative 'response'
- require 'set'
  require 'benchmark'
  require 'typhoeus'
  require 'ferrum'
@@ -55,6 +54,11 @@ module Wgit
  # The value should balance between a good UX and enough JS parse time.
  attr_accessor :parse_javascript_delay

+ # The opts Hash passed directly to the ferrum Chrome browser when
+ # `parse_javascript: true`.
+ # See https://github.com/rubycdp/ferrum for details.
+ attr_accessor :ferrum_opts
+
  # The Wgit::Response of the most recently crawled URL.
  attr_reader :last_response

@@ -70,13 +74,31 @@ module Wgit
  # @param parse_javascript [Boolean] Whether or not to parse the Javascript
  # of the crawled document. Parsing requires Chrome/Chromium to be
  # installed and in $PATH.
+ # @param parse_javascript_delay [Integer] The delay time given to a page's
+ # JS to update the DOM. After the delay, the HTML is crawled.
  def initialize(redirect_limit: 5, timeout: 5, encode: true,
- parse_javascript: false, parse_javascript_delay: 1)
+ parse_javascript: false, parse_javascript_delay: 1,
+ ferrum_opts: {})
+ assert_type(redirect_limit, Integer)
+ assert_type(timeout, [Integer, Float])
+ assert_type(encode, [TrueClass, FalseClass])
+ assert_type(parse_javascript, [TrueClass, FalseClass])
+ assert_type(parse_javascript_delay, Integer)
+ assert_type(ferrum_opts, Hash)
+
  @redirect_limit = redirect_limit
  @timeout = timeout
  @encode = encode
  @parse_javascript = parse_javascript
  @parse_javascript_delay = parse_javascript_delay
+ @ferrum_opts = default_ferrum_opts.merge(ferrum_opts)
+ end
+
+ # Overrides String#inspect to shorten the printed output of a Crawler.
+ #
+ # @return [String] A short textual representation of this Crawler.
+ def inspect
+ "#<Wgit::Crawler timeout=#{@timeout} redirect_limit=#{@redirect_limit} encode=#{@encode} parse_javascript=#{@parse_javascript} parse_javascript_delay=#{@parse_javascript_delay} ferrum_opts=#{@ferrum_opts}>"
  end

  # Crawls an entire website's HTML pages by recursively going through
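For reference, a brief, hedged usage sketch of the new `ferrum_opts:` keyword and `#inspect` override added above; the option values are illustrative, not recommended defaults:

```ruby
require 'wgit'

# Pass custom options through to the Ferrum Chrome browser used when
# parse_javascript: is enabled. The Hash is merged over the crawler's
# default_ferrum_opts (timeout, process_timeout: 10, headless: true).
crawler = Wgit::Crawler.new(
  parse_javascript: true,
  parse_javascript_delay: 2,
  ferrum_opts: { headless: false, process_timeout: 30 }
)

puts crawler.inspect # Shortened summary via the new #inspect override.
```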
@@ -86,8 +108,6 @@ module Wgit
  #
  # Use the allow and disallow paths params to partially and selectively
  # crawl a site; the glob syntax is fully supported e.g. `'wiki/\*'` etc.
- # Note that each path must NOT start with a slash; the only exception being
- # a `/` on its own with no other characters, referring to the index page.
  #
  # Only redirects to the same host are followed. For example, the Url
  # 'http://www.example.co.uk/how' has a host of 'www.example.co.uk' meaning
@@ -104,6 +124,7 @@ module Wgit
  # the crawl. This changes how a site is crawled. Only links pointing to
  # the site domain are allowed. The `:default` is any `<a>` href returning
  # HTML.
+ # @param max_pages [Integer]
  # @param allow_paths [String, Array<String>] Filters the `follow:` links by
  # selecting them if their path `File.fnmatch?` one of allow_paths.
  # @param disallow_paths [String, Array<String>] Filters the `follow` links
@@ -115,40 +136,48 @@ module Wgit
  # from all of the site's pages or nil if the given url could not be
  # crawled successfully.
  def crawl_site(
- url, follow: :default, allow_paths: nil, disallow_paths: nil, &block
+ url, follow: :default, max_pages: nil,
+ allow_paths: nil, disallow_paths: nil, &block
  )
  doc = crawl_url(url, &block)
- return nil if doc.nil?
+ return nil if doc.empty?

- link_opts = {
- xpath: follow,
- allow_paths: allow_paths,
- disallow_paths: disallow_paths
- }
- alt_url = url.end_with?('/') ? url.chop : url + '/'
+ total_pages = 1
+ limit_reached = max_pages && total_pages >= max_pages
+ link_opts = { xpath: follow, allow_paths:, disallow_paths: }

- crawled = Set.new([url, alt_url])
+ crawled = Set.new(url.redirects_journey)
  externals = Set.new(doc.external_links)
  internals = Set.new(next_internal_links(doc, **link_opts))

  return externals.to_a if internals.empty?

  loop do
- links = internals - crawled
+ if limit_reached
+ Wgit.logger.debug("Crawled and reached the max_pages limit of: #{max_pages}")
+ break
+ end
+
+ links = subtract_links(internals, crawled)
  break if links.empty?

  links.each do |link|
- orig_link = link.dup
+ limit_reached = max_pages && total_pages >= max_pages
+ break if limit_reached
+
  doc = crawl_url(link, follow_redirects: :host, &block)

- crawled += [orig_link, link] # Push both links in case of redirects.
- next if doc.nil?
+ crawled += link.redirects_journey
+ next if doc.empty?

- internals += next_internal_links(doc, **link_opts)
- externals += doc.external_links
+ total_pages += 1
+ internals += next_internal_links(doc, **link_opts)
+ externals += doc.external_links
  end
  end

+ Wgit.logger.debug("Crawled #{total_pages} documents for the site: #{url}")
+
  externals.to_a
  end

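As a rough illustration of the new `max_pages:` keyword (the URL, limit and glob below are placeholders):

```ruby
require 'wgit'

crawler = Wgit::Crawler.new
url     = Wgit::Url.new('https://example.com')

# Stop after 50 crawled pages and only follow links whose path matches
# the 'blog/*' glob (allow_paths uses File.fnmatch? as documented above).
externals = crawler.crawl_site(url, max_pages: 50, allow_paths: 'blog/*') do |doc|
  puts doc.url unless doc.empty?
end

# crawl_site returns the external links collected from the crawled pages.
puts externals&.size
```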
@@ -169,7 +198,7 @@ module Wgit
  def crawl_urls(*urls, follow_redirects: true, &block)
  raise 'You must provide at least one Url' if urls.empty?

- opts = { follow_redirects: follow_redirects }
+ opts = { follow_redirects: }
  doc = nil

  Wgit::Utils.each(urls) { |url| doc = crawl_url(url, **opts, &block) }
@@ -189,19 +218,19 @@ module Wgit
  # @yield [doc] The crawled HTML page (Wgit::Document) regardless if the
  # crawl was successful or not. Therefore, Document#url etc. can be used.
  # Use `doc.empty?` to determine if the page is valid.
- # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
- # crawl was unsuccessful.
+ # @return [Wgit::Document] The crawled HTML Document. Check if the crawl
+ # was successful with doc.empty? (true if unsuccessful).
  def crawl_url(url, follow_redirects: true)
  # A String url isn't allowed because it's passed by value not reference,
  # meaning a redirect isn't reflected; A Wgit::Url is passed by reference.
  assert_type(url, Wgit::Url)

- html = fetch(url, follow_redirects: follow_redirects)
+ html = fetch(url, follow_redirects:)
  doc = Wgit::Document.new(url, html, encode: @encode)

  yield(doc) if block_given?

- doc.empty? ? nil : doc
+ doc
  end

  protected
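Because `crawl_url` now always returns a `Wgit::Document`, callers check `Document#empty?` rather than testing for nil; a minimal sketch (the URL is a placeholder):

```ruby
require 'wgit'

crawler = Wgit::Crawler.new
doc = crawler.crawl_url(Wgit::Url.new('https://example.com'))

if doc.empty?
  puts 'Crawl failed (no HTML returned)'
else
  puts doc.title # Extracted by the default :title extractor.
end
```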
@@ -226,7 +255,7 @@ module Wgit
  response = Wgit::Response.new
  raise "Invalid url: #{url}" if url.invalid?

- resolve(url, response, follow_redirects: follow_redirects)
+ resolve(url, response, follow_redirects:)
  get_browser_response(url, response) if @parse_javascript

  response.body_or_nil
@@ -238,6 +267,9 @@ module Wgit
  url.crawled = true # Sets date_crawled underneath.
  url.crawl_duration = response.total_time

+ # Don't override previous url.redirects if response is fully resolved.
+ url.redirects = response.redirects unless response.redirects.empty?
+
  @last_response = response
  end

@@ -253,7 +285,7 @@ module Wgit
  # :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
  # @raise [StandardError] If a redirect isn't allowed etc.
  def resolve(url, response, follow_redirects: true)
- origin = url.to_url.to_origin # Recorded before any redirects.
+ origin = url.to_origin # Record the origin before any redirects.
  follow_redirects, within = redirect?(follow_redirects)

  loop do
@@ -277,7 +309,7 @@ module Wgit
  if response.redirect_count >= @redirect_limit

  # Process the location to be crawled next.
- location = url.to_origin.concat(location) if location.relative?
+ location = url.to_origin.join(location) if location.relative?
  response.redirections[url.to_s] = location.to_s
  url.replace(location) # Update the url on redirect.
  end
@@ -370,7 +402,7 @@ module Wgit
  # @param url [String] The url to browse to.
  # @return [Ferrum::Browser] The browser response object.
  def browser_get(url)
- @browser ||= Ferrum::Browser.new(timeout: @timeout, process_timeout: 10)
+ @browser ||= Ferrum::Browser.new(**@ferrum_opts)
  @browser.goto(url)

  # Wait for the page's JS to finish dynamically manipulating the DOM.
@@ -420,6 +452,38 @@ module Wgit

  private

+ # The default opts which are merged with the user's ferrum_opts: and then
+ # passed directly to the ferrum Chrome browser.
+ def default_ferrum_opts
+ {
+ timeout: @timeout,
+ process_timeout: 10,
+ headless: true
+ }
+ end
+
+ # Manually does the following: `links = internals - crawled`.
+ # This is needed due to an apparent bug in Set<Url> (when upgrading from
+ # Ruby v3.0.2 to v3.3.0) causing an infinite crawl loop in #crawl_site.
+ # Run in a shell to test: bundle exec toys test infinite_crawl_loop
+ # TODO: Check in future Ruby versions and remove this method when fixed.
+ def subtract_links(internals, crawled)
+ links = Set.new
+
+ internals.each do |internal_url|
+ already_crawled = false
+
+ crawled.each do |crawled_url|
+ already_crawled = internal_url == crawled_url
+ break if already_crawled
+ end
+
+ links.add(internal_url) unless already_crawled
+ end
+
+ links
+ end
+
  # Returns the next links used to continue crawling a site. The xpath value
  # is used to obtain the links. Any valid URL Strings will be converted into
  # absolute Wgit::Urls. Invalid URLs will be silently dropped. Any link not
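A quick sketch of how the defaults above combine with a user supplied `ferrum_opts:` Hash (the values shown are illustrative):

```ruby
require 'wgit'

crawler = Wgit::Crawler.new(timeout: 10, ferrum_opts: { headless: false })

crawler.ferrum_opts
# => { timeout: 10, process_timeout: 10, headless: false }
```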
@@ -431,7 +495,8 @@ module Wgit
  .compact
  end

- if links.any? { |link| link.to_domain != doc.url.to_domain }
+ doc_domain = doc.url.to_domain
+ if links.any? { |link| link.to_domain != doc_domain }
  raise 'The links to follow must be within the site domain'
  end

@@ -458,12 +523,12 @@ module Wgit

  # Validate and filter by the given URL paths.
  def process_paths(links, allow_paths, disallow_paths)
- if allow_paths
+ if allow_paths && !allow_paths.empty?
  paths = validate_paths(allow_paths)
  filter_links(links, :select!, paths)
  end

- if disallow_paths
+ if disallow_paths && !disallow_paths.empty?
  paths = validate_paths(disallow_paths)
  filter_links(links, :reject!, paths)
  end
@@ -477,7 +542,7 @@ module Wgit
  raise 'The provided paths must all be Strings' \
  unless paths.all? { |path| path.is_a?(String) }

- Wgit::Utils.sanitize(paths, encode: false)
+ paths = Wgit::Utils.sanitize(paths, encode: false)
  raise 'The provided paths cannot be empty' if paths.empty?

  paths.map do |path|
@@ -491,7 +556,7 @@ module Wgit
  def filter_links(links, filter_method, paths)
  links.send(filter_method) do |link|
  # Turn http://example.com into / meaning index.
- link = link.to_endpoint.index? ? '/' : link.omit_base
+ link = link.to_endpoint.index? ? '/' : link.omit_base.omit_trailing_slash

  match = false
  paths.each do |pattern|
@@ -532,9 +597,9 @@ module Wgit
  )
  end

- alias crawl crawl_urls
- alias crawl_pages crawl_urls
- alias crawl_page crawl_url
- alias crawl_r crawl_site
+ alias_method :crawl, :crawl_urls
+ alias_method :crawl_pages, :crawl_urls
+ alias_method :crawl_page, :crawl_url
+ alias_method :crawl_r, :crawl_site
  end
  end
data/lib/wgit/database/adapters/in_memory.rb ADDED
@@ -0,0 +1,204 @@
+ require_relative "../../utils"
+ require_relative "../../url"
+ require_relative "../../document"
+ require_relative "../../model"
+ require_relative "../database_adapter"
+
+ module Wgit::Database
+ # Database implementer class for in-memory (RAM) storage. This DB is mainly used
+ # for testing and experimenting with. This DB is thread safe.
+ class InMemory < DatabaseAdapter
+ # Initializes a thread safe InMemory Database instance.
+ #
+ # @param connection_string [String] Not used but needed to adhere to the
+ # DatabaseAdapter interface.
+ def initialize(connection_string = nil)
+ # Inits @urls and @docs vars.
+ initialize_store
+
+ super
+ end
+
+ # Overrides String#inspect to display collection sizes.
+ #
+ # @return [String] A short textual representation of this object.
+ def inspect
+ "#<Wgit::Database::InMemory num_urls=#{@urls.size} \
+ num_docs=#{@docs.size} size=#{size}>"
+ end
+
+ # The Wgit::Url's collection stored as an in-memory Concurrent::Array.
+ def urls(&block)
+ map_urls(@urls, &block)
+ end
+
+ # The Wgit::Document's collection stored as an in-memory Concurrent::Array.
+ def docs(&block)
+ map_documents(@docs, &block)
+ end
+
+ # The raw url Hashes, not mapped into their corresponding Wgit objects.
+ def url_hashes
+ @urls
+ end
+
+ # The raw doc Hashes, not mapped into their corresponding Wgit objects.
+ def doc_hashes
+ @docs
+ end
+
+ # Returns the current size of the in-memory database.
+ # An empty database will return a size of 4 because there are 4 bytes in
+ # two empty arrays (urls and docs collections).
+ #
+ # @return [Integer] The current size of the in-memory DB.
+ def size
+ @urls.to_s.size + @docs.to_s.size
+ end
+
+ # Searches the database's Document#text for the given query. The returned
+ # Documents are sorted for relevance, starting with the most relevant. Each
+ # Document's #score value will be set accordingly.
+ #
+ # @param query [Regexp, #to_s] The regex or text value to search each
+ # document's @text for.
+ # @param case_sensitive [Boolean] Whether character case must match.
+ # @param whole_sentence [Boolean] Whether multiple words should be searched
+ # for separately.
+ # @param limit [Integer] The max number of results to return.
+ # @param skip [Integer] The number of results to skip.
+ # @yield [doc] Given each search result (Wgit::Document) returned from the
+ # DB.
+ # @return [Array<Wgit::Document>] The search results obtained from the DB.
+ def search(
+ query, case_sensitive: false, whole_sentence: true,
+ limit: 10, skip: 0, &block
+ )
+ regex = Wgit::Utils.build_search_regex(
+ query, case_sensitive:, whole_sentence:)
+
+ # Search the Wgit::Document's, not the raw Hashes.
+ results = docs.select do |doc|
+ score = 0
+ doc.search(regex, case_sensitive:, whole_sentence:) do |results_hash|
+ score = results_hash.values.sum
+ end
+ next false if score.zero?
+
+ doc.instance_variable_set :@score, score
+ true
+ end
+
+ return [] if results.empty?
+
+ results = results.sort_by { |doc| -doc.score }
+
+ results = results[skip..]
+ return [] unless results
+
+ results = results[0...limit] if limit.positive?
+ results.each(&block) if block_given?
+
+ results
+ end
+
+ # Deletes everything in the urls and documents collections.
+ #
+ # @return [Integer] The number of deleted records.
+ def empty
+ previous_size = @urls.size + @docs.size
+ initialize_store
+
+ previous_size
+ end
+
+ # Returns Url records that haven't yet been crawled.
+ #
+ # @param limit [Integer] The max number of Url's to return. 0 returns all.
+ # @param skip [Integer] Skip n amount of Url's.
+ # @yield [url] Given each Url object (Wgit::Url) returned from the DB.
+ # @return [Array<Wgit::Url>] The uncrawled Urls obtained from the DB.
+ def uncrawled_urls(limit: 0, skip: 0, &block)
+ uncrawled = @urls.reject { |url| url["crawled"] }
+ uncrawled = uncrawled[skip..]
+ return [] unless uncrawled
+
+ uncrawled = uncrawled[0...limit] if limit.positive?
+ map_urls(uncrawled, &block)
+ end
+
+ # Inserts or updates the object in the in-memory database.
+ #
+ # @param obj [Wgit::Url, Wgit::Document] The obj/record to insert/update.
+ # @return [Boolean] True if inserted, false if updated.
+ def upsert(obj)
+ collection, index, model = get_model_info(obj)
+
+ if index
+ collection[index] = model
+ false
+ else
+ collection << model
+ true
+ end
+ end
+
+ # Bulk upserts the objects in the in-memory database collection.
+ # You cannot mix collection objs types, all must be Urls or Documents.
+ #
+ # @param objs [Array<Wgit::Url>, Array<Wgit::Document>] The objs to be
+ # inserted/updated.
+ # @return [Integer] The total number of newly inserted objects.
+ def bulk_upsert(objs)
+ assert_common_arr_types(objs, [Wgit::Url, Wgit::Document])
+
+ objs.reduce(0) do |inserted, obj|
+ inserted += 1 if upsert(obj)
+ inserted
+ end
+ end
+
+ private
+
+ # Creates a new Concurrent::Array for each collection.
+ def initialize_store
+ @urls = Concurrent::Array.new
+ @docs = Concurrent::Array.new
+ end
+
+ # Get the database's model info (collection type, index, model) for
+ # obj.
+ #
+ # Use like:
+ # ```
+ # collection, index, model = get_model_info(obj)
+ # ```
+ #
+ # Raises an error if obj isn't a Wgit::Url or Wgit::Document.
+ #
+ # @param obj [Wgit::Url, Wgit::Document] The obj to get semantics for.
+ # @raise [StandardError] If obj isn't a Wgit::Url or Wgit::Document.
+ # @return [Array<Symbol, Hash>] The collection type, the obj's index (if in
+ # the collection, nil otherwise) and the Wgit::Model of obj.
+ def get_model_info(obj)
+ obj = obj.dup
+
+ case obj
+ when Wgit::Url
+ key = obj.to_s
+ collection = @urls
+ index = @urls.index { |url| url["url"] == key }
+ model = build_model(obj)
+ when Wgit::Document
+ key = obj.url.to_s
+ collection = @docs
+ index = @docs.index { |doc| doc["url"]&.[]("url") == key }
+ model = build_model(obj)
+ else
+ raise "obj must be a Wgit::Url or Wgit::Document, not: #{obj.class}"
+ end
+
+ [collection, index, model]
+ end
+ end
+ end
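To round off the new adapter, a hedged end-to-end sketch; the URL and query are placeholders and the crawl-then-search flow is illustrative, assuming `require 'wgit'` loads the InMemory adapter:

```ruby
require 'wgit'

db      = Wgit::Database::InMemory.new
crawler = Wgit::Crawler.new

# Store each successfully crawled page in the in-memory collections.
crawler.crawl_url(Wgit::Url.new('https://example.com')) do |doc|
  db.upsert(doc) unless doc.empty?
end

# Results come back sorted by #score, most relevant first.
db.search('example', limit: 5) do |doc|
  puts "#{doc.score} - #{doc.url}"
end

puts db.size # Rough byte size of the urls and docs collections.
```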