wgit 0.10.7 → 0.11.0

data/lib/wgit/indexer.rb CHANGED
@@ -26,33 +26,38 @@ module Wgit
  # Retrieves uncrawled url's from the database and recursively crawls each
  # site storing their internal pages into the database and adding their
  # external url's to be crawled later on. Logs info on the crawl using
- # Wgit.logger as it goes along.
+ # Wgit.logger as it goes along. This method will honour all site's
+ # robots.txt and 'noindex' requests.
  #
  # @param max_sites [Integer] The number of separate and whole
  # websites to be crawled before the method exits. Defaults to -1 which
- # means the crawl will occur until manually stopped (Ctrl+C etc).
+ # means the crawl will occur until manually stopped (Ctrl+C), the
+ # max_data has been reached, or it runs out of external urls to index.
  # @param max_data [Integer] The maximum amount of bytes that will be
  # scraped from the web (default is 1GB). Note, that this value is used to
  # determine when to stop crawling; it's not a guarantee of the max data
  # that will be obtained.
- def index_www(max_sites: -1, max_data: 1_048_576_000)
+ # @param max_urls_per_iteration [Integer] The maximum number of uncrawled
+ # urls to index for each iteration, before checking max_sites and
+ # max_data, possibly ending the crawl.
+ def index_www(max_sites: -1, max_data: 1_048_576_000, max_urls_per_iteration: 10)
  if max_sites.negative?
  Wgit.logger.info("Indexing until the database has been filled or it \
- runs out of urls to crawl (which might be never).")
+ runs out of urls to crawl (which might be never)")
  end
  site_count = 0

  while keep_crawling?(site_count, max_sites, max_data)
  Wgit.logger.info("Current database size: #{@db.size}")

- uncrawled_urls = @db.uncrawled_urls(limit: 100)
+ uncrawled_urls = @db.uncrawled_urls(limit: max_urls_per_iteration)

  if uncrawled_urls.empty?
- Wgit.logger.info('No urls to crawl, exiting.')
+ Wgit.logger.info('No urls to crawl, exiting')

  return
  end
- Wgit.logger.info("Starting crawl loop for: #{uncrawled_urls}")
+ Wgit.logger.info("Starting indexing loop for: #{uncrawled_urls.map(&:to_s)}")

  docs_count = 0
  urls_count = 0
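
For reference, a minimal usage sketch of the new max_urls_per_iteration keyword (the database/indexer wiring and the limit values below are illustrative assumptions, not part of this diff):

    require 'wgit'

    # Assumes a reachable MongoDB instance configured via the usual Wgit
    # connection string setup.
    db      = Wgit::Database.new
    indexer = Wgit::Indexer.new(db)

    # Pull 5 uncrawled urls from the DB per loop iteration (the new keyword),
    # stopping after 5 whole sites or roughly 100MB of crawled data.
    indexer.index_www(max_sites: 5, max_data: 104_857_600, max_urls_per_iteration: 5)
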
@@ -60,38 +65,48 @@ runs out of urls to crawl (which might be never).")
  uncrawled_urls.each do |url|
  unless keep_crawling?(site_count, max_sites, max_data)
  Wgit.logger.info("Reached max number of sites to crawl or \
- database capacity, exiting.")
+ database capacity, exiting")

  return
  end
  site_count += 1

+ parser = parse_robots_txt(url)
+ if parser&.no_index?
+ upsert_url_and_redirects(url)
+
+ next
+ end
+
  site_docs_count = 0
- ext_links = @crawler.crawl_site(url) do |doc|
- unless doc.empty?
- write_doc_to_db(doc)
- docs_count += 1
- site_docs_count += 1
- end
+ ext_links = @crawler.crawl_site(
+ url, allow_paths: parser&.allow_paths, disallow_paths: parser&.disallow_paths
+ ) do |doc|
+ next if doc.empty? || no_index?(@crawler.last_response, doc)
+
+ upsert_doc(doc)
+ docs_count += 1
+ site_docs_count += 1
  end

- raise 'Error updating url' unless @db.update(url) == 1
+ upsert_url_and_redirects(url)

- urls_count += write_urls_to_db(ext_links)
+ urls_count += upsert_external_urls(ext_links)
  end

  Wgit.logger.info("Crawled and indexed documents for #{docs_count} \
- url(s) overall for this iteration.")
+ url(s) during this iteration")
  Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
- the next iteration.")
-
- nil
+ future iterations")
  end
+
+ nil
  end

  # Crawls a single website's pages and stores them into the database.
  # There is no max download limit so be careful which sites you index.
- # Logs info on the crawl using Wgit.logger as it goes along.
+ # Logs info on the crawl using Wgit.logger as it goes along. This method
+ # will honour the site's robots.txt and 'noindex' requests.
  #
  # @param url [Wgit::Url] The base Url of the website to crawl.
  # @param insert_externals [Boolean] Whether or not to insert the website's
@@ -113,28 +128,30 @@ the next iteration.")
  url, insert_externals: false, follow: :default,
  allow_paths: nil, disallow_paths: nil
  )
- crawl_opts = {
- follow: follow,
- allow_paths: allow_paths,
- disallow_paths: disallow_paths
- }
+ parser = parse_robots_txt(url)
+ if parser&.no_index?
+ upsert_url_and_redirects(url)
+
+ return 0
+ end
+
+ allow_paths, disallow_paths = merge_paths(parser, allow_paths, disallow_paths)
+ crawl_opts = { follow:, allow_paths:, disallow_paths: }
  total_pages_indexed = 0

  ext_urls = @crawler.crawl_site(url, **crawl_opts) do |doc|
+ next if no_index?(@crawler.last_response, doc)
+
  result = block_given? ? yield(doc) : true

  if result && !doc.empty?
- write_doc_to_db(doc)
+ upsert_doc(doc)
  total_pages_indexed += 1
  end
  end

- @db.upsert(url)
-
- if insert_externals && ext_urls
- num_inserted_urls = write_urls_to_db(ext_urls)
- Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
- end
+ upsert_url_and_redirects(url)
+ upsert_external_urls(ext_urls) if insert_externals && ext_urls

  Wgit.logger.info("Crawled and indexed #{total_pages_indexed} documents \
  for the site: #{url}")
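
A usage sketch of index_site after this change, reusing the indexer from the earlier sketch — the url, the extra disallow path and the block logic are illustrative; the site's robots.txt allow/disallow paths are now merged into whatever paths the caller supplies:

    url = Wgit::Url.new('https://example.com')

    # Only pages whose title mentions 'Ruby' get saved; '/search' is blocked
    # on top of any robots.txt disallow rules, which are honoured
    # automatically.
    indexer.index_site(url, disallow_paths: '/search') do |doc|
      doc.title&.include?('Ruby')
    end
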
@@ -145,6 +162,8 @@ for the site: #{url}")
  # Crawls one or more webpages and stores them into the database.
  # There is no max download limit so be careful of large pages.
  # Logs info on the crawl using Wgit.logger as it goes along.
+ # This method will honour the site's robots.txt and 'noindex' requests
+ # in relation to the given urls.
  #
  # @param urls [*Wgit::Url] The webpage Url's to crawl.
  # @param insert_externals [Boolean] Whether or not to insert the webpages
@@ -157,7 +176,7 @@ for the site: #{url}")
  def index_urls(*urls, insert_externals: false, &block)
  raise 'You must provide at least one Url' if urls.empty?

- opts = { insert_externals: insert_externals }
+ opts = { insert_externals: }
  Wgit::Utils.each(urls) { |url| index_url(url, **opts, &block) }

  nil
@@ -166,6 +185,8 @@ for the site: #{url}")
  # Crawls a single webpage and stores it into the database.
  # There is no max download limit so be careful of large pages.
  # Logs info on the crawl using Wgit.logger as it goes along.
+ # This method will honour the site's robots.txt and 'noindex' requests
+ # in relation to the given url.
  #
  # @param url [Wgit::Url] The webpage Url to crawl.
  # @param insert_externals [Boolean] Whether or not to insert the webpages
@@ -175,18 +196,24 @@ for the site: #{url}")
  # manipulation. Return nil or false from the block to prevent the
  # document from being saved into the database.
  def index_url(url, insert_externals: false)
+ parser = parse_robots_txt(url)
+ if parser && (parser.no_index? || contains_path?(parser.disallow_paths, url))
+ upsert_url_and_redirects(url)
+
+ return
+ end
+
  document = @crawler.crawl_url(url) do |doc|
+ break if no_index?(@crawler.last_response, doc)
+
  result = block_given? ? yield(doc) : true
- write_doc_to_db(doc) if result && !doc.empty?
+ upsert_doc(doc) if result && !doc.empty?
  end

- @db.upsert(url)
+ upsert_url_and_redirects(url)

  ext_urls = document&.external_links
- if insert_externals && ext_urls
- num_inserted_urls = write_urls_to_db(ext_urls)
- Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
- end
+ upsert_external_urls(ext_urls) if insert_externals && ext_urls

  nil
  end
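
A sketch of index_url with the skip behaviour described above (the url and the block's filter are made up): the url and any redirects are still upserted as crawled, but no document is stored if the block returns nil/false, if robots.txt disallows the url, or if a 'noindex' directive is found.

    indexer.index_url(Wgit::Url.new('https://example.com/about')) do |doc|
      doc.links.any? # illustrative filter: only save pages containing links
    end
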
@@ -210,10 +237,11 @@ for the site: #{url}")
  end

  # Write the doc to the DB. Note that the unique url index on the documents
- # collection deliberately prevents duplicate inserts.
+ # collection deliberately prevents duplicate inserts. If the document
+ # already exists, then it will be updated in the DB.
  #
  # @param doc [Wgit::Document] The document to write to the DB.
- def write_doc_to_db(doc)
+ def upsert_doc(doc)
  if @db.upsert(doc)
  Wgit.logger.info("Saved document for url: #{doc.url}")
  else
@@ -221,35 +249,101 @@ for the site: #{url}")
  end
  end

- # Write the urls to the DB. Note that the unique url index on the urls
- # collection deliberately prevents duplicate inserts.
+ # Upsert the url and its redirects, setting all to crawled = true.
  #
- # @param urls [Array<Wgit::Url>] The urls to write to the DB.
- # @return [Integer] The number of inserted urls.
- def write_urls_to_db(urls)
- count = 0
+ # @param url [Wgit::Url] The url to write to the DB.
+ # @return [Integer] The number of upserted urls (url + redirect urls).
+ def upsert_url_and_redirects(url)
+ url.crawled = true unless url.crawled?

- return count unless urls.respond_to?(:each)
+ # Upsert the url and any url redirects, setting them as crawled also.
+ @db.bulk_upsert(url.redirects_journey)
+ end

- urls.each do |url|
- if url.invalid?
- Wgit.logger.info("Ignoring invalid external url: #{url}")
- next
- end
+ # Write the external urls to the DB. For any external url, its origin will
+ # be inserted e.g. if the external url is http://example.com/contact then
+ # http://example.com will be inserted into the database. Note that the
+ # unique url index on the urls collection deliberately prevents duplicate
+ # inserts.
+ #
+ # @param urls [Array<Wgit::Url>] The external urls to write to the DB.
+ # @return [Integer] The number of upserted urls.
+ def upsert_external_urls(urls)
+ urls = urls
+ .reject(&:invalid?)
+ .map(&:to_origin)
+ .uniq
+ return 0 if urls.empty?
+
+ count = @db.bulk_upsert(urls)
+ Wgit.logger.info("Saved #{count} external urls")

- @db.insert(url)
- count += 1
+ count
+ end
+
+ private
+
+ # Crawls and parses robots.txt file (if found). Returns the parser or nil.
+ def parse_robots_txt(url)
+ robots_url = url.to_origin.join('/robots.txt')
+
+ Wgit.logger.info("Crawling for robots.txt: #{robots_url}")
+
+ doc = @crawler.crawl_url(robots_url)
+ return nil if !@crawler.last_response.ok? || doc.empty?
+
+ parser = Wgit::RobotsParser.new(doc.content)

- Wgit.logger.info("Inserted external url: #{url}")
- rescue Mongo::Error::OperationFailure
- Wgit.logger.info("External url already exists: #{url}")
+ Wgit.logger.info("robots.txt allow paths: #{parser.allow_paths}")
+ Wgit.logger.info("robots.txt disallow paths: #{parser.disallow_paths}")
+ if parser.no_index?
+ Wgit.logger.info('robots.txt has banned wgit indexing, skipping')
  end

- count
+ parser
+ end
+
+ # Takes the user defined allow/disallow_paths and merges robots paths
+ # into them. The allow/disallow_paths vars each can be of type nil, String,
+ # Enumerable<String>.
+ def merge_paths(parser, allow_paths, disallow_paths)
+ return allow_paths, disallow_paths unless parser&.rules?
+
+ allow = allow_paths || []
+ allow = [allow] unless allow.is_a?(Enumerable)
+
+ disallow = disallow_paths || []
+ disallow = [disallow] unless disallow.is_a?(Enumerable)
+
+ allow.concat(parser.allow_paths)
+ disallow.concat(parser.disallow_paths)
+
+ [allow, disallow]
+ end
+
+ # Returns true if url is included in the given paths.
+ def contains_path?(paths, url)
+ paths.any? { |path| Wgit::Url.new(path).to_path == url.to_path }
+ end
+
+ # Returns if the last_response or doc #no_index? is true or not.
+ def no_index?(last_response, doc)
+ url = last_response.url.to_s
+ if last_response.no_index?
+ Wgit.logger.info("Skipping page due to no-index response header: #{url}")
+ return true
+ end
+
+ if doc&.no_index?
+ Wgit.logger.info("Skipping page due to no-index HTML meta tag: #{url}")
+ return true
+ end
+
+ false
  end

- alias database db
- alias index index_urls
- alias index_r index_site
+ alias_method :database, :db
+ alias_method :index, :index_urls
+ alias_method :index_r, :index_site
  end
  end
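
The origin-collapsing step in upsert_external_urls can be illustrated with plain Wgit::Url calls (a sketch of the reduction only, not of the bulk_upsert DB write; the urls are made up):

    urls = [
      Wgit::Url.new('http://example.com/contact'),
      Wgit::Url.new('http://example.com/about')
    ]

    # Both pages collapse to the single origin http://example.com, which is
    # what ends up being bulk upserted into the urls collection.
    urls.reject(&:invalid?).map(&:to_origin).uniq
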
data/lib/wgit/logger.rb CHANGED
@@ -28,7 +28,7 @@ module Wgit
  #
  # @return [Logger] The default Logger instance.
  def self.default_logger
- logger = Logger.new(STDOUT, progname: 'wgit', level: :info)
+ logger = Logger.new($stdout, progname: 'wgit', level: :info)
  logger.formatter = proc do |_severity, _datetime, progname, msg|
  "[#{progname}] #{msg}\n"
  end
data/lib/wgit/response.rb CHANGED
@@ -33,6 +33,13 @@ module Wgit
  @total_time = 0.0
  end

+ # Overrides String#inspect to shorten the printed output of a Response.
+ #
+ # @return [String] A short textual representation of this Response.
+ def inspect
+ "#<Wgit::Response url=\"#{@url}\" status=#{status}>"
+ end
+
  # Adds time to @total_time (incrementally).
  #
  # @param time [Float] The time to add to @total_time.
@@ -134,11 +141,19 @@ module Wgit
  @status.positive?
  end

- alias code status
- alias content body
- alias crawl_duration total_time
- alias to_s body
- alias redirects redirections
- alias length size
+ # Returns whether or not Wgit is banned from indexing this site.
+ #
+ # @return [Boolean] True if Wgit should not index this site, false
+ # otherwise.
+ def no_index?
+ headers.fetch(:x_robots_tag, '').downcase.strip == 'noindex'
+ end
+
+ alias_method :code, :status
+ alias_method :content, :body
+ alias_method :crawl_duration, :total_time
+ alias_method :to_s, :body
+ alias_method :redirects, :redirections
+ alias_method :length, :size
  end
  end
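
The new Response#no_index? does an exact match on the normalised X-Robots-Tag value, so a combined value such as 'noindex, nofollow' is not treated as noindex by this check. A direct illustration of the predicate (plain Ruby mirroring the method body, not a separate API):

    headers = { x_robots_tag: ' NOINDEX ' }
    headers.fetch(:x_robots_tag, '').downcase.strip == 'noindex' # => true

    headers = { x_robots_tag: 'noindex, nofollow' }
    headers.fetch(:x_robots_tag, '').downcase.strip == 'noindex' # => false
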
@@ -0,0 +1,193 @@
+ # frozen_string_literal: true
+
+ module Wgit
+ # The RobotsParser class handles parsing and processing of a web servers
+ # robots.txt file.
+ class RobotsParser
+ include Wgit::Assertable
+
+ # Key representing the start of a comment.
+ KEY_COMMENT = '#'
+ # Key value separator used in robots.txt files.
+ KEY_SEPARATOR = ':'
+ # Key representing a user agent.
+ KEY_USER_AGENT = 'User-agent'
+ # Key representing an allow URL rule.
+ KEY_ALLOW = 'Allow'
+ # Key representing a disallow URL rule.
+ KEY_DISALLOW = 'Disallow'
+
+ # Value representing the Wgit user agent.
+ USER_AGENT_WGIT = :wgit
+ # Value representing any user agent including Wgit.
+ USER_AGENT_ANY = :*
+
+ # Value representing any and all paths.
+ PATHS_ALL = %w[/ *].freeze
+
+ # Hash containing the user-agent allow/disallow URL rules. Looks like:
+ # allow_paths: ["/"]
+ # disallow_paths: ["/accounts", ...]
+ attr_reader :rules
+
+ # Initializes and returns a Wgit::RobotsParser instance having parsed the
+ # robot.txt contents.
+ #
+ # @param contents [String, #to_s] The contents of the robots.txt file to be
+ # parsed.
+ def initialize(contents)
+ @rules = {
+ allow_paths: Set.new,
+ disallow_paths: Set.new
+ }
+
+ assert_respond_to(contents, :to_s)
+ parse(contents.to_s)
+ end
+
+ # Overrides String#inspect to shorten the printed output of a Parser.
+ #
+ # @return [String] A short textual representation of this Parser.
+ def inspect
+ "#<Wgit::RobotsParser has_rules=#{rules?} no_index=#{no_index?}>"
+ end
+
+ # Returns the allow paths/rules for this parser's robots.txt contents.
+ #
+ # @return [Array<String>] The allow paths/rules to follow.
+ def allow_paths
+ @rules[:allow_paths].to_a
+ end
+
+ # Returns the disallow paths/rules for this parser's robots.txt contents.
+ #
+ # @return [Array<String>] The disallow paths/rules to follow.
+ def disallow_paths
+ @rules[:disallow_paths].to_a
+ end
+
+ # Returns whether or not there are rules applying to Wgit.
+ #
+ # @return [Boolean] True if there are rules for Wgit to follow, false
+ # otherwise.
+ def rules?
+ allow_rules? || disallow_rules?
+ end
+
+ # Returns whether or not there are allow rules applying to Wgit.
+ #
+ # @return [Boolean] True if there are allow rules for Wgit to follow,
+ # false otherwise.
+ def allow_rules?
+ @rules[:allow_paths].any?
+ end
+
+ # Returns whether or not there are disallow rules applying to Wgit.
+ #
+ # @return [Boolean] True if there are disallow rules for Wgit to follow,
+ # false otherwise.
+ def disallow_rules?
+ @rules[:disallow_paths].any?
+ end
+
+ # Returns whether or not Wgit is banned from indexing this site.
+ #
+ # @return [Boolean] True if Wgit should not index this site, false
+ # otherwise.
+ def no_index?
+ @rules[:disallow_paths].any? { |path| PATHS_ALL.include?(path) }
+ end
+
+ private
+
+ # Parses the file contents and sets @rules.
+ def parse(contents)
+ user_agents = []
+ new_block = false
+
+ contents.split("\n").each do |line|
+ line.strip!
+ next if line.empty? || line.start_with?(KEY_COMMENT)
+
+ # A user agent block is denoted by N User-agent's followed by N
+ # Allow/Disallow's. After which a new block is formed from scratch.
+ if start_with_any_case?(line, KEY_USER_AGENT)
+ if new_block
+ user_agents = []
+ new_block = false
+ end
+ user_agents << remove_key(line, KEY_USER_AGENT).downcase.to_sym
+ else
+ new_block = true
+ end
+
+ if start_with_any_case?(line, KEY_ALLOW)
+ append_allow_rule(user_agents, line)
+ elsif start_with_any_case?(line, KEY_DISALLOW)
+ append_disallow_rule(user_agents, line)
+ elsif !start_with_any_case?(line, KEY_USER_AGENT)
+ Wgit.logger.debug("Skipping unsupported robots.txt line: #{line}")
+ end
+ end
+ end
+
+ # Implements start_with? but case insensitive.
+ def start_with_any_case?(str, prefix)
+ str.downcase.start_with?(prefix.downcase)
+ end
+
+ # Returns line with key removed (if present). Otherwise line is returned
+ # as given.
+ def remove_key(line, key)
+ return line unless start_with_any_case?(line, key)
+ return line unless line.count(KEY_SEPARATOR) == 1
+
+ segs = line.split(KEY_SEPARATOR)
+ return '' if segs.size == 1
+
+ segs.last.strip
+ end
+
+ # Don't append * or /, as this means all paths, which is the same as no
+ # allow_paths when passed to Wgit::Crawler.
+ def append_allow_rule(user_agents, line)
+ return unless wgit_user_agent?(user_agents)
+
+ path = remove_key(line, KEY_ALLOW)
+ path = parse_special_syntax(path)
+ return if PATHS_ALL.include?(path)
+
+ @rules[:allow_paths] << path
+ end
+
+ def append_disallow_rule(user_agents, line)
+ return unless wgit_user_agent?(user_agents)
+
+ path = remove_key(line, KEY_DISALLOW)
+ path = parse_special_syntax(path)
+ @rules[:disallow_paths] << path
+ end
+
+ def wgit_user_agent?(user_agents)
+ user_agents.any? do |agent|
+ [USER_AGENT_ANY, USER_AGENT_WGIT].include?(agent.downcase)
+ end
+ end
+
+ def parse_special_syntax(path)
+ # Remove $ e.g. "/blah$" becomes "/blah"
+ path = path.gsub('$', '')
+
+ # Remove any inline comments e.g. "/blah # comment" becomes "/blah"
+ path = path.split(" #{KEY_COMMENT}").first if path.include?(" #{KEY_COMMENT}")
+
+ # Replace an empty path with * e.g. "Allow: " becomes "Allow: *"
+ path = '*' if path.empty?
+
+ path
+ end
+
+ alias_method :paths, :rules
+ alias_method :banned?, :no_index?
+ end
+ end
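
A short usage sketch of the new Wgit::RobotsParser (the robots.txt contents below are made up):

    require 'wgit'

    parser = Wgit::RobotsParser.new(<<~ROBOTS)
      User-agent: *
      Allow: /public
      Disallow: /accounts
      Disallow: /admin$
    ROBOTS

    parser.rules?          # => true
    parser.allow_paths     # => ["/public"]
    parser.disallow_paths  # => ["/accounts", "/admin"] ($ suffix stripped)
    parser.no_index?       # => false (no "/" or "*" disallow rule)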