gort 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
 - checksums.yaml.gz.sig +2 -0
 - data/LICENSE.txt +21 -0
 - data/lib/gort/allow_rule.rb +13 -0
 - data/lib/gort/disallow_rule.rb +13 -0
 - data/lib/gort/group.rb +85 -0
 - data/lib/gort/invalid_line.rb +45 -0
 - data/lib/gort/parser.rb +144 -0
 - data/lib/gort/path_rule.rb +98 -0
 - data/lib/gort/robots_txt.rb +96 -0
 - data/lib/gort/rule.rb +48 -0
 - data/lib/gort/rule_set.rb +25 -0
 - data/lib/gort/user_agent_rule.rb +52 -0
 - data/lib/gort/version.rb +6 -0
 - data/lib/gort.rb +19 -0
 - data.tar.gz.sig +0 -0
 - metadata +105 -0
 - metadata.gz.sig +2 -0
 
    
        checksums.yaml
    ADDED
    
    | 
         @@ -0,0 +1,7 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            ---
         
     | 
| 
      
 2 
     | 
    
         
            +
            SHA256:
         
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: e05f323b6f9699a6f39f7042d494e18856bd954b974cf6943d0a0fa6bdb1f263
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: 396813a5d7c484e5408603c17bb76d79ec4debce48edec8a6b8ecf8eeb0231da
         
     | 
| 
      
 5 
     | 
    
         
            +
            SHA512:
         
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 9978aba28e9e77ea750cf61de9a22137706fc578fbe0bfbc348d9c97b4d516ea3b6a2e11250f115a2eeae8fea9752c48dc5803556fd3e9a4c0dcfc1e4e9a02e0
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: cae8fc644d21a8d31da01b2390f801ceb06b0fc857d1c3d900a62496ff111f41636e09316c86520ab829add7248dcfb1d2a818eea0dd04b030429f8734388be8
         
     | 
    
        checksums.yaml.gz.sig
    ADDED
    
    
    
        data/LICENSE.txt
    ADDED
    
    | 
         @@ -0,0 +1,21 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            The MIT License (MIT)
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            Copyright (c) 2024 Alexander Mankuta
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            Permission is hereby granted, free of charge, to any person obtaining a copy
         
     | 
| 
      
 6 
     | 
    
         
            +
            of this software and associated documentation files (the "Software"), to deal
         
     | 
| 
      
 7 
     | 
    
         
            +
            in the Software without restriction, including without limitation the rights
         
     | 
| 
      
 8 
     | 
    
         
            +
            to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
         
     | 
| 
      
 9 
     | 
    
         
            +
            copies of the Software, and to permit persons to whom the Software is
         
     | 
| 
      
 10 
     | 
    
         
            +
            furnished to do so, subject to the following conditions:
         
     | 
| 
      
 11 
     | 
    
         
            +
             
     | 
| 
      
 12 
     | 
    
         
            +
            The above copyright notice and this permission notice shall be included in
         
     | 
| 
      
 13 
     | 
    
         
            +
            all copies or substantial portions of the Software.
         
     | 
| 
      
 14 
     | 
    
         
            +
             
     | 
| 
      
 15 
     | 
    
         
            +
            THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
         
     | 
| 
      
 16 
     | 
    
         
            +
            IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
         
     | 
| 
      
 17 
     | 
    
         
            +
            FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
         
     | 
| 
      
 18 
     | 
    
         
            +
            AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
         
     | 
| 
      
 19 
     | 
    
         
            +
            LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
         
     | 
| 
      
 20 
     | 
    
         
            +
            OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
         
     | 
| 
      
 21 
     | 
    
         
            +
            THE SOFTWARE.
         
     | 
| 
         @@ -0,0 +1,13 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # frozen_string_literal: true
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            require_relative "path_rule"
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            module Gort
         
     | 
| 
      
 6 
     | 
    
         
            +
              # Disallow rule.
         
     | 
| 
      
 7 
     | 
    
         
            +
              class DisallowRule < PathRule
                # Builds a disallow rule by handing +:disallow+ and the given pattern
                # to the PathRule constructor.
                #
                # @param value [String] the path pattern to disallow.
                def initialize(value)
                  super(:disallow, value)
                end
              end
         
     | 
| 
      
 13 
     | 
    
         
            +
            end
         
     | 
    
        data/lib/gort/group.rb
    ADDED
    
    | 
         @@ -0,0 +1,85 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # frozen_string_literal: true
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            require_relative "rule_set"
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            module Gort
         
     | 
| 
      
 6 
     | 
    
         
            +
              # An access group
         
     | 
| 
      
 7 
     | 
    
         
            +
              class Group < RuleSet
                # Is this group valid?
                #
                # A valid group has at least one valid user-agent rule.
                #
                # @return [Boolean]
                # @see UserAgentRule#valid?
                def valid?
                  @valid ||=
                    rules.any? { |rule| rule.is_a?(UserAgentRule) && rule.valid? }
                end

                # Does this group apply to this specific user agent?
                #
                # This performs user agent matching according to the RFC.
                # A group applies when it contains a valid `*` user-agent rule or when one of its
                # valid, specific user-agent values occurs (case-insensitively) in +user_agent+.
                #
                # @param user_agent [String]
                # @return [Boolean]
                def apply?(user_agent)
                  apply_to_all? || user_agent.match?(user_agent_regexp)
                end

                # @!group Formatting Methods

                # A human readable representation of the group.
                #
                # @return [String]
                # @tool
                #   :nocov:
                def inspect
                  "#<#{self.class.name}:#{object_id} #{rules.inspect}>"
                end
                # :nocov:

                # Produces a pretty human readable representation of the group.
                #
                # @param pp [PrettyPrint] pretty printer
                # @return [void]
                # @tool
                #   :nocov:
                def pretty_print(pp)
                  pp.text("#{self.class.name}/#{object_id}")
                  pp.group(1, "[", "]") do
                    pp.breakable("")
                    pp.seplist(rules) do |rule|
                      pp.pp(rule)
                    end
                    pp.breakable("")
                  end
                end
                # :nocov:

                # @!endgroup Formatting Methods

                private

                # Does this group apply to all user agents?
                #
                # Effectively, does this group contain a valid `*` user-agent rule?
                #
                # @return [Boolean]
                def apply_to_all?
                  @apply_to_all ||= rules.any? { |rule| rule.is_a?(UserAgentRule) && rule.valid? && rule.value == "*" }
                end

                # A compiled Regexp that matches all specific (non-`*`) user agents in this group.
                #
                # When the group has no valid specific user-agent rule the result never matches.
                #
                # @return [Regexp]
                def user_agent_regexp
                  @user_agent_regexp ||=
                    begin
                      specific_user_agent_rules = rules.select { |rule|
                        rule.is_a?(UserAgentRule) && rule.valid? && (rule.value != "*")
                      }
                      if specific_user_agent_rules.empty?
                        # Joining zero alternatives would compile to an empty pattern, and an empty
                        # pattern matches every string. Use a pattern that can never match instead.
                        /(?!)/
                      else
                        Regexp.new(specific_user_agent_rules.map { Regexp.escape(_1.value) }.join("|"), Regexp::IGNORECASE)
                      end
                    end
                end
              end
         
     | 
| 
      
 85 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,45 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # frozen_string_literal: true
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            module Gort
         
     | 
| 
      
 4 
     | 
    
         
            +
              # Represents an invalid line in a robots.txt file.
         
     | 
| 
      
 5 
     | 
    
         
            +
              #
         
     | 
| 
      
 6 
     | 
    
         
            +
              # @note Technically, the RFC doesn't have invalid lines in its grammar
         
     | 
| 
      
 7 
     | 
    
         
            +
              #   but there are just too many broken robots.txt files on the internet.
         
     | 
| 
      
 8 
     | 
    
         
            +
              #
         
     | 
| 
      
 9 
     | 
    
         
            +
              # An invalid line is a line that can not be parsed as a rule and is not a comment.
         
     | 
| 
      
 10 
     | 
    
         
            +
              class InvalidLine
                # Content of the line.
                # @return [String]
                attr_reader :value

                # Wraps the raw text of a line that could not be parsed.
                #
                # @param text [String] content of the line
                def initialize(text)
                  @value = text
                end

                # @!group Formatting Methods

                # A human readable representation of the invalid line:
                # class name, object id and the quoted line content.
                #
                # @return [String]
                # @tool
                #   :nocov:
                def inspect
                  format(%(#<%s:%s "%s">), self.class.name, object_id, value)
                end
                # :nocov:

                # Produces a pretty human readable representation of the invalid line.
                #
                # Emits a single text fragment, so the output is never broken across lines.
                #
                # @param pp [PrettyPrint] pretty printer
                # @return [void]
                # @tool
                #   :nocov:
                def pretty_print(pp)
                  label = [self.class.name, object_id].join("/")
                  pp.text("#{label}< #{value} >")
                end
                # :nocov:

                # @!endgroup Formatting Methods
              end
         
     | 
| 
      
 45 
     | 
    
         
            +
            end
         
     | 
    
        data/lib/gort/parser.rb
    ADDED
    
    | 
         @@ -0,0 +1,144 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # frozen_string_literal: true
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            require_relative "invalid_line"
         
     | 
| 
      
 4 
     | 
    
         
            +
            require_relative "rule"
         
     | 
| 
      
 5 
     | 
    
         
            +
            require_relative "user_agent_rule"
         
     | 
| 
      
 6 
     | 
    
         
            +
            require_relative "allow_rule"
         
     | 
| 
      
 7 
     | 
    
         
            +
            require_relative "disallow_rule"
         
     | 
| 
      
 8 
     | 
    
         
            +
            require_relative "rule_set"
         
     | 
| 
      
 9 
     | 
    
         
            +
            require_relative "group"
         
     | 
| 
      
 10 
     | 
    
         
            +
            require_relative "robots_txt"
         
     | 
| 
      
 11 
     | 
    
         
            +
             
     | 
| 
      
 12 
     | 
    
         
            +
            module Gort
         
     | 
| 
      
 13 
     | 
    
         
            +
              # robots.txt parser. It implements the parsing logic according to RFC 9309, including errata.
         
     | 
| 
      
 14 
     | 
    
         
            +
              class Parser
         
     | 
| 
      
 15 
     | 
    
         
            +
                # You may get this error if the input does not look like a text file.
         
     | 
| 
      
 16 
     | 
    
         
            +
                class BinaryInputError < Error; end
         
     | 
| 
      
 17 
     | 
    
         
            +
             
     | 
| 
      
 18 
     | 
    
         
            +
                # You may get this error if the input looks like a text file but its encoding is invalid.
         
     | 
| 
      
 19 
     | 
    
         
            +
                class InvalidEncodingError < Error; end
         
     | 
| 
      
 20 
     | 
    
         
            +
             
     | 
| 
      
 21 
     | 
    
         
            +
                UTF_8_BOM = "\ufeff"
         
     | 
| 
      
 22 
     | 
    
         
            +
                private_constant :UTF_8_BOM
         
     | 
| 
      
 23 
     | 
    
         
            +
             
     | 
| 
      
 24 
     | 
    
         
            +
                # @param input [String] The robots.txt content to parse. It must be encoded in UTF-8 or compatible encoding.
         
     | 
| 
      
 25 
     | 
    
         
            +
                def initialize(input)
                  # Normalise the encoding first; the BOM check below compares against a UTF-8 constant.
                  utf8_text = detect_and_fix_encoding(input)
                  @input = strip_bom(utf8_text)
                end
         
     | 
| 
      
 28 
     | 
    
         
            +
             
     | 
| 
      
 29 
     | 
    
         
            +
                # RFC does not explicitly define the generic rule name syntax. It only defines that it has to be case-insensitive.
         
     | 
| 
      
 30 
     | 
    
         
            +
                # It also provides a few pre-defined rule names such as User-Agent, Allow, and Disallow.
         
     | 
| 
      
 31 
     | 
    
         
            +
                # Things that might be different from the RFC intention:
         
     | 
| 
      
 32 
     | 
    
         
            +
                # - The rule name must start with a letter. RFC might allow other characters.
         
     | 
| 
      
 33 
     | 
    
         
            +
                # - The rule name might contain underscores. RFC doesn't mention underscores.
         
     | 
| 
      
 34 
     | 
    
         
            +
                # - The rule name might contain digits. RFC doesn't mention digits, either.
         
     | 
| 
      
 35 
     | 
    
         
            +
                #
         
     | 
| 
      
 36 
     | 
    
         
            +
                # This is only used for plausible rule detection.
         
     | 
| 
      
 37 
     | 
    
         
            +
                RULE_KEY = /\A[a-z][a-z0-9_-]*\s*:/i
         
     | 
| 
      
 38 
     | 
    
         
            +
                private_constant :RULE_KEY
         
     | 
| 
      
 39 
     | 
    
         
            +
             
     | 
| 
      
 40 
     | 
    
         
            +
                # Actually parse the file.
         
     | 
| 
      
 41 
     | 
    
         
            +
                #
         
     | 
| 
      
 42 
     | 
    
         
            +
                # @return [Gort::RobotsTxt]
         
     | 
| 
      
 43 
     | 
    
         
            +
                def parse
                  # Drop everything after the first "#" on each line, trim whitespace,
                  # and parse only the lines that still have content.
                  rules = input.each_line.filter_map do |raw_line|
                    content = raw_line.split("#", 2).first.strip
                    parse_line(content) unless content.empty?
                  end

                  in_groups, standalone = partition_rules(rules)

                  # Groups come first in the result, followed by the standalone rules.
                  RobotsTxt.new(group_rules(in_groups) + standalone)
                end
         
     | 
| 
      
 56 
     | 
    
         
            +
             
     | 
| 
      
 57 
     | 
    
         
            +
                private
         
     | 
| 
      
 58 
     | 
    
         
            +
             
     | 
| 
      
 59 
     | 
    
         
            +
                # @return [String]
         
     | 
| 
      
 60 
     | 
    
         
            +
                attr_reader :input
         
     | 
| 
      
 61 
     | 
    
         
            +
             
     | 
| 
      
 62 
     | 
    
         
            +
                # @param string [String]
         
     | 
| 
      
 63 
     | 
    
         
            +
                # @return [String]
         
     | 
| 
      
 64 
     | 
    
         
            +
                def detect_and_fix_encoding(string)
                  # Converts the input to UTF-8, falling back to charset detection (rchardet)
                  # when the string cannot be converted as tagged.
                  #
                  # Raises BinaryInputError when detection finds no encoding or its confidence is
                  # below 0.25, and InvalidEncodingError when the detected encoding cannot be applied.
                  #
                  # String#encode performs no conversion when the string is already tagged as UTF-8,
                  # so malformed byte sequences must be checked for explicitly before trusting it.
                  if string.valid_encoding?
                    begin
                      return string.encode(Encoding::UTF_8)
                    rescue EncodingError
                      # Not convertible as tagged (e.g. binary data with high bytes): try detection below.
                    end
                  end

                  # Loaded lazily: only needed when the straightforward conversion is not possible.
                  require "rchardet"
                  result = CharDet.detect(string)
                  raise BinaryInputError, "Input does not look like text" if result["encoding"].nil? || result["confidence"] < 0.25

                  invalid_encoding_message = "Input string looks like text but its encoding is invalid."
                  begin
                    converted =
                      string
                        .dup
                        .force_encoding(result["encoding"])
                        .encode(Encoding::UTF_8)
                  rescue EncodingError, ArgumentError
                    # ArgumentError is raised by force_encoding when the detected name is unknown to Ruby.
                    raise InvalidEncodingError, invalid_encoding_message
                  end

                  # A detected encoding of UTF-8 makes the conversion above a no-op; verify the result.
                  raise InvalidEncodingError, invalid_encoding_message unless converted.valid_encoding?

                  converted
                end
         
     | 
| 
      
 80 
     | 
    
         
            +
             
     | 
| 
      
 81 
     | 
    
         
            +
                # @param string [String]
         
     | 
| 
      
 82 
     | 
    
         
            +
                # @return [String]
         
     | 
| 
      
 83 
     | 
    
         
            +
                def strip_bom(string)
                  # Only a BOM in the very first character position is removed.
                  return string unless string[0] == UTF_8_BOM

                  string[1..]
                end
         
     | 
| 
      
 90 
     | 
    
         
            +
             
     | 
| 
      
 91 
     | 
    
         
            +
                # @param line [String]
         
     | 
| 
      
 92 
     | 
    
         
            +
                # @return [UserAgentRule, AllowRule, DisallowRule, Rule, InvalidLine]
         
     | 
| 
      
 93 
     | 
    
         
            +
                def parse_line(line)
                  # Anything that does not start with a plausible "name:" prefix is kept as an invalid line.
                  return InvalidLine.new(line) unless RULE_KEY.match?(line)

                  # The line is known to contain a colon here; split on the first one only,
                  # so the value may itself contain colons.
                  # @type var key: String
                  # @type var value: String
                  key, value = line.partition(":").values_at(0, 2).map(&:strip)

                  # Rule names are matched case-insensitively; unknown names become generic rules
                  # that keep the name as written.
                  known_rule = {
                    "user-agent" => UserAgentRule,
                    "allow" => AllowRule,
                    "disallow" => DisallowRule,
                  }[key.downcase]

                  known_rule ? known_rule.new(value) : Rule.new(key, value)
                end
         
     | 
| 
      
 112 
     | 
    
         
            +
             
     | 
| 
      
 113 
     | 
    
         
            +
                # Splits parsed entries into those that belong to groups and those that stand alone.
                #
                # A user-agent rule always goes to the grouped list. An allow or disallow rule
                # goes to the grouped list only if something has already been collected there
                # (i.e. a user-agent rule was seen earlier); otherwise it is standalone.
                # Everything else is standalone. Relative order is preserved in both lists.
                #
                # @param rules [Array<UserAgentRule, AllowRule, DisallowRule, Rule, InvalidLine>]
                # @return [(Array<UserAgentRule, AllowRule, DisallowRule>, Array<AllowRule, DisallowRule, Rule, InvalidLine>)]
                #   grouped entries first, standalone entries second
                def partition_rules(rules)
                  rules.each_with_object([[], []]) do |entry, (in_groups, on_their_own)|
                    opens_group = entry.is_a?(UserAgentRule)
                    path_rule = entry.is_a?(AllowRule) || entry.is_a?(DisallowRule)
                    grouped = opens_group || (path_rule && !in_groups.empty?)

                    (grouped ? in_groups : on_their_own) << entry
                  end
                end
         
     | 
| 
      
 135 
     | 
    
         
            +
             
     | 
| 
      
 136 
     | 
    
         
            +
                # Cuts the flat list of grouped rules into groups.
                #
                # A new group begins wherever a user-agent rule directly follows a rule
                # that is not a user-agent rule; consecutive user-agent rules stay together.
                #
                # @param rules [Array<UserAgentRule, AllowRule, DisallowRule>]
                # @return [Array<Group>]
                def group_rules(rules)
                  chunks = rules.slice_when do |previous, current|
                    current.is_a?(UserAgentRule) && !previous.is_a?(UserAgentRule)
                  end

                  chunks.map { |members| Group.new(members) }
                end
         
     | 
| 
      
 143 
     | 
    
         
            +
              end
         
     | 
| 
      
 144 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,98 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # frozen_string_literal: true
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            require_relative "rule"
         
     | 
| 
      
 4 
     | 
    
         
            +
            require "addressable/uri"
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
            module Gort
              # A rule whose value is a path pattern matched against a path and query string.
              #
              # @abstract
              class PathRule < Rule
                # A path pattern is either empty, or begins with a slash or an asterisk and
                # contains no control characters, spaces or hashes. A dollar sign may only
                # appear as the very last character.
                # The value also has to be a valid UTF-8 string, but that is checked during parsing.
                PATH_PATTERN = %r{\A(?:[/*][^\u0000-\u0020\u0023$]*\$?)?\z}u
                private_constant :PATH_PATTERN

                # Does the rule value conform to the path pattern syntax?
                #
                # @return [Boolean]
                def valid?
                  value.match?(PATH_PATTERN)
                end

                # Match the path and query string against the rule.
                # Invalid rules never match.
                # Empty rules never match, either. This is not explicitly stated in the RFC
                # but it is explicitly described in previous robots.txt documents.
                #
                # @param path_and_query [String]
                # @return [nil, (Integer, PathRule)]
                #   - +nil+ if the rule does not match the path and query string.
                #   - An array with the number of bytes matched and the rule itself if the rule matches.
                def match(path_and_query)
                  return nil unless valid?
                  return nil if value.empty?

                  normalized = normalize_path_and_query(path_and_query)
                  hit = regexp.match(normalized)
                  hit && [hit[0].bytesize, self]
                end

                # @!group Formatting Methods

                # A human readable representation of the rule.
                #
                # @return [String]
                # @tool
                #   :nocov:
                def inspect
                  "#<#{self.class.name}:#{object_id} \"#{value}\">"
                end
                # :nocov:

                # Produces a pretty human readable representation of the rule.
                #
                # @param pp [PrettyPrint] pretty printer
                # @return [void]
                # @tool
                #   :nocov:
                def pretty_print(pp)
                  label = "#{self.class.name}/#{object_id}< #{value} >"
                  pp.text(label)
                end
                # :nocov:

                # @!endgroup Formatting Methods

                private

                # Normalizes the input as a URI and strips scheme, authority and fragment from it.
                #
                # @param path_and_query [String]
                # @return [String]
                def normalize_path_and_query(path_and_query)
                  Addressable::URI.parse(path_and_query).normalize.tap { |uri|
                    uri.scheme = nil
                    uri.authority = nil
                    uri.fragment = nil
                  }.to_s
                end

                # The rule value compiled into a regular expression anchored at the start
                # of the string. Built once and memoized.
                #
                # @return [Regexp]
                def regexp
                  @regexp ||=
                    begin
                      # "*" matches any run of characters, "$" anchors the end of the string.
                      wildcards = { "*" => ".*", "$" => "\\z" }
                      # Split the value into wildcard characters and the literal runs between them.
                      tokens = value.scan(/[^*$]+|[*$]/)
                      body = tokens.map { |token|
                        # Literal runs are normalized the same way as the input, then escaped.
                        wildcards.fetch(token) { Regexp.escape(Addressable::URI.normalized_encode(token)) }
                      }.join

                      Regexp.new("\\A#{body}")
                    end
                end
              end
            end
         
     | 
| 
         @@ -0,0 +1,96 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # frozen_string_literal: true
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            module Gort
              # Represents a robots.txt file.
              class RobotsTxt
                ROBOTS_TXT_PATH = "/robots.txt"
                private_constant :ROBOTS_TXT_PATH

                # @param rules [Array<Rule, Group, InvalidLine>] entries of the file
                def initialize(rules)
                  @rules = rules
                end

                # @return [Array<Rule, Group, InvalidLine>]
                attr_reader :rules

                # Is this path allowed for the given user agent?
                #
                # The robots.txt path itself is always allowed. Otherwise the path is allowed
                # when no rule matches or when the winning match is an allow rule.
                #
                # @param user_agent [String]
                # @param path_and_query [String]
                # @return [Boolean]
                # @see PathRule#match
                # @see #disallow?
                def allow?(user_agent, path_and_query)
                  return true if path_and_query == ROBOTS_TXT_PATH

                  candidates = matches(user_agent, path_and_query).compact
                  return true if candidates.empty?

                  # Rule precedence: the longest match wins. On equal length the class name
                  # breaks the tie, relying on "AllowRule" sorting before "DisallowRule".
                  winner = candidates.min_by { |length, rule| [-length, rule.class.name] }
                  winner.last.is_a?(AllowRule)
                end

                # Is this path disallowed for the given user agent?
                # Exact negation of {#allow?}.
                #
                # @param user_agent [String]
                # @param path_and_query [String]
                # @return [Boolean]
                # @see PathRule#match
                # @see #allow?
                def disallow?(user_agent, path_and_query)
                  permitted = allow?(user_agent, path_and_query)
                  !permitted
                end

                # @!group Formatting Methods

                # A human readable representation of the robots.txt.
                #
                # @return [String]
                # @tool
                #   :nocov:
                def inspect
                  %(#<#{self.class.name}:#{object_id} #{rules.inspect}>)
                end
                # :nocov:

                # Produces a pretty human readable representation of the robots.txt.
                #
                # @param pp [PrettyPrint] pretty printer
                # @return [void]
                # @tool
                #   :nocov:
                def pretty_print(pp)
                  header = "#{self.class.name}/#{object_id}"
                  pp.text(header)
                  pp.group(1, "[", "]") {
                    pp.breakable("")
                    pp.seplist(rules) { |rule| pp.pp(rule) }
                    pp.breakable("")
                  }
                end
                # :nocov:

                # @!endgroup Formatting Methods

                private

                # Collects the match results of every path rule found in the valid groups
                # that apply to the user agent.
                #
                # @param user_agent [String]
                # @param path [String]
                # @return [Array<(Integer, PathRule)>]
                def matches(user_agent, path)
                  # @type var applicable: Array<Group>
                  applicable = rules.grep(Group).select { |group| group.valid? && group.apply?(user_agent) }
                  applicable.flat_map do |group|
                    group.rules.grep(PathRule).filter_map { |rule| rule.match(path) }
                  end
                end
              end
            end
         
     | 
    
        data/lib/gort/rule.rb
    ADDED
    
    | 
         @@ -0,0 +1,48 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # frozen_string_literal: true
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            module Gort
              # Generic rule.
              # This represents an entry that looks like a valid rule but otherwise doesn't
              # have a more specialized implementation.
              class Rule
                # @param name [String, Symbol] The name of the rule; it is lower-cased and
                #   stored as a symbol.
                # @param value [String] The value of the rule.
                def initialize(name, value)
                  @value = value
                  @name = name.downcase.to_sym
                end

                # The lower-cased name of the rule.
                # @return [Symbol]
                attr_reader :name

                # The value of the rule, exactly as given to the constructor.
                # @return [String]
                attr_reader :value

                # @!group Formatting Methods

                # A human readable representation of the rule.
                #
                # @return [String]
                # @tool
                #   :nocov:
                def inspect
                  "#<#{self.class.name}:#{object_id} \"#{name}\", \"#{value}\">"
                end
                # :nocov:

                # Produces a pretty human readable representation of the rule.
                #
                # @param pp [PrettyPrint] pretty printer
                # @return [void]
                # @tool
                #   :nocov:
                def pretty_print(pp)
                  label = "#{self.class.name}/#{object_id}< #{name.inspect}, #{value} >"
                  pp.text(label)
                end
                # :nocov:

                # @!endgroup Formatting Methods
              end
            end
         
     | 
| 
         @@ -0,0 +1,25 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # frozen_string_literal: true
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            module Gort
         
     | 
| 
      
 4 
     | 
    
         
            +
              # Abstract rule set.
         
     | 
| 
      
 5 
     | 
    
         
            +
              #
         
     | 
| 
      
 6 
     | 
    
         
            +
              # @abstract
         
     | 
| 
      
 7 
     | 
    
         
            +
              class RuleSet
         
     | 
| 
      
 8 
     | 
    
         
            +
                # @param rules [Array<Group, UserAgentRule, AllowRule, DisallowRule, Rule, InvalidLine>]
         
     | 
| 
      
 9 
     | 
    
         
            +
                #    The rules. Or invalid lines.
         
     | 
| 
      
 10 
     | 
    
         
            +
                def initialize(*rules)
         
     | 
| 
      
 11 
     | 
    
         
            +
                  @rules = rules.flatten.freeze
         
     | 
| 
      
 12 
     | 
    
         
            +
                end
         
     | 
| 
      
 13 
     | 
    
         
            +
             
     | 
| 
      
 14 
     | 
    
         
            +
                # Rules in this set
         
     | 
| 
      
 15 
     | 
    
         
            +
                # @return [Array<Group, UserAgentRule, AllowRule, DisallowRule, Rule, InvalidLine>]
         
     | 
| 
      
 16 
     | 
    
         
            +
                attr_reader :rules
         
     | 
| 
      
 17 
     | 
    
         
            +
             
     | 
| 
      
 18 
     | 
    
         
            +
                # Make a new set by merging this one with another.
         
     | 
| 
      
 19 
     | 
    
         
            +
                # @param other [RuleSet]
         
     | 
| 
      
 20 
     | 
    
         
            +
                # @return [RuleSet]
         
     | 
| 
      
 21 
     | 
    
         
            +
                def merge(other)
         
     | 
| 
      
 22 
     | 
    
         
            +
                  self.class.new(rules + other.rules)
         
     | 
| 
      
 23 
     | 
    
         
            +
                end
         
     | 
| 
      
 24 
     | 
    
         
            +
              end
         
     | 
| 
      
 25 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,52 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # frozen_string_literal: true
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            require_relative "rule"
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            module Gort
         
     | 
| 
      
 6 
     | 
    
         
            +
              # User-agent rule.
         
     | 
| 
      
 7 
     | 
    
         
            +
              class UserAgentRule < Rule
         
     | 
| 
      
 8 
     | 
    
         
            +
                def initialize(value)
         
     | 
| 
      
 9 
     | 
    
         
            +
                  super(:"user-agent", value)
         
     | 
| 
      
 10 
     | 
    
         
            +
                end
         
     | 
| 
      
 11 
     | 
    
         
            +
             
     | 
| 
      
 12 
     | 
    
         
            +
                PRODUCT_TOKEN_RE = /\A([a-z_-]+|\*)\z/i
         
     | 
| 
      
 13 
     | 
    
         
            +
                private_constant :PRODUCT_TOKEN_RE
         
     | 
| 
      
 14 
     | 
    
         
            +
             
     | 
| 
      
 15 
     | 
    
         
            +
                # Returns +true+ if the value is a valid user agent.
         
     | 
| 
      
 16 
     | 
    
         
            +
                #
         
     | 
| 
      
 17 
     | 
    
         
            +
                # A user agent token is a non-empty sequence of letters (a—z, A—Z),
         
     | 
| 
      
 18 
     | 
    
         
            +
                # underscores (_), or hyphens (-). Alternatively, a single asterisk (*) is also allowed.
         
     | 
| 
      
 19 
     | 
    
         
            +
                #
         
     | 
| 
      
 20 
     | 
    
         
            +
                # @return [Boolean]
         
     | 
| 
      
 21 
     | 
    
         
            +
                #   - +true+ if the value is a valid product token
         
     | 
| 
      
 22 
     | 
    
         
            +
                #   - +false+ otherwise
         
     | 
| 
      
 23 
     | 
    
         
            +
                def valid?
         
     | 
| 
      
 24 
     | 
    
         
            +
                  value.match?(PRODUCT_TOKEN_RE)
         
     | 
| 
      
 25 
     | 
    
         
            +
                end
         
     | 
| 
      
 26 
     | 
    
         
            +
             
     | 
| 
      
 27 
     | 
    
         
            +
                # @!group Formatting Methods
         
     | 
| 
      
 28 
     | 
    
         
            +
             
     | 
| 
      
 29 
     | 
    
         
            +
                # A human readable representation of the rule.
         
     | 
| 
      
 30 
     | 
    
         
            +
                #
         
     | 
| 
      
 31 
     | 
    
         
            +
                # @return [String]
         
     | 
| 
      
 32 
     | 
    
         
            +
                # @tool
         
     | 
| 
      
 33 
     | 
    
         
            +
                #   :nocov:
         
     | 
| 
      
 34 
     | 
    
         
            +
                def inspect
         
     | 
| 
      
 35 
     | 
    
         
            +
                  %(#<#{self.class.name}:#{object_id} "#{value}">)
         
     | 
| 
      
 36 
     | 
    
         
            +
                end
         
     | 
| 
      
 37 
     | 
    
         
            +
                # :nocov:
         
     | 
| 
      
 38 
     | 
    
         
            +
             
     | 
| 
      
 39 
     | 
    
         
            +
                # Produces a pretty human readable representation of the rule.
         
     | 
| 
      
 40 
     | 
    
         
            +
                #
         
     | 
| 
      
 41 
     | 
    
         
            +
                # @param pp [PrettyPrint] pretty printer
         
     | 
| 
      
 42 
     | 
    
         
            +
                # @return [void]
         
     | 
| 
      
 43 
     | 
    
         
            +
                # @tool
         
     | 
| 
      
 44 
     | 
    
         
            +
                #   :nocov:
         
     | 
| 
      
 45 
     | 
    
         
            +
                def pretty_print(pp)
         
     | 
| 
      
 46 
     | 
    
         
            +
                  pp.text("#{self.class.name}/#{object_id}< #{value} >")
         
     | 
| 
      
 47 
     | 
    
         
            +
                end
         
     | 
| 
      
 48 
     | 
    
         
            +
                # :nocov:
         
     | 
| 
      
 49 
     | 
    
         
            +
             
     | 
| 
      
 50 
     | 
    
         
            +
                # @!endgroup Formatting Methods
         
     | 
| 
      
 51 
     | 
    
         
            +
              end
         
     | 
| 
      
 52 
     | 
    
         
            +
            end
         
     | 
    
        data/lib/gort/version.rb
    ADDED
    
    
    
        data/lib/gort.rb
    ADDED
    
    | 
         @@ -0,0 +1,19 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # frozen_string_literal: true
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            require_relative "gort/version"
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            # Gort is a robots.txt parser and evaluator.
         
     | 
| 
      
 6 
     | 
    
         
            +
            module Gort
         
     | 
| 
      
 7 
     | 
    
         
            +
              # Gort's top error class. All other errors inherit from this.
         
     | 
| 
      
 8 
     | 
    
         
            +
              class Error < StandardError; end
         
     | 
| 
      
 9 
     | 
    
         
            +
             
     | 
| 
      
 10 
     | 
    
         
            +
              # Parse the given robots.txt input and return a RobotsTxt instance.
         
     | 
| 
      
 11 
     | 
    
         
            +
              #
         
     | 
| 
      
 12 
     | 
    
         
            +
              # @param input [String] the robots.txt input to parse
         
     | 
| 
      
 13 
     | 
    
         
            +
              # @return [RobotsTxt] the parsed robots.txt
         
     | 
| 
      
 14 
     | 
    
         
            +
              def self.parse(input)
         
     | 
| 
      
 15 
     | 
    
         
            +
                Parser.new(input).parse
         
     | 
| 
      
 16 
     | 
    
         
            +
              end
         
     | 
| 
      
 17 
     | 
    
         
            +
            end
         
     | 
| 
      
 18 
     | 
    
         
            +
             
     | 
| 
      
 19 
     | 
    
         
            +
            require_relative "gort/parser"
         
     | 
    
        data.tar.gz.sig
    ADDED
    
    | 
         Binary file 
     | 
    
        metadata
    ADDED
    
    | 
         @@ -0,0 +1,105 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            --- !ruby/object:Gem::Specification
         
     | 
| 
      
 2 
     | 
    
         
            +
            name: gort
         
     | 
| 
      
 3 
     | 
    
         
            +
            version: !ruby/object:Gem::Version
         
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.1.0
         
     | 
| 
      
 5 
     | 
    
         
            +
            platform: ruby
         
     | 
| 
      
 6 
     | 
    
         
            +
            authors:
         
     | 
| 
      
 7 
     | 
    
         
            +
            - Alexander Mankuta
         
     | 
| 
      
 8 
     | 
    
         
            +
            autorequire:
         
     | 
| 
      
 9 
     | 
    
         
            +
            bindir: bin
         
     | 
| 
      
 10 
     | 
    
         
            +
            cert_chain:
         
     | 
| 
      
 11 
     | 
    
         
            +
            - |
         
     | 
| 
      
 12 
     | 
    
         
            +
              -----BEGIN CERTIFICATE-----
         
     | 
| 
      
 13 
     | 
    
         
            +
              MIIC+jCCAeKgAwIBAgIBAzANBgkqhkiG9w0BAQsFADAjMSEwHwYDVQQDDBhhbGV4
         
     | 
| 
      
 14 
     | 
    
         
            +
              L0RDPXBvaW50bGVzcy9EQz1vbmUwHhcNMjMxMTA5MTA1MzIxWhcNMjQxMTA4MTA1
         
     | 
| 
      
 15 
     | 
    
         
            +
              MzIxWjAjMSEwHwYDVQQDDBhhbGV4L0RDPXBvaW50bGVzcy9EQz1vbmUwggEiMA0G
         
     | 
| 
      
 16 
     | 
    
         
            +
              CSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDPOVLPGEK+eaP6zJfifrpWvPTg4qo3
         
     | 
| 
      
 17 
     | 
    
         
            +
              XNJJPom80SwqX2hVCVsRDK4RYgKUQqKRQzHhlx14wZHwWLETBVbNDGX3uqyCnTWU
         
     | 
| 
      
 18 
     | 
    
         
            +
              JUKh3ydiZShXpNHoV/NW7hhEYvNsDcBAjYTmbvXOhuYCo0Tz/0N2Oiun/0wIICtP
         
     | 
| 
      
 19 
     | 
    
         
            +
              vytY9TY0/lklWjAbsqJjNOu3o8IYkJBAN/rU96E/6WhFwjnxLcTnV9RfFRXdjG5j
         
     | 
| 
      
 20 
     | 
    
         
            +
              CughoB2xSwKX8gwbQ8fsnaZRmdyDGYNpz6sGF0zycfiLkTttbLA2nYATCALy98CH
         
     | 
| 
      
 21 
     | 
    
         
            +
              nsyZNsTjb4WINCuY2yEDjwesw9f/ROkNC68EgQ5M+aMjp+D0WcYGfzojAgMBAAGj
         
     | 
| 
      
 22 
     | 
    
         
            +
              OTA3MAkGA1UdEwQCMAAwCwYDVR0PBAQDAgSwMB0GA1UdDgQWBBRPgIwSVbeonua/
         
     | 
| 
      
 23 
     | 
    
         
            +
              Ny/8576oxdUbrjANBgkqhkiG9w0BAQsFAAOCAQEAX28QLxNNz5EgaZZuQQUkbOXB
         
     | 
| 
      
 24 
     | 
    
         
            +
              4b5luBO22535+Vgj2jw7yjV8KKoGMWKrnB00ijgntqPEPXCzaPNibOcPZV5WfWVS
         
     | 
| 
      
 25 
     | 
    
         
            +
              t0Ls8lWE/8kezPwV4SbRe4Y7C+D4J+oirs0L5PtpREV9CJ7kfdW/AN9MtvjjBFlb
         
     | 
| 
      
 26 
     | 
    
         
            +
              jHquD/MiOOMyHtuO0FiTL265m10thcAUsbyi0MehKgGbtJ5fGceHvZDqDouvbMjT
         
     | 
| 
      
 27 
     | 
    
         
            +
              hoijFk1oTY939JhjdcHuJzMiS2TrqIw8Dr5DkQu2vAjHpw0aOOWhlRjNJ7RHYJNm
         
     | 
| 
      
 28 
     | 
    
         
            +
              QugXmCnHQxSKTmc7imKuotyMdRRKFh8UEFCLRsFtBbNxkXyNuB4xBMuUYodhEw==
         
     | 
| 
      
 29 
     | 
    
         
            +
              -----END CERTIFICATE-----
         
     | 
| 
      
 30 
     | 
    
         
            +
            date: 2024-06-22 00:00:00.000000000 Z
         
     | 
| 
      
 31 
     | 
    
         
            +
            dependencies:
         
     | 
| 
      
 32 
     | 
    
         
            +
            - !ruby/object:Gem::Dependency
         
     | 
| 
      
 33 
     | 
    
         
            +
              name: addressable
         
     | 
| 
      
 34 
     | 
    
         
            +
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
      
 35 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 36 
     | 
    
         
            +
                - - "~>"
         
     | 
| 
      
 37 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 38 
     | 
    
         
            +
                    version: '2.8'
         
     | 
| 
      
 39 
     | 
    
         
            +
              type: :runtime
         
     | 
| 
      
 40 
     | 
    
         
            +
              prerelease: false
         
     | 
| 
      
 41 
     | 
    
         
            +
              version_requirements: !ruby/object:Gem::Requirement
         
     | 
| 
      
 42 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 43 
     | 
    
         
            +
                - - "~>"
         
     | 
| 
      
 44 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 45 
     | 
    
         
            +
                    version: '2.8'
         
     | 
| 
      
 46 
     | 
    
         
            +
            - !ruby/object:Gem::Dependency
         
     | 
| 
      
 47 
     | 
    
         
            +
              name: rchardet
         
     | 
| 
      
 48 
     | 
    
         
            +
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
      
 49 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 50 
     | 
    
         
            +
                - - "~>"
         
     | 
| 
      
 51 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 52 
     | 
    
         
            +
                    version: '1.8'
         
     | 
| 
      
 53 
     | 
    
         
            +
              type: :runtime
         
     | 
| 
      
 54 
     | 
    
         
            +
              prerelease: false
         
     | 
| 
      
 55 
     | 
    
         
            +
              version_requirements: !ruby/object:Gem::Requirement
         
     | 
| 
      
 56 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 57 
     | 
    
         
            +
                - - "~>"
         
     | 
| 
      
 58 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 59 
     | 
    
         
            +
                    version: '1.8'
         
     | 
| 
      
 60 
     | 
    
         
            +
            description: robots.txt parser and evaluator according to RFC 9309.
         
     | 
| 
      
 61 
     | 
    
         
            +
            email:
         
     | 
| 
      
 62 
     | 
    
         
            +
            - alex@pointless.one
         
     | 
| 
      
 63 
     | 
    
         
            +
            executables: []
         
     | 
| 
      
 64 
     | 
    
         
            +
            extensions: []
         
     | 
| 
      
 65 
     | 
    
         
            +
            extra_rdoc_files: []
         
     | 
| 
      
 66 
     | 
    
         
            +
            files:
         
     | 
| 
      
 67 
     | 
    
         
            +
            - LICENSE.txt
         
     | 
| 
      
 68 
     | 
    
         
            +
            - lib/gort.rb
         
     | 
| 
      
 69 
     | 
    
         
            +
            - lib/gort/allow_rule.rb
         
     | 
| 
      
 70 
     | 
    
         
            +
            - lib/gort/disallow_rule.rb
         
     | 
| 
      
 71 
     | 
    
         
            +
            - lib/gort/group.rb
         
     | 
| 
      
 72 
     | 
    
         
            +
            - lib/gort/invalid_line.rb
         
     | 
| 
      
 73 
     | 
    
         
            +
            - lib/gort/parser.rb
         
     | 
| 
      
 74 
     | 
    
         
            +
            - lib/gort/path_rule.rb
         
     | 
| 
      
 75 
     | 
    
         
            +
            - lib/gort/robots_txt.rb
         
     | 
| 
      
 76 
     | 
    
         
            +
            - lib/gort/rule.rb
         
     | 
| 
      
 77 
     | 
    
         
            +
            - lib/gort/rule_set.rb
         
     | 
| 
      
 78 
     | 
    
         
            +
            - lib/gort/user_agent_rule.rb
         
     | 
| 
      
 79 
     | 
    
         
            +
            - lib/gort/version.rb
         
     | 
| 
      
 80 
     | 
    
         
            +
            homepage:
         
     | 
| 
      
 81 
     | 
    
         
            +
            licenses:
         
     | 
| 
      
 82 
     | 
    
         
            +
            - MIT
         
     | 
| 
      
 83 
     | 
    
         
            +
            metadata:
         
     | 
| 
      
 84 
     | 
    
         
            +
              allowed_push_host: https://rubygems.org
         
     | 
| 
      
 85 
     | 
    
         
            +
              rubygems_mfa_required: 'true'
         
     | 
| 
      
 86 
     | 
    
         
            +
            post_install_message:
         
     | 
| 
      
 87 
     | 
    
         
            +
            rdoc_options: []
         
     | 
| 
      
 88 
     | 
    
         
            +
            require_paths:
         
     | 
| 
      
 89 
     | 
    
         
            +
            - lib
         
     | 
| 
      
 90 
     | 
    
         
            +
            required_ruby_version: !ruby/object:Gem::Requirement
         
     | 
| 
      
 91 
     | 
    
         
            +
              requirements:
         
     | 
| 
      
 92 
     | 
    
         
            +
              - - ">="
         
     | 
| 
      
 93 
     | 
    
         
            +
                - !ruby/object:Gem::Version
         
     | 
| 
      
 94 
     | 
    
         
            +
                  version: '3.1'
         
     | 
| 
      
 95 
     | 
    
         
            +
            required_rubygems_version: !ruby/object:Gem::Requirement
         
     | 
| 
      
 96 
     | 
    
         
            +
              requirements:
         
     | 
| 
      
 97 
     | 
    
         
            +
              - - ">="
         
     | 
| 
      
 98 
     | 
    
         
            +
                - !ruby/object:Gem::Version
         
     | 
| 
      
 99 
     | 
    
         
            +
                  version: '0'
         
     | 
| 
      
 100 
     | 
    
         
            +
            requirements: []
         
     | 
| 
      
 101 
     | 
    
         
            +
            rubygems_version: 3.5.9
         
     | 
| 
      
 102 
     | 
    
         
            +
            signing_key:
         
     | 
| 
      
 103 
     | 
    
         
            +
            specification_version: 4
         
     | 
| 
      
 104 
     | 
    
         
            +
            summary: robots.txt parser and evaluator.
         
     | 
| 
      
 105 
     | 
    
         
            +
            test_files: []
         
     | 
    
        metadata.gz.sig
    ADDED