gort 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- checksums.yaml.gz.sig +2 -0
- data/LICENSE.txt +21 -0
- data/lib/gort/allow_rule.rb +13 -0
- data/lib/gort/disallow_rule.rb +13 -0
- data/lib/gort/group.rb +85 -0
- data/lib/gort/invalid_line.rb +45 -0
- data/lib/gort/parser.rb +144 -0
- data/lib/gort/path_rule.rb +98 -0
- data/lib/gort/robots_txt.rb +96 -0
- data/lib/gort/rule.rb +48 -0
- data/lib/gort/rule_set.rb +25 -0
- data/lib/gort/user_agent_rule.rb +52 -0
- data/lib/gort/version.rb +6 -0
- data/lib/gort.rb +19 -0
- data.tar.gz.sig +0 -0
- metadata +105 -0
- metadata.gz.sig +2 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: e05f323b6f9699a6f39f7042d494e18856bd954b974cf6943d0a0fa6bdb1f263
|
4
|
+
data.tar.gz: 396813a5d7c484e5408603c17bb76d79ec4debce48edec8a6b8ecf8eeb0231da
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 9978aba28e9e77ea750cf61de9a22137706fc578fbe0bfbc348d9c97b4d516ea3b6a2e11250f115a2eeae8fea9752c48dc5803556fd3e9a4c0dcfc1e4e9a02e0
|
7
|
+
data.tar.gz: cae8fc644d21a8d31da01b2390f801ceb06b0fc857d1c3d900a62496ff111f41636e09316c86520ab829add7248dcfb1d2a818eea0dd04b030429f8734388be8
|
checksums.yaml.gz.sig
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2024 Alexander Mankuta
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "path_rule"
|
4
|
+
|
5
|
+
module Gort
|
6
|
+
# Disallow rule.
|
7
|
+
class DisallowRule < PathRule
|
8
|
+
# @param value [String] the path pattern to disallow.
|
9
|
+
def initialize(value)
|
10
|
+
super(:disallow, value)
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
data/lib/gort/group.rb
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "rule_set"
|
4
|
+
|
5
|
+
module Gort
|
6
|
+
# An access group
|
7
|
+
class Group < RuleSet
|
8
|
+
# Is this group valid?
|
9
|
+
#
|
10
|
+
# A valid group has at least one valid user-agent rule.
|
11
|
+
#
|
12
|
+
# @return [Boolean]
|
13
|
+
# @see UserAgentRule#valid?
|
14
|
+
def valid?
|
15
|
+
@valid ||=
|
16
|
+
rules.any? { |rule| rule.is_a?(UserAgentRule) && rule.valid? }
|
17
|
+
end
|
18
|
+
|
19
|
+
# Does this group apply to this specific user agent?
|
20
|
+
#
|
21
|
+
# This performs user agent matching according to the RFC.
|
22
|
+
#
|
23
|
+
# @param user_agent [String]
|
24
|
+
# @return [Boolean]
|
25
|
+
def apply?(user_agent)
|
26
|
+
apply_to_all? || user_agent.match?(user_agent_regexp)
|
27
|
+
end
|
28
|
+
|
29
|
+
# @!group Formatting Methods
|
30
|
+
|
31
|
+
# A human readable representation of the group.
|
32
|
+
#
|
33
|
+
# @return [String]
|
34
|
+
# @tool
|
35
|
+
# :nocov:
|
36
|
+
def inspect
|
37
|
+
"#<#{self.class.name}:#{object_id} #{rules.inspect}>"
|
38
|
+
end
|
39
|
+
# :nocov:
|
40
|
+
|
41
|
+
# Produces a pretty human readable representation of the group.
|
42
|
+
#
|
43
|
+
# @param pp [PrettyPrint] pretty printer
|
44
|
+
# @return [void]
|
45
|
+
# @tool
|
46
|
+
# :nocov:
|
47
|
+
def pretty_print(pp)
|
48
|
+
pp.text("#{self.class.name}/#{object_id}")
|
49
|
+
pp.group(1, "[", "]") do
|
50
|
+
pp.breakable("")
|
51
|
+
pp.seplist(rules) do |rule|
|
52
|
+
pp.pp(rule)
|
53
|
+
end
|
54
|
+
pp.breakable("")
|
55
|
+
end
|
56
|
+
end
|
57
|
+
# :nocov:
|
58
|
+
|
59
|
+
# @!endgroup Formatting Methods
|
60
|
+
|
61
|
+
private
|
62
|
+
|
63
|
+
# Does this rule apply to all user agents?
|
64
|
+
#
|
65
|
+
# Effectively, is this rule a `*` rule?
|
66
|
+
#
|
67
|
+
# @return [Boolean]
|
68
|
+
def apply_to_all?
|
69
|
+
@apply_to_all ||= rules.any? { |rule| rule.is_a?(UserAgentRule) && rule.valid? && rule.value == "*" }
|
70
|
+
end
|
71
|
+
|
72
|
+
# A compiled Regexp that matches all user agents in this group.
|
73
|
+
#
|
74
|
+
# @return [Regexp]
|
75
|
+
def user_agent_regexp
|
76
|
+
@user_agent_regexp ||=
|
77
|
+
begin
|
78
|
+
specific_user_agent_rules = rules.select { |rule|
|
79
|
+
rule.is_a?(UserAgentRule) && rule.valid? && (rule.value != "*")
|
80
|
+
}
|
81
|
+
Regexp.new(specific_user_agent_rules.map { Regexp.escape(_1.value) }.join("|"), Regexp::IGNORECASE)
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Gort
|
4
|
+
# Represents an invalid line in a robots.txt file.
|
5
|
+
#
|
6
|
+
# @note Technically, the RFC doesn't have invalid lines in its grammar
|
7
|
+
# but there are just too many broken robots.txt files on the internet.
|
8
|
+
#
|
9
|
+
# An invalid line is a line that can not be parsed as a rule and is not a comment.
|
10
|
+
class InvalidLine
|
11
|
+
# @param text [String] content of the line
|
12
|
+
def initialize(text)
|
13
|
+
@value = text
|
14
|
+
end
|
15
|
+
|
16
|
+
# Content of the line.
|
17
|
+
# @return [String]
|
18
|
+
attr_reader :value
|
19
|
+
|
20
|
+
# @!group Formatting Methods
|
21
|
+
|
22
|
+
# A human readable representation of the invalid line.
|
23
|
+
#
|
24
|
+
# @return [String]
|
25
|
+
# @tool
|
26
|
+
# :nocov:
|
27
|
+
def inspect
|
28
|
+
%(#<#{self.class.name}:#{object_id} "#{value}">)
|
29
|
+
end
|
30
|
+
# :nocov:
|
31
|
+
|
32
|
+
# Produces a pretty human readable representation of the invalid line.
|
33
|
+
#
|
34
|
+
# @param pp [PrettyPrint] pretty printer
|
35
|
+
# @return [void]
|
36
|
+
# @tool
|
37
|
+
# :nocov:
|
38
|
+
def pretty_print(pp)
|
39
|
+
pp.text("#{self.class.name}/#{object_id}< #{value} >")
|
40
|
+
end
|
41
|
+
# :nocov:
|
42
|
+
|
43
|
+
# @!endgroup Formatting Methods
|
44
|
+
end
|
45
|
+
end
|
data/lib/gort/parser.rb
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "invalid_line"
|
4
|
+
require_relative "rule"
|
5
|
+
require_relative "user_agent_rule"
|
6
|
+
require_relative "allow_rule"
|
7
|
+
require_relative "disallow_rule"
|
8
|
+
require_relative "rule_set"
|
9
|
+
require_relative "group"
|
10
|
+
require_relative "robots_txt"
|
11
|
+
|
12
|
+
module Gort
|
13
|
+
# robots.txt parser. It implements the parsing logic according to RFC 9309, including errata.
|
14
|
+
class Parser
|
15
|
+
# You may get this error if the input does not look like a text file.
|
16
|
+
class BinaryInputError < Error; end
|
17
|
+
|
18
|
+
# You may get this error if the input looks like a text file but its encoding is invalid.
|
19
|
+
class InvalidEncodingError < Error; end
|
20
|
+
|
21
|
+
UTF_8_BOM = "\ufeff"
|
22
|
+
private_constant :UTF_8_BOM
|
23
|
+
|
24
|
+
# @param input [String] The robots.txt content to parse. It must be encoded in UTF-8 or compatible encoding.
|
25
|
+
def initialize(input)
|
26
|
+
@input = detect_and_fix_encoding(input).then { |string| strip_bom(string) }
|
27
|
+
end
|
28
|
+
|
29
|
+
# RFC does not explicitly define the generic rule name syntax. It only defines that it has to be case-insensitive.
|
30
|
+
# It also provides a few pre-defined rule names such as User-Agent, Allow, and Disallow.
|
31
|
+
# Things that might be different from the RFC intention:
|
32
|
+
# - The rule name must start with a letter. RFC might allow other characters.
|
33
|
+
# - The rule name might contain underscores. RFC doesn't mention underscores.
|
34
|
+
# - The rule name might contain digits. RFC doesn't mention digits, either.
|
35
|
+
#
|
36
|
+
# This is only used for plausible rule detection.
|
37
|
+
RULE_KEY = /\A[a-z][a-z0-9_-]*\s*:/i
|
38
|
+
private_constant :RULE_KEY
|
39
|
+
|
40
|
+
# Actually parse the file.
|
41
|
+
#
|
42
|
+
# @return [Gort::RobotsTxt]
|
43
|
+
def parse
|
44
|
+
content_lines =
|
45
|
+
input.lines.map { |line|
|
46
|
+
line.split("#", 2).first.strip
|
47
|
+
}
|
48
|
+
.reject(&:empty?)
|
49
|
+
|
50
|
+
rules = content_lines.map { |line| parse_line(line) }
|
51
|
+
grouped_rules, standalone_rules = partition_rules(rules)
|
52
|
+
groups = group_rules(grouped_rules)
|
53
|
+
|
54
|
+
RobotsTxt.new(groups + standalone_rules)
|
55
|
+
end
|
56
|
+
|
57
|
+
private
|
58
|
+
|
59
|
+
# @return [String]
|
60
|
+
attr_reader :input
|
61
|
+
|
62
|
+
# @param string [String]
|
63
|
+
# @return [String]
|
64
|
+
def detect_and_fix_encoding(string)
|
65
|
+
string.encode(Encoding::UTF_8)
|
66
|
+
rescue EncodingError
|
67
|
+
require "rchardet"
|
68
|
+
result = CharDet.detect(string)
|
69
|
+
raise BinaryInputError, "Input does not look like text" if result["encoding"].nil? || result["confidence"] < 0.25
|
70
|
+
|
71
|
+
begin
|
72
|
+
string
|
73
|
+
.dup
|
74
|
+
.force_encoding(result["encoding"])
|
75
|
+
.encode(Encoding::UTF_8)
|
76
|
+
rescue EncodingError
|
77
|
+
raise InvalidEncodingError, "Input string looks like text but its encoding is invalid."
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
# @param string [String]
|
82
|
+
# @return [String]
|
83
|
+
def strip_bom(string)
|
84
|
+
if string[0] == UTF_8_BOM
|
85
|
+
string[1..] # Remove BOM
|
86
|
+
else
|
87
|
+
string
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
# @param line [String]
|
92
|
+
# @return [UserAgentRule, AllowRule, DisallowRule, Rule, InvalidLine]
|
93
|
+
def parse_line(line)
|
94
|
+
if line.match?(RULE_KEY)
|
95
|
+
# @type var key: String
|
96
|
+
# @type var value: String
|
97
|
+
key, value = line.split(":", 2).map(&:strip)
|
98
|
+
case key.downcase
|
99
|
+
when "user-agent"
|
100
|
+
UserAgentRule.new(value)
|
101
|
+
when "allow"
|
102
|
+
AllowRule.new(value)
|
103
|
+
when "disallow"
|
104
|
+
DisallowRule.new(value)
|
105
|
+
else
|
106
|
+
Rule.new(key, value)
|
107
|
+
end
|
108
|
+
else
|
109
|
+
InvalidLine.new(line)
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
# @param rules [Array<UserAgentRule, AllowRule, DisallowRule, Rule, InvalidLine>]
|
114
|
+
# @return [(Array<UserAgentRule, AllowRule, DisallowRule>, Array<AllowRule, DisallowRule, Rule, InvalidLine>)]
|
115
|
+
def partition_rules(rules)
|
116
|
+
standalone_rules = []
|
117
|
+
grouped_rules = []
|
118
|
+
rules.each do |rule|
|
119
|
+
case rule
|
120
|
+
when UserAgentRule
|
121
|
+
grouped_rules << rule
|
122
|
+
when AllowRule, DisallowRule
|
123
|
+
if grouped_rules.empty?
|
124
|
+
standalone_rules << rule
|
125
|
+
else
|
126
|
+
grouped_rules << rule
|
127
|
+
end
|
128
|
+
else
|
129
|
+
standalone_rules << rule
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
133
|
+
[grouped_rules, standalone_rules]
|
134
|
+
end
|
135
|
+
|
136
|
+
# @param rules [Array<UserAgentRule, AllowRule, DisallowRule>]
|
137
|
+
# @return [Array<Group>]
|
138
|
+
def group_rules(rules)
|
139
|
+
rules
|
140
|
+
.slice_when { |a, b| !a.is_a?(UserAgentRule) && b.is_a?(UserAgentRule) }
|
141
|
+
.map { |group| Group.new(group) }
|
142
|
+
end
|
143
|
+
end
|
144
|
+
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "rule"
|
4
|
+
require "addressable/uri"
|
5
|
+
|
6
|
+
module Gort
|
7
|
+
# A rule that matches a path and query string.
|
8
|
+
#
|
9
|
+
# @abstract
|
10
|
+
class PathRule < Rule
|
11
|
+
# Path pattern has to start with a slash or an asterisk and not contain control characters, spaces, or a hash sign.
|
12
|
+
# It also has to be a valid UTF-8 string but this is checked during parsing.
|
13
|
+
# It also can be empty.
|
14
|
+
PATH_PATTERN = %r{\A(?:[/*][^\u0000-\u0020\u0023$]*\$?)?\z}u
|
15
|
+
private_constant :PATH_PATTERN
|
16
|
+
|
17
|
+
def valid?
|
18
|
+
value.match?(PATH_PATTERN)
|
19
|
+
end
|
20
|
+
|
21
|
+
# Match the path and query string against the rule.
|
22
|
+
# Invalid rules never match.
|
23
|
+
# Empty rules never match, either. This is not explicitly stated in the RFC
|
24
|
+
# but it is explicitly described in previous robots.txt documents.
|
25
|
+
#
|
26
|
+
# @param path_and_query [String]
|
27
|
+
# @return [nil, (Integer, PathRule)]
|
28
|
+
# - +nil+ if the rule does not match the path and query string.
|
29
|
+
# - An array with the number of bytes matched and the rule itself if the rule matches.
|
30
|
+
def match(path_and_query)
|
31
|
+
return nil if !valid? || value.empty?
|
32
|
+
|
33
|
+
path_and_query = normalize_path_and_query(path_and_query)
|
34
|
+
match = path_and_query.match(regexp)
|
35
|
+
return nil unless match
|
36
|
+
|
37
|
+
[match.to_s.bytesize, self]
|
38
|
+
end
|
39
|
+
|
40
|
+
# @!group Formatting Methods
|
41
|
+
|
42
|
+
# A human readable representation of the rule.
|
43
|
+
#
|
44
|
+
# @return [String]
|
45
|
+
# @tool
|
46
|
+
# :nocov:
|
47
|
+
def inspect
|
48
|
+
%(#<#{self.class.name}:#{object_id} "#{value}">)
|
49
|
+
end
|
50
|
+
# :nocov:
|
51
|
+
|
52
|
+
# Produces a pretty human readable representation of the rule.
|
53
|
+
#
|
54
|
+
# @param pp [PrettyPrint] pretty printer
|
55
|
+
# @return [void]
|
56
|
+
# @tool
|
57
|
+
# :nocov:
|
58
|
+
def pretty_print(pp)
|
59
|
+
pp.text("#{self.class.name}/#{object_id}< #{value} >")
|
60
|
+
end
|
61
|
+
# :nocov:
|
62
|
+
|
63
|
+
# @!endgroup Formatting Methods
|
64
|
+
|
65
|
+
private
|
66
|
+
|
67
|
+
# @param path_and_query [String]
|
68
|
+
# @return [String]
|
69
|
+
def normalize_path_and_query(path_and_query)
|
70
|
+
pq = Addressable::URI.parse(path_and_query).normalize
|
71
|
+
pq.scheme = nil
|
72
|
+
pq.authority = nil
|
73
|
+
pq.fragment = nil
|
74
|
+
pq.to_s
|
75
|
+
end
|
76
|
+
|
77
|
+
# @return [Regexp]
|
78
|
+
def regexp
|
79
|
+
@regexp ||=
|
80
|
+
begin
|
81
|
+
parts = value.scan(/[^*$]+|[*$]/)
|
82
|
+
regexp_parts =
|
83
|
+
parts.map { |part|
|
84
|
+
case part
|
85
|
+
when "*"
|
86
|
+
".*"
|
87
|
+
when "$"
|
88
|
+
"\\z"
|
89
|
+
else
|
90
|
+
Regexp.escape(Addressable::URI.normalized_encode(part))
|
91
|
+
end
|
92
|
+
}
|
93
|
+
|
94
|
+
Regexp.new("\\A#{regexp_parts.join}")
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Gort
|
4
|
+
# Represents a robots.txt file.
|
5
|
+
class RobotsTxt
|
6
|
+
ROBOTS_TXT_PATH = "/robots.txt"
|
7
|
+
private_constant :ROBOTS_TXT_PATH
|
8
|
+
|
9
|
+
def initialize(rules)
|
10
|
+
@rules = rules
|
11
|
+
end
|
12
|
+
|
13
|
+
# @return [Array<Rule, Group, InvalidLine>]
|
14
|
+
attr_reader :rules
|
15
|
+
|
16
|
+
# Is this path allowed for the given user agent?
|
17
|
+
#
|
18
|
+
# @param user_agent [String]
|
19
|
+
# @param path_and_query [String]
|
20
|
+
# @return [Boolean]
|
21
|
+
# @see PathRule#match
|
22
|
+
# @see #disallow?
|
23
|
+
def allow?(user_agent, path_and_query)
|
24
|
+
return true if path_and_query == ROBOTS_TXT_PATH
|
25
|
+
|
26
|
+
top_match =
|
27
|
+
matches(user_agent, path_and_query)
|
28
|
+
.compact
|
29
|
+
# This is an arcane bit.
|
30
|
+
# The rules are reverse sorted by match length (i.e. longest first),
|
31
|
+
# and then by class name using the fact that allow goes before disallow.
|
32
|
+
# This is the rule precedence order defined in the RFC.
|
33
|
+
.min_by { |(match_length, rule)| [-match_length, rule.class.name] }
|
34
|
+
|
35
|
+
# Allow if there is no match or the top match is an allow rule.
|
36
|
+
top_match.nil? || top_match.last.is_a?(AllowRule)
|
37
|
+
end
|
38
|
+
|
39
|
+
# Is this path disallowed for the given user agent?
|
40
|
+
#
|
41
|
+
# @param user_agent [String]
|
42
|
+
# @param path_and_query [String]
|
43
|
+
# @return [Boolean]
|
44
|
+
# @see PathRule#match
|
45
|
+
# @see #allow?
|
46
|
+
def disallow?(user_agent, path_and_query)
|
47
|
+
!allow?(user_agent, path_and_query)
|
48
|
+
end
|
49
|
+
|
50
|
+
# @!group Formatting Methods
|
51
|
+
|
52
|
+
# A human readable representation of the robots.txt.
|
53
|
+
#
|
54
|
+
# @return [String]
|
55
|
+
# @tool
|
56
|
+
# :nocov:
|
57
|
+
def inspect
|
58
|
+
"#<#{self.class.name}:#{object_id} #{rules.inspect}>"
|
59
|
+
end
|
60
|
+
# :nocov:
|
61
|
+
|
62
|
+
# Produces a pretty human readable representation of the robots.txt.
|
63
|
+
#
|
64
|
+
# @param pp [PrettyPrint] pretty printer
|
65
|
+
# @return [void]
|
66
|
+
# @tool
|
67
|
+
# :nocov:
|
68
|
+
def pretty_print(pp)
|
69
|
+
pp.text("#{self.class.name}/#{object_id}")
|
70
|
+
pp.group(1, "[", "]") do
|
71
|
+
pp.breakable("")
|
72
|
+
pp.seplist(rules) do |rule|
|
73
|
+
pp.pp(rule)
|
74
|
+
end
|
75
|
+
pp.breakable("")
|
76
|
+
end
|
77
|
+
end
|
78
|
+
# :nocov:
|
79
|
+
|
80
|
+
# @!endgroup Formatting Methods
|
81
|
+
|
82
|
+
private
|
83
|
+
|
84
|
+
def matches(user_agent, path)
|
85
|
+
# @type var groups: Array<Group>
|
86
|
+
groups = rules.select { |rule| rule.is_a?(Group) && rule.valid? && rule.apply?(user_agent) }
|
87
|
+
groups.flat_map do |group|
|
88
|
+
group.rules.filter_map do |rule|
|
89
|
+
next unless rule.is_a?(PathRule)
|
90
|
+
|
91
|
+
rule.match(path)
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
data/lib/gort/rule.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Gort
|
4
|
+
# Generic rule.
|
5
|
+
# This represents an entry that looks like a valid rule but otherwise doesn't
|
6
|
+
# have a more specialized implementation.
|
7
|
+
class Rule
|
8
|
+
# @param name [String, Symbol] The name of the rule. It is downcased and converted to a Symbol.
|
9
|
+
# @param value [String] The value of the rule.
|
10
|
+
def initialize(name, value)
|
11
|
+
@name = name.downcase.to_sym
|
12
|
+
@value = value
|
13
|
+
end
|
14
|
+
|
15
|
+
# The name of the rule.
|
16
|
+
# @return [Symbol]
|
17
|
+
attr_reader :name
|
18
|
+
|
19
|
+
# The value of the rule.
|
20
|
+
# @return [String]
|
21
|
+
attr_reader :value
|
22
|
+
|
23
|
+
# @!group Formatting Methods
|
24
|
+
|
25
|
+
# A human readable representation of the rule.
|
26
|
+
#
|
27
|
+
# @return [String]
|
28
|
+
# @tool
|
29
|
+
# :nocov:
|
30
|
+
def inspect
|
31
|
+
%(#<#{self.class.name}:#{object_id} "#{name}", "#{value}">)
|
32
|
+
end
|
33
|
+
# :nocov:
|
34
|
+
|
35
|
+
# Produces a pretty human readable representation of the rule.
|
36
|
+
#
|
37
|
+
# @param pp [PrettyPrint] pretty printer
|
38
|
+
# @return [void]
|
39
|
+
# @tool
|
40
|
+
# :nocov:
|
41
|
+
def pretty_print(pp)
|
42
|
+
pp.text("#{self.class.name}/#{object_id}< #{name.inspect}, #{value} >")
|
43
|
+
end
|
44
|
+
# :nocov:
|
45
|
+
|
46
|
+
# @!endgroup Formatting Methods
|
47
|
+
end
|
48
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Gort
|
4
|
+
# Abstract rule set.
|
5
|
+
#
|
6
|
+
# @abstract
|
7
|
+
class RuleSet
|
8
|
+
# @param rules [Array<Group, UserAgentRule, AllowRule, DisallowRule, Rule, InvalidLine>]
|
9
|
+
# The rules. Or invalid lines.
|
10
|
+
def initialize(*rules)
|
11
|
+
@rules = rules.flatten.freeze
|
12
|
+
end
|
13
|
+
|
14
|
+
# Rules in this set
|
15
|
+
# @return [Array<Group, UserAgentRule, AllowRule, DisallowRule, Rule, InvalidLine>]
|
16
|
+
attr_reader :rules
|
17
|
+
|
18
|
+
# Make a new set by merging this one with another.
|
19
|
+
# @param other [RuleSet]
|
20
|
+
# @return [RuleSet]
|
21
|
+
def merge(other)
|
22
|
+
self.class.new(rules + other.rules)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "rule"
|
4
|
+
|
5
|
+
module Gort
|
6
|
+
# User-agent rule.
|
7
|
+
class UserAgentRule < Rule
|
8
|
+
def initialize(value)
|
9
|
+
super(:"user-agent", value)
|
10
|
+
end
|
11
|
+
|
12
|
+
PRODUCT_TOKEN_RE = /\A([a-z_-]+|\*)\z/i
|
13
|
+
private_constant :PRODUCT_TOKEN_RE
|
14
|
+
|
15
|
+
# Returns +true+ if the value is a valid user agent.
|
16
|
+
#
|
17
|
+
# A user agent token is a sequence of letters (a—z, A—Z),
|
18
|
+
# underscores (_), or hyphens (-). Alternatively, a single asterisk (*) is also allowed.
|
19
|
+
#
|
20
|
+
# @return [Boolean]
|
21
|
+
# - +true+ if the value is a valid product token
|
22
|
+
# - +false+ otherwise
|
23
|
+
def valid?
|
24
|
+
value.match?(PRODUCT_TOKEN_RE)
|
25
|
+
end
|
26
|
+
|
27
|
+
# @!group Formatting Methods
|
28
|
+
|
29
|
+
# A human readable representation of the rule.
|
30
|
+
#
|
31
|
+
# @return [String]
|
32
|
+
# @tool
|
33
|
+
# :nocov:
|
34
|
+
def inspect
|
35
|
+
%(#<#{self.class.name}:#{object_id} "#{value}">)
|
36
|
+
end
|
37
|
+
# :nocov:
|
38
|
+
|
39
|
+
# Produces a pretty human readable representation of the rule.
|
40
|
+
#
|
41
|
+
# @param pp [PrettyPrint] pretty printer
|
42
|
+
# @return [void]
|
43
|
+
# @tool
|
44
|
+
# :nocov:
|
45
|
+
def pretty_print(pp)
|
46
|
+
pp.text("#{self.class.name}/#{object_id}< #{value} >")
|
47
|
+
end
|
48
|
+
# :nocov:
|
49
|
+
|
50
|
+
# @!endgroup Formatting Methods
|
51
|
+
end
|
52
|
+
end
|
data/lib/gort/version.rb
ADDED
data/lib/gort.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "gort/version"
|
4
|
+
|
5
|
+
# Gort is a robots.txt parser and evaluator.
|
6
|
+
module Gort
|
7
|
+
# Gort's top error class. All other errors inherit from this.
|
8
|
+
class Error < StandardError; end
|
9
|
+
|
10
|
+
# Parse the given robots.txt input and return a RobotsTxt instance.
|
11
|
+
#
|
12
|
+
# @param input [String] the robots.txt input to parse
|
13
|
+
# @return [RobotsTxt] the parsed robots.txt
|
14
|
+
def self.parse(input)
|
15
|
+
Parser.new(input).parse
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
require_relative "gort/parser"
|
data.tar.gz.sig
ADDED
Binary file
|
metadata
ADDED
@@ -0,0 +1,105 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: gort
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Alexander Mankuta
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain:
|
11
|
+
- |
|
12
|
+
-----BEGIN CERTIFICATE-----
|
13
|
+
MIIC+jCCAeKgAwIBAgIBAzANBgkqhkiG9w0BAQsFADAjMSEwHwYDVQQDDBhhbGV4
|
14
|
+
L0RDPXBvaW50bGVzcy9EQz1vbmUwHhcNMjMxMTA5MTA1MzIxWhcNMjQxMTA4MTA1
|
15
|
+
MzIxWjAjMSEwHwYDVQQDDBhhbGV4L0RDPXBvaW50bGVzcy9EQz1vbmUwggEiMA0G
|
16
|
+
CSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDPOVLPGEK+eaP6zJfifrpWvPTg4qo3
|
17
|
+
XNJJPom80SwqX2hVCVsRDK4RYgKUQqKRQzHhlx14wZHwWLETBVbNDGX3uqyCnTWU
|
18
|
+
JUKh3ydiZShXpNHoV/NW7hhEYvNsDcBAjYTmbvXOhuYCo0Tz/0N2Oiun/0wIICtP
|
19
|
+
vytY9TY0/lklWjAbsqJjNOu3o8IYkJBAN/rU96E/6WhFwjnxLcTnV9RfFRXdjG5j
|
20
|
+
CughoB2xSwKX8gwbQ8fsnaZRmdyDGYNpz6sGF0zycfiLkTttbLA2nYATCALy98CH
|
21
|
+
nsyZNsTjb4WINCuY2yEDjwesw9f/ROkNC68EgQ5M+aMjp+D0WcYGfzojAgMBAAGj
|
22
|
+
OTA3MAkGA1UdEwQCMAAwCwYDVR0PBAQDAgSwMB0GA1UdDgQWBBRPgIwSVbeonua/
|
23
|
+
Ny/8576oxdUbrjANBgkqhkiG9w0BAQsFAAOCAQEAX28QLxNNz5EgaZZuQQUkbOXB
|
24
|
+
4b5luBO22535+Vgj2jw7yjV8KKoGMWKrnB00ijgntqPEPXCzaPNibOcPZV5WfWVS
|
25
|
+
t0Ls8lWE/8kezPwV4SbRe4Y7C+D4J+oirs0L5PtpREV9CJ7kfdW/AN9MtvjjBFlb
|
26
|
+
jHquD/MiOOMyHtuO0FiTL265m10thcAUsbyi0MehKgGbtJ5fGceHvZDqDouvbMjT
|
27
|
+
hoijFk1oTY939JhjdcHuJzMiS2TrqIw8Dr5DkQu2vAjHpw0aOOWhlRjNJ7RHYJNm
|
28
|
+
QugXmCnHQxSKTmc7imKuotyMdRRKFh8UEFCLRsFtBbNxkXyNuB4xBMuUYodhEw==
|
29
|
+
-----END CERTIFICATE-----
|
30
|
+
date: 2024-06-22 00:00:00.000000000 Z
|
31
|
+
dependencies:
|
32
|
+
- !ruby/object:Gem::Dependency
|
33
|
+
name: addressable
|
34
|
+
requirement: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - "~>"
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '2.8'
|
39
|
+
type: :runtime
|
40
|
+
prerelease: false
|
41
|
+
version_requirements: !ruby/object:Gem::Requirement
|
42
|
+
requirements:
|
43
|
+
- - "~>"
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '2.8'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: rchardet
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - "~>"
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '1.8'
|
53
|
+
type: :runtime
|
54
|
+
prerelease: false
|
55
|
+
version_requirements: !ruby/object:Gem::Requirement
|
56
|
+
requirements:
|
57
|
+
- - "~>"
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: '1.8'
|
60
|
+
description: robots.txt parser and evaluator according to RFC 9309.
|
61
|
+
email:
|
62
|
+
- alex@pointless.one
|
63
|
+
executables: []
|
64
|
+
extensions: []
|
65
|
+
extra_rdoc_files: []
|
66
|
+
files:
|
67
|
+
- LICENSE.txt
|
68
|
+
- lib/gort.rb
|
69
|
+
- lib/gort/allow_rule.rb
|
70
|
+
- lib/gort/disallow_rule.rb
|
71
|
+
- lib/gort/group.rb
|
72
|
+
- lib/gort/invalid_line.rb
|
73
|
+
- lib/gort/parser.rb
|
74
|
+
- lib/gort/path_rule.rb
|
75
|
+
- lib/gort/robots_txt.rb
|
76
|
+
- lib/gort/rule.rb
|
77
|
+
- lib/gort/rule_set.rb
|
78
|
+
- lib/gort/user_agent_rule.rb
|
79
|
+
- lib/gort/version.rb
|
80
|
+
homepage:
|
81
|
+
licenses:
|
82
|
+
- MIT
|
83
|
+
metadata:
|
84
|
+
allowed_push_host: https://rubygems.org
|
85
|
+
rubygems_mfa_required: 'true'
|
86
|
+
post_install_message:
|
87
|
+
rdoc_options: []
|
88
|
+
require_paths:
|
89
|
+
- lib
|
90
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
91
|
+
requirements:
|
92
|
+
- - ">="
|
93
|
+
- !ruby/object:Gem::Version
|
94
|
+
version: '3.1'
|
95
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
96
|
+
requirements:
|
97
|
+
- - ">="
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: '0'
|
100
|
+
requirements: []
|
101
|
+
rubygems_version: 3.5.9
|
102
|
+
signing_key:
|
103
|
+
specification_version: 4
|
104
|
+
summary: robots.txt parser and evaluator.
|
105
|
+
test_files: []
|
metadata.gz.sig
ADDED