gort 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- checksums.yaml.gz.sig +2 -0
- data/LICENSE.txt +21 -0
- data/lib/gort/allow_rule.rb +13 -0
- data/lib/gort/disallow_rule.rb +13 -0
- data/lib/gort/group.rb +85 -0
- data/lib/gort/invalid_line.rb +45 -0
- data/lib/gort/parser.rb +144 -0
- data/lib/gort/path_rule.rb +98 -0
- data/lib/gort/robots_txt.rb +96 -0
- data/lib/gort/rule.rb +48 -0
- data/lib/gort/rule_set.rb +25 -0
- data/lib/gort/user_agent_rule.rb +52 -0
- data/lib/gort/version.rb +6 -0
- data/lib/gort.rb +19 -0
- data.tar.gz.sig +0 -0
- metadata +105 -0
- metadata.gz.sig +2 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: e05f323b6f9699a6f39f7042d494e18856bd954b974cf6943d0a0fa6bdb1f263
|
4
|
+
data.tar.gz: 396813a5d7c484e5408603c17bb76d79ec4debce48edec8a6b8ecf8eeb0231da
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 9978aba28e9e77ea750cf61de9a22137706fc578fbe0bfbc348d9c97b4d516ea3b6a2e11250f115a2eeae8fea9752c48dc5803556fd3e9a4c0dcfc1e4e9a02e0
|
7
|
+
data.tar.gz: cae8fc644d21a8d31da01b2390f801ceb06b0fc857d1c3d900a62496ff111f41636e09316c86520ab829add7248dcfb1d2a818eea0dd04b030429f8734388be8
|
checksums.yaml.gz.sig
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2024 Alexander Mankuta
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "path_rule"
|
4
|
+
|
5
|
+
module Gort
|
6
|
+
# Disallow rule.
|
7
|
+
class DisallowRule < PathRule
|
8
|
+
# @param value [String] the path pattern to disallow.
|
9
|
+
def initialize(value)
|
10
|
+
super(:disallow, value)
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
data/lib/gort/group.rb
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "rule_set"
|
4
|
+
|
5
|
+
module Gort
|
6
|
+
# An access group
|
7
|
+
class Group < RuleSet
|
8
|
+
# Is this group valid?
#
# A valid group has at least one valid user-agent rule.
# The answer is computed once and cached, including a negative answer.
#
# @return [Boolean]
# @see UserAgentRule#valid?
def valid?
  # `||=` would re-scan the rules on every call whenever the answer is +false+,
  # so check for the instance variable explicitly.
  return @valid if defined?(@valid)

  @valid = rules.any? { |rule| rule.is_a?(UserAgentRule) && rule.valid? }
end
|
18
|
+
|
19
|
+
# Does this group apply to this specific user agent?
|
20
|
+
#
|
21
|
+
# This performs user agent matching according to the RFC.
|
22
|
+
#
|
23
|
+
# @param user_agent [String]
|
24
|
+
# @return [Boolean]
|
25
|
+
def apply?(user_agent)
|
26
|
+
apply_to_all? || user_agent.match?(user_agent_regexp)
|
27
|
+
end
|
28
|
+
|
29
|
+
# @!group Formatting Methods
|
30
|
+
|
31
|
+
# A human readable representation of the group.
|
32
|
+
#
|
33
|
+
# @return [String]
|
34
|
+
# @tool
|
35
|
+
# :nocov:
|
36
|
+
def inspect
|
37
|
+
"#<#{self.class.name}:#{object_id} #{rules.inspect}>"
|
38
|
+
end
|
39
|
+
# :nocov:
|
40
|
+
|
41
|
+
# Produces a pretty human readable representation of the group.
|
42
|
+
#
|
43
|
+
# @param pp [PrettyPrint] pretty printer
|
44
|
+
# @return [void]
|
45
|
+
# @tool
|
46
|
+
# :nocov:
|
47
|
+
def pretty_print(pp)
|
48
|
+
pp.text("#{self.class.name}/#{object_id}")
|
49
|
+
pp.group(1, "[", "]") do
|
50
|
+
pp.breakable("")
|
51
|
+
pp.seplist(rules) do |rule|
|
52
|
+
pp.pp(rule)
|
53
|
+
end
|
54
|
+
pp.breakable("")
|
55
|
+
end
|
56
|
+
end
|
57
|
+
# :nocov:
|
58
|
+
|
59
|
+
# @!endgroup Formatting Methods
|
60
|
+
|
61
|
+
private
|
62
|
+
|
63
|
+
# Does this group apply to all user agents?
#
# Effectively, does this group contain a valid `*` user-agent rule?
# The answer is computed once and cached, including a negative answer.
#
# @return [Boolean]
def apply_to_all?
  # Most groups are not wildcard groups, so +false+ is the common answer;
  # `||=` would never cache it.
  return @apply_to_all if defined?(@apply_to_all)

  @apply_to_all = rules.any? { |rule| rule.is_a?(UserAgentRule) && rule.valid? && rule.value == "*" }
end
|
71
|
+
|
72
|
+
# A compiled Regexp that matches all user agents in this group.
#
# Only valid, non-wildcard user-agent rules contribute. Matching is
# case-insensitive and unanchored. When the group has no such rules
# the Regexp never matches.
#
# @return [Regexp]
def user_agent_regexp
  @user_agent_regexp ||=
    begin
      product_tokens =
        rules
          .select { |rule| rule.is_a?(UserAgentRule) && rule.valid? && rule.value != "*" }
          .map(&:value)

      # Regexp.union escapes every token and, unlike joining an empty list
      # into "" (which matches everything), yields a never-matching pattern
      # when there are no tokens.
      Regexp.new(Regexp.union(product_tokens).source, Regexp::IGNORECASE)
    end
end
|
84
|
+
end
|
85
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Gort
|
4
|
+
# Represents an invalid line in a robots.txt file.
|
5
|
+
#
|
6
|
+
# @note Technically, the RFC doesn't have invalid lines in its grammar
|
7
|
+
# but there are just too many broken robots.txt files on the internet.
|
8
|
+
#
|
9
|
+
# An invalid line is a line that can not be parsed as a rule and is not a comment.
|
10
|
+
class InvalidLine
|
11
|
+
# @param text [String] content of the line
|
12
|
+
def initialize(text)
|
13
|
+
@value = text
|
14
|
+
end
|
15
|
+
|
16
|
+
# Content of the line.
|
17
|
+
# @return [String]
|
18
|
+
attr_reader :value
|
19
|
+
|
20
|
+
# @!group Formatting Methods
|
21
|
+
|
22
|
+
# A human readable representation of the invalid line.
|
23
|
+
#
|
24
|
+
# @return [String]
|
25
|
+
# @tool
|
26
|
+
# :nocov:
|
27
|
+
def inspect
|
28
|
+
%(#<#{self.class.name}:#{object_id} "#{value}">)
|
29
|
+
end
|
30
|
+
# :nocov:
|
31
|
+
|
32
|
+
# Produces a pretty human readable representation of the invalid line.
|
33
|
+
#
|
34
|
+
# @param pp [PrettyPrint] pretty printer
|
35
|
+
# @return [void]
|
36
|
+
# @tool
|
37
|
+
# :nocov:
|
38
|
+
def pretty_print(pp)
|
39
|
+
pp.text("#{self.class.name}/#{object_id}< #{value} >")
|
40
|
+
end
|
41
|
+
# :nocov:
|
42
|
+
|
43
|
+
# @!endgroup Formatting Methods
|
44
|
+
end
|
45
|
+
end
|
data/lib/gort/parser.rb
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "invalid_line"
|
4
|
+
require_relative "rule"
|
5
|
+
require_relative "user_agent_rule"
|
6
|
+
require_relative "allow_rule"
|
7
|
+
require_relative "disallow_rule"
|
8
|
+
require_relative "rule_set"
|
9
|
+
require_relative "group"
|
10
|
+
require_relative "robots_txt"
|
11
|
+
|
12
|
+
module Gort
|
13
|
+
# robots.txt parser. It implements the parsing logic according to RFC 9309, including errata.
|
14
|
+
class Parser
|
15
|
+
# You may get this error if the input does not look like a text file.
|
16
|
+
class BinaryInputError < Error; end
|
17
|
+
|
18
|
+
# You may get this error if the input looks like a text file but its encoding is invalid.
|
19
|
+
class InvalidEncodingError < Error; end
|
20
|
+
|
21
|
+
UTF_8_BOM = "\ufeff"
|
22
|
+
private_constant :UTF_8_BOM
|
23
|
+
|
24
|
+
# @param input [String] The robots.txt content to parse. It must be encoded in UTF-8 or compatible encoding.
|
25
|
+
def initialize(input)
|
26
|
+
@input = detect_and_fix_encoding(input).then { |string| strip_bom(string) }
|
27
|
+
end
|
28
|
+
|
29
|
+
# RFC does not explicitly define the generic rule name syntax. It only defines that it has to be case-insensitive.
|
30
|
+
# It also provides a few pre-defined rule names such as User-Agent, Allow, and Disallow.
|
31
|
+
# Things that might be different from the RFC intention:
|
32
|
+
# - The rule name must start with a letter. RFC might allow other characters.
|
33
|
+
# - The rule name might contain underscores. RFC doesn't mention underscores.
|
34
|
+
# - The rule name might contain digits. RFC doesn't mention digits, either.
|
35
|
+
#
|
36
|
+
# This is only used for plausible rule detection.
|
37
|
+
RULE_KEY = /\A[a-z][a-z0-9_-]*\s*:/i
|
38
|
+
private_constant :RULE_KEY
|
39
|
+
|
40
|
+
# Actually parse the file.
|
41
|
+
#
|
42
|
+
# @return [Gort::RobotsTxt]
|
43
|
+
def parse
|
44
|
+
content_lines =
|
45
|
+
input.lines.map { |line|
|
46
|
+
line.split("#", 2).first.strip
|
47
|
+
}
|
48
|
+
.reject(&:empty?)
|
49
|
+
|
50
|
+
rules = content_lines.map { |line| parse_line(line) }
|
51
|
+
grouped_rules, standalone_rules = partition_rules(rules)
|
52
|
+
groups = group_rules(grouped_rules)
|
53
|
+
|
54
|
+
RobotsTxt.new(groups + standalone_rules)
|
55
|
+
end
|
56
|
+
|
57
|
+
private
|
58
|
+
|
59
|
+
# @return [String]
|
60
|
+
attr_reader :input
|
61
|
+
|
62
|
+
# Convert the input to UTF-8, guessing the source encoding when the declared one does not work.
#
# @param string [String]
# @return [String] the content transcoded to UTF-8
# @raise [BinaryInputError] if no plausible text encoding can be detected
# @raise [InvalidEncodingError] if the bytes are invalid in the detected encoding
def detect_and_fix_encoding(string)
  transcode_to_utf8(string)
rescue EncodingError
  # Loaded lazily: detection is only needed when the declared encoding is unusable.
  require "rchardet"
  result = CharDet.detect(string)
  raise BinaryInputError, "Input does not look like text" if result["encoding"].nil? || result["confidence"] < 0.25

  begin
    transcode_to_utf8(string.dup.force_encoding(result["encoding"]))
  rescue EncodingError
    raise InvalidEncodingError, "Input string looks like text but its encoding is invalid."
  end
end

# Transcode to UTF-8, rejecting byte sequences that are invalid in the string's current encoding.
#
# String#encode is a no-op when the source and target encodings are the same,
# so a UTF-8-tagged string with broken bytes would otherwise pass through unchecked.
#
# @param string [String]
# @return [String]
# @raise [EncodingError] if the string is invalid in its encoding or can not be converted
def transcode_to_utf8(string)
  raise EncodingError, "invalid byte sequence in #{string.encoding}" unless string.valid_encoding?

  string.encode(Encoding::UTF_8)
end
|
80
|
+
|
81
|
+
# @param string [String]
|
82
|
+
# @return [String]
|
83
|
+
def strip_bom(string)
|
84
|
+
if string[0] == UTF_8_BOM
|
85
|
+
string[1..] # Remove BOM
|
86
|
+
else
|
87
|
+
string
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
# @param line [String]
|
92
|
+
# @return [UserAgentRule, AllowRule, DisallowRule, Rule, InvalidLine]
|
93
|
+
def parse_line(line)
|
94
|
+
if line.match?(RULE_KEY)
|
95
|
+
# @type var key: String
|
96
|
+
# @type var value: String
|
97
|
+
key, value = line.split(":", 2).map(&:strip)
|
98
|
+
case key.downcase
|
99
|
+
when "user-agent"
|
100
|
+
UserAgentRule.new(value)
|
101
|
+
when "allow"
|
102
|
+
AllowRule.new(value)
|
103
|
+
when "disallow"
|
104
|
+
DisallowRule.new(value)
|
105
|
+
else
|
106
|
+
Rule.new(key, value)
|
107
|
+
end
|
108
|
+
else
|
109
|
+
InvalidLine.new(line)
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
# @param rules [Array<UserAgentRule, AllowRule, DisallowRule, Rule, InvalidLine>]
|
114
|
+
# @return [(Array<UserAgentRule, AllowRule, DisallowRule>, Array<AllowRule, DisallowRule, Rule, InvalidLine>)]
|
115
|
+
def partition_rules(rules)
|
116
|
+
standalone_rules = []
|
117
|
+
grouped_rules = []
|
118
|
+
rules.each do |rule|
|
119
|
+
case rule
|
120
|
+
when UserAgentRule
|
121
|
+
grouped_rules << rule
|
122
|
+
when AllowRule, DisallowRule
|
123
|
+
if grouped_rules.empty?
|
124
|
+
standalone_rules << rule
|
125
|
+
else
|
126
|
+
grouped_rules << rule
|
127
|
+
end
|
128
|
+
else
|
129
|
+
standalone_rules << rule
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
133
|
+
[grouped_rules, standalone_rules]
|
134
|
+
end
|
135
|
+
|
136
|
+
# @param rules [Array<UserAgentRule, AllowRule, DisallowRule>]
|
137
|
+
# @return [Array<Group>]
|
138
|
+
def group_rules(rules)
|
139
|
+
rules
|
140
|
+
.slice_when { |a, b| !a.is_a?(UserAgentRule) && b.is_a?(UserAgentRule) }
|
141
|
+
.map { |group| Group.new(group) }
|
142
|
+
end
|
143
|
+
end
|
144
|
+
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "rule"
|
4
|
+
require "addressable/uri"
|
5
|
+
|
6
|
+
module Gort
|
7
|
+
# A rule that matches a path and query string.
|
8
|
+
#
|
9
|
+
# @abstract
|
10
|
+
class PathRule < Rule
|
11
|
+
# A path pattern has to start with a slash or an asterisk and must not contain control characters, spaces, or a hash.
|
12
|
+
# It also has to be a valid UTF-8 string but this is checked during parsing.
|
13
|
+
# It also can be empty.
|
14
|
+
PATH_PATTERN = %r{\A(?:[/*][^\u0000-\u0020\u0023$]*\$?)?\z}u
|
15
|
+
private_constant :PATH_PATTERN
|
16
|
+
|
17
|
+
def valid?
|
18
|
+
value.match?(PATH_PATTERN)
|
19
|
+
end
|
20
|
+
|
21
|
+
# Match the path and query string against the rule.
|
22
|
+
# Invalid rules never match.
|
23
|
+
# Empty rules never match, either. This is not explicitly stated in the RFC
|
24
|
+
# but it is explicitly described in previous robots.txt documents.
|
25
|
+
#
|
26
|
+
# @param path_and_query [String]
|
27
|
+
# @return [nil, (Integer, PathRule)]
|
28
|
+
# - +nil+ if the rule does not match the path and query string.
|
29
|
+
# - An array with the number of bytes matched and the rule itself if the rule matches.
|
30
|
+
def match(path_and_query)
|
31
|
+
return nil if !valid? || value.empty?
|
32
|
+
|
33
|
+
path_and_query = normalize_path_and_query(path_and_query)
|
34
|
+
match = path_and_query.match(regexp)
|
35
|
+
return nil unless match
|
36
|
+
|
37
|
+
[match.to_s.bytesize, self]
|
38
|
+
end
|
39
|
+
|
40
|
+
# @!group Formatting Methods
|
41
|
+
|
42
|
+
# A human readable representation of the rule.
|
43
|
+
#
|
44
|
+
# @return [String]
|
45
|
+
# @tool
|
46
|
+
# :nocov:
|
47
|
+
def inspect
|
48
|
+
%(#<#{self.class.name}:#{object_id} "#{value}">)
|
49
|
+
end
|
50
|
+
# :nocov:
|
51
|
+
|
52
|
+
# Produces a pretty human readable representation of the rule.
|
53
|
+
#
|
54
|
+
# @param pp [PrettyPrint] pretty printer
|
55
|
+
# @return [void]
|
56
|
+
# @tool
|
57
|
+
# :nocov:
|
58
|
+
def pretty_print(pp)
|
59
|
+
pp.text("#{self.class.name}/#{object_id}< #{value} >")
|
60
|
+
end
|
61
|
+
# :nocov:
|
62
|
+
|
63
|
+
# @!endgroup Formatting Methods
|
64
|
+
|
65
|
+
private
|
66
|
+
|
67
|
+
# @param path_and_query [String]
|
68
|
+
# @return [String]
|
69
|
+
def normalize_path_and_query(path_and_query)
|
70
|
+
pq = Addressable::URI.parse(path_and_query).normalize
|
71
|
+
pq.scheme = nil
|
72
|
+
pq.authority = nil
|
73
|
+
pq.fragment = nil
|
74
|
+
pq.to_s
|
75
|
+
end
|
76
|
+
|
77
|
+
# @return [Regexp]
|
78
|
+
def regexp
|
79
|
+
@regexp ||=
|
80
|
+
begin
|
81
|
+
parts = value.scan(/[^*$]+|[*$]/)
|
82
|
+
regexp_parts =
|
83
|
+
parts.map { |part|
|
84
|
+
case part
|
85
|
+
when "*"
|
86
|
+
".*"
|
87
|
+
when "$"
|
88
|
+
"\\z"
|
89
|
+
else
|
90
|
+
Regexp.escape(Addressable::URI.normalized_encode(part))
|
91
|
+
end
|
92
|
+
}
|
93
|
+
|
94
|
+
Regexp.new("\\A#{regexp_parts.join}")
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Gort
|
4
|
+
# Represents a robots.txt file.
|
5
|
+
class RobotsTxt
|
6
|
+
ROBOTS_TXT_PATH = "/robots.txt"
|
7
|
+
private_constant :ROBOTS_TXT_PATH
|
8
|
+
|
9
|
+
def initialize(rules)
|
10
|
+
@rules = rules
|
11
|
+
end
|
12
|
+
|
13
|
+
# @return [Array<Rule, Group, InvalidLine>]
|
14
|
+
attr_reader :rules
|
15
|
+
|
16
|
+
# Is this path allowed for the given user agent?
|
17
|
+
#
|
18
|
+
# @param user_agent [String]
|
19
|
+
# @param path_and_query [String]
|
20
|
+
# @return [Boolean]
|
21
|
+
# @see PathRule#match
|
22
|
+
# @see #disallow?
|
23
|
+
def allow?(user_agent, path_and_query)
|
24
|
+
return true if path_and_query == ROBOTS_TXT_PATH
|
25
|
+
|
26
|
+
top_match =
|
27
|
+
matches(user_agent, path_and_query)
|
28
|
+
.compact
|
29
|
+
# This is an arcane bit.
|
30
|
+
# The rules are reverse sorted by match length (i.e. longest first),
|
31
|
+
# and then by class name using the fact that allow goes before disallow.
|
32
|
+
# This is the rule precedence order defined in the RFC.
|
33
|
+
.min_by { |(match_length, rule)| [-match_length, rule.class.name] }
|
34
|
+
|
35
|
+
# Allow if there is no match or the top match is an allow rule.
|
36
|
+
top_match.nil? || top_match.last.is_a?(AllowRule)
|
37
|
+
end
|
38
|
+
|
39
|
+
# Is this path disallowed for the given user agent?
|
40
|
+
#
|
41
|
+
# @param user_agent [String]
|
42
|
+
# @param path_and_query [String]
|
43
|
+
# @return [Boolean]
|
44
|
+
# @see PathRule#match
|
45
|
+
# @see #allow?
|
46
|
+
def disallow?(user_agent, path_and_query)
|
47
|
+
!allow?(user_agent, path_and_query)
|
48
|
+
end
|
49
|
+
|
50
|
+
# @!group Formatting Methods
|
51
|
+
|
52
|
+
# A human readable representation of the robots.txt.
|
53
|
+
#
|
54
|
+
# @return [String]
|
55
|
+
# @tool
|
56
|
+
# :nocov:
|
57
|
+
def inspect
|
58
|
+
"#<#{self.class.name}:#{object_id} #{rules.inspect}>"
|
59
|
+
end
|
60
|
+
# :nocov:
|
61
|
+
|
62
|
+
# Produces a pretty human readable representation of the robots.txt.
|
63
|
+
#
|
64
|
+
# @param pp [PrettyPrint] pretty printer
|
65
|
+
# @return [void]
|
66
|
+
# @tool
|
67
|
+
# :nocov:
|
68
|
+
def pretty_print(pp)
|
69
|
+
pp.text("#{self.class.name}/#{object_id}")
|
70
|
+
pp.group(1, "[", "]") do
|
71
|
+
pp.breakable("")
|
72
|
+
pp.seplist(rules) do |rule|
|
73
|
+
pp.pp(rule)
|
74
|
+
end
|
75
|
+
pp.breakable("")
|
76
|
+
end
|
77
|
+
end
|
78
|
+
# :nocov:
|
79
|
+
|
80
|
+
# @!endgroup Formatting Methods
|
81
|
+
|
82
|
+
private
|
83
|
+
|
84
|
+
def matches(user_agent, path)
|
85
|
+
# @type var groups: Array<Group>
|
86
|
+
groups = rules.select { |rule| rule.is_a?(Group) && rule.valid? && rule.apply?(user_agent) }
|
87
|
+
groups.flat_map do |group|
|
88
|
+
group.rules.filter_map do |rule|
|
89
|
+
next unless rule.is_a?(PathRule)
|
90
|
+
|
91
|
+
rule.match(path)
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
data/lib/gort/rule.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Gort
|
4
|
+
# Generic rule.
|
5
|
+
# This represents an entry that looks like a valid rule but otherwise doesn't
|
6
|
+
# have a more specialized implementation.
|
7
|
+
class Rule
|
8
|
+
# @param name [String, Symbol] The name of the rule. It is downcased and converted to a Symbol.
|
9
|
+
# @param value [String] The value of the rule.
|
10
|
+
def initialize(name, value)
|
11
|
+
@name = name.downcase.to_sym
|
12
|
+
@value = value
|
13
|
+
end
|
14
|
+
|
15
|
+
# The name of the rule.
|
16
|
+
# @return [Symbol]
|
17
|
+
attr_reader :name
|
18
|
+
|
19
|
+
# The value of the rule.
|
20
|
+
# @return [String]
|
21
|
+
attr_reader :value
|
22
|
+
|
23
|
+
# @!group Formatting Methods
|
24
|
+
|
25
|
+
# A human readable representation of the rule.
|
26
|
+
#
|
27
|
+
# @return [String]
|
28
|
+
# @tool
|
29
|
+
# :nocov:
|
30
|
+
def inspect
|
31
|
+
%(#<#{self.class.name}:#{object_id} "#{name}", "#{value}">)
|
32
|
+
end
|
33
|
+
# :nocov:
|
34
|
+
|
35
|
+
# Produces a pretty human readable representation of the rule.
|
36
|
+
#
|
37
|
+
# @param pp [PrettyPrint] pretty printer
|
38
|
+
# @return [void]
|
39
|
+
# @tool
|
40
|
+
# :nocov:
|
41
|
+
def pretty_print(pp)
|
42
|
+
pp.text("#{self.class.name}/#{object_id}< #{name.inspect}, #{value} >")
|
43
|
+
end
|
44
|
+
# :nocov:
|
45
|
+
|
46
|
+
# @!endgroup Formatting Methods
|
47
|
+
end
|
48
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Gort
|
4
|
+
# Abstract rule set.
|
5
|
+
#
|
6
|
+
# @abstract
|
7
|
+
class RuleSet
|
8
|
+
# @param rules [Array<Group, UserAgentRule, AllowRule, DisallowRule, Rule, InvalidLine>]
|
9
|
+
# The rules. Or invalid lines.
|
10
|
+
def initialize(*rules)
|
11
|
+
@rules = rules.flatten.freeze
|
12
|
+
end
|
13
|
+
|
14
|
+
# Rules in this set
|
15
|
+
# @return [Array<Group, UserAgentRule, AllowRule, DisallowRule, Rule, InvalidLine>]
|
16
|
+
attr_reader :rules
|
17
|
+
|
18
|
+
# Make a new set by merging this one with another.
|
19
|
+
# @param other [RuleSet]
|
20
|
+
# @return [RuleSet]
|
21
|
+
def merge(other)
|
22
|
+
self.class.new(rules + other.rules)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "rule"
|
4
|
+
|
5
|
+
module Gort
|
6
|
+
# User-agent rule.
|
7
|
+
class UserAgentRule < Rule
|
8
|
+
def initialize(value)
|
9
|
+
super(:"user-agent", value)
|
10
|
+
end
|
11
|
+
|
12
|
+
PRODUCT_TOKEN_RE = /\A([a-z_-]+|\*)\z/i
|
13
|
+
private_constant :PRODUCT_TOKEN_RE
|
14
|
+
|
15
|
+
# Returns +true+ if the value is a valid user agent.
|
16
|
+
#
|
17
|
+
# A user agent token is a sequence of letters (a—z, A—Z),
|
18
|
+
# underscores (_), or hyphens (-). Alternatively, a single asterisk (*) is also allowed.
|
19
|
+
#
|
20
|
+
# @return [Boolean]
|
21
|
+
# - +true+ if the value is a valid product token
|
22
|
+
# - +false+ otherwise
|
23
|
+
def valid?
|
24
|
+
value.match?(PRODUCT_TOKEN_RE)
|
25
|
+
end
|
26
|
+
|
27
|
+
# @!group Formatting Methods
|
28
|
+
|
29
|
+
# A human readable representation of the rule.
|
30
|
+
#
|
31
|
+
# @return [String]
|
32
|
+
# @tool
|
33
|
+
# :nocov:
|
34
|
+
def inspect
|
35
|
+
%(#<#{self.class.name}:#{object_id} "#{value}">)
|
36
|
+
end
|
37
|
+
# :nocov:
|
38
|
+
|
39
|
+
# Produces a pretty human readable representation of the rule.
|
40
|
+
#
|
41
|
+
# @param pp [PrettyPrint] pretty printer
|
42
|
+
# @return [void]
|
43
|
+
# @tool
|
44
|
+
# :nocov:
|
45
|
+
def pretty_print(pp)
|
46
|
+
pp.text("#{self.class.name}/#{object_id}< #{value} >")
|
47
|
+
end
|
48
|
+
# :nocov:
|
49
|
+
|
50
|
+
# @!endgroup Formatting Methods
|
51
|
+
end
|
52
|
+
end
|
data/lib/gort/version.rb
ADDED
data/lib/gort.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "gort/version"
|
4
|
+
|
5
|
+
# Gort is a robots.txt parser and evaluator.
|
6
|
+
module Gort
|
7
|
+
# Gort's top error class. All other errors inherit from this.
|
8
|
+
class Error < StandardError; end
|
9
|
+
|
10
|
+
# Parse the given robots.txt input and return a RobotsTxt instance.
|
11
|
+
#
|
12
|
+
# @param input [String] the robots.txt input to parse
|
13
|
+
# @return [RobotsTxt] the parsed robots.txt
|
14
|
+
def self.parse(input)
|
15
|
+
Parser.new(input).parse
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
require_relative "gort/parser"
|
data.tar.gz.sig
ADDED
Binary file
|
metadata
ADDED
@@ -0,0 +1,105 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: gort
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Alexander Mankuta
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain:
|
11
|
+
- |
|
12
|
+
-----BEGIN CERTIFICATE-----
|
13
|
+
MIIC+jCCAeKgAwIBAgIBAzANBgkqhkiG9w0BAQsFADAjMSEwHwYDVQQDDBhhbGV4
|
14
|
+
L0RDPXBvaW50bGVzcy9EQz1vbmUwHhcNMjMxMTA5MTA1MzIxWhcNMjQxMTA4MTA1
|
15
|
+
MzIxWjAjMSEwHwYDVQQDDBhhbGV4L0RDPXBvaW50bGVzcy9EQz1vbmUwggEiMA0G
|
16
|
+
CSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDPOVLPGEK+eaP6zJfifrpWvPTg4qo3
|
17
|
+
XNJJPom80SwqX2hVCVsRDK4RYgKUQqKRQzHhlx14wZHwWLETBVbNDGX3uqyCnTWU
|
18
|
+
JUKh3ydiZShXpNHoV/NW7hhEYvNsDcBAjYTmbvXOhuYCo0Tz/0N2Oiun/0wIICtP
|
19
|
+
vytY9TY0/lklWjAbsqJjNOu3o8IYkJBAN/rU96E/6WhFwjnxLcTnV9RfFRXdjG5j
|
20
|
+
CughoB2xSwKX8gwbQ8fsnaZRmdyDGYNpz6sGF0zycfiLkTttbLA2nYATCALy98CH
|
21
|
+
nsyZNsTjb4WINCuY2yEDjwesw9f/ROkNC68EgQ5M+aMjp+D0WcYGfzojAgMBAAGj
|
22
|
+
OTA3MAkGA1UdEwQCMAAwCwYDVR0PBAQDAgSwMB0GA1UdDgQWBBRPgIwSVbeonua/
|
23
|
+
Ny/8576oxdUbrjANBgkqhkiG9w0BAQsFAAOCAQEAX28QLxNNz5EgaZZuQQUkbOXB
|
24
|
+
4b5luBO22535+Vgj2jw7yjV8KKoGMWKrnB00ijgntqPEPXCzaPNibOcPZV5WfWVS
|
25
|
+
t0Ls8lWE/8kezPwV4SbRe4Y7C+D4J+oirs0L5PtpREV9CJ7kfdW/AN9MtvjjBFlb
|
26
|
+
jHquD/MiOOMyHtuO0FiTL265m10thcAUsbyi0MehKgGbtJ5fGceHvZDqDouvbMjT
|
27
|
+
hoijFk1oTY939JhjdcHuJzMiS2TrqIw8Dr5DkQu2vAjHpw0aOOWhlRjNJ7RHYJNm
|
28
|
+
QugXmCnHQxSKTmc7imKuotyMdRRKFh8UEFCLRsFtBbNxkXyNuB4xBMuUYodhEw==
|
29
|
+
-----END CERTIFICATE-----
|
30
|
+
date: 2024-06-22 00:00:00.000000000 Z
|
31
|
+
dependencies:
|
32
|
+
- !ruby/object:Gem::Dependency
|
33
|
+
name: addressable
|
34
|
+
requirement: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - "~>"
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '2.8'
|
39
|
+
type: :runtime
|
40
|
+
prerelease: false
|
41
|
+
version_requirements: !ruby/object:Gem::Requirement
|
42
|
+
requirements:
|
43
|
+
- - "~>"
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '2.8'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: rchardet
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - "~>"
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '1.8'
|
53
|
+
type: :runtime
|
54
|
+
prerelease: false
|
55
|
+
version_requirements: !ruby/object:Gem::Requirement
|
56
|
+
requirements:
|
57
|
+
- - "~>"
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: '1.8'
|
60
|
+
description: robots.txt parser and evaluator according to RFC 9309.
|
61
|
+
email:
|
62
|
+
- alex@pointless.one
|
63
|
+
executables: []
|
64
|
+
extensions: []
|
65
|
+
extra_rdoc_files: []
|
66
|
+
files:
|
67
|
+
- LICENSE.txt
|
68
|
+
- lib/gort.rb
|
69
|
+
- lib/gort/allow_rule.rb
|
70
|
+
- lib/gort/disallow_rule.rb
|
71
|
+
- lib/gort/group.rb
|
72
|
+
- lib/gort/invalid_line.rb
|
73
|
+
- lib/gort/parser.rb
|
74
|
+
- lib/gort/path_rule.rb
|
75
|
+
- lib/gort/robots_txt.rb
|
76
|
+
- lib/gort/rule.rb
|
77
|
+
- lib/gort/rule_set.rb
|
78
|
+
- lib/gort/user_agent_rule.rb
|
79
|
+
- lib/gort/version.rb
|
80
|
+
homepage:
|
81
|
+
licenses:
|
82
|
+
- MIT
|
83
|
+
metadata:
|
84
|
+
allowed_push_host: https://rubygems.org
|
85
|
+
rubygems_mfa_required: 'true'
|
86
|
+
post_install_message:
|
87
|
+
rdoc_options: []
|
88
|
+
require_paths:
|
89
|
+
- lib
|
90
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
91
|
+
requirements:
|
92
|
+
- - ">="
|
93
|
+
- !ruby/object:Gem::Version
|
94
|
+
version: '3.1'
|
95
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
96
|
+
requirements:
|
97
|
+
- - ">="
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: '0'
|
100
|
+
requirements: []
|
101
|
+
rubygems_version: 3.5.9
|
102
|
+
signing_key:
|
103
|
+
specification_version: 4
|
104
|
+
summary: robots.txt parser and evaluator.
|
105
|
+
test_files: []
|
metadata.gz.sig
ADDED