public_suffix 2.0.5 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,11 +4,11 @@
4
4
  #
5
5
  # Copyright (c) 2009-2017 Simone Carletti <weppos@weppos.net>
6
6
 
7
- require "public_suffix/domain"
8
- require "public_suffix/version"
9
- require "public_suffix/errors"
10
- require "public_suffix/rule"
11
- require "public_suffix/list"
7
+ require_relative "public_suffix/domain"
8
+ require_relative "public_suffix/version"
9
+ require_relative "public_suffix/errors"
10
+ require_relative "public_suffix/rule"
11
+ require_relative "public_suffix/list"
12
12
 
13
13
  # PublicSuffix is a Ruby domain name parser based on the Public Suffix List.
14
14
  #
@@ -28,27 +28,31 @@ module PublicSuffix
28
28
  #
29
29
  # @example Parse a valid domain
30
30
  # PublicSuffix.parse("google.com")
31
- # # => #<PublicSuffix::Domain ...>
31
+ # # => #<PublicSuffix::Domain:0x007fec2e51e588 @sld="google", @tld="com", @trd=nil>
32
32
  #
33
33
  # @example Parse a valid subdomain
34
34
  # PublicSuffix.parse("www.google.com")
35
- # # => #<PublicSuffix::Domain ...>
35
+ # # => #<PublicSuffix::Domain:0x007fec276d4cf8 @sld="google", @tld="com", @trd="www">
36
36
  #
37
37
  # @example Parse a fully qualified domain
38
38
  # PublicSuffix.parse("google.com.")
39
- # # => #<PublicSuffix::Domain ...>
39
+ # # => #<PublicSuffix::Domain:0x007fec257caf38 @sld="google", @tld="com", @trd=nil>
40
40
  #
41
41
  # @example Parse a fully qualified domain (subdomain)
42
42
  # PublicSuffix.parse("www.google.com.")
43
- # # => #<PublicSuffix::Domain ...>
43
+ # # => #<PublicSuffix::Domain:0x007fec27b6bca8 @sld="google", @tld="com", @trd="www">
44
44
  #
45
- # @example Parse an invalid domain
45
+ # @example Parse an invalid (unlisted) domain
46
46
  # PublicSuffix.parse("x.yz")
47
- # # => PublicSuffix::DomainInvalid
47
+ # # => #<PublicSuffix::Domain:0x007fec2f49bec0 @sld="x", @tld="yz", @trd=nil>
48
+ #
49
+ # @example Parse an invalid (unlisted) domain with strict checking (without applying the default * rule)
50
+ # PublicSuffix.parse("x.yz", default_rule: nil)
51
+ # # => PublicSuffix::DomainInvalid: `x.yz` is not a valid domain
48
52
  #
49
53
  # @example Parse a URL (not supported, only domains)
50
54
  # PublicSuffix.parse("http://www.google.com")
51
- # # => PublicSuffix::DomainInvalid
55
+ # # => PublicSuffix::DomainInvalid: http://www.google.com is not expected to contain a scheme
52
56
  #
53
57
  #
54
58
  # @param [String, #to_s] name The domain name or fully qualified domain name to parse.
@@ -95,11 +99,11 @@ module PublicSuffix
95
99
  # PublicSuffix.valid?("example.tldnotlisted")
96
100
  # # => true
97
101
  #
98
- # @example Validate a not-allowed domain
99
- # PublicSuffix.valid?("example.do")
100
- # # => false
101
- # PublicSuffix.valid?("www.example.do")
102
+ # @example Validate a not-listed domain with strict checking (without applying the default * rule)
103
+ # PublicSuffix.valid?("example.tldnotlisted")
102
104
  # # => true
105
+ # PublicSuffix.valid?("example.tldnotlisted", default_rule: nil)
106
+ # # => false
103
107
  #
104
108
  # @example Validate a fully qualified domain
105
109
  # PublicSuffix.valid?("google.com.")
@@ -35,12 +35,9 @@ module PublicSuffix
35
35
  # The {PublicSuffix::List.default} rule list is used
36
36
  # to tokenize and validate a domain.
37
37
  #
38
- # {PublicSuffix::List} implements +Enumerable+ module.
39
- #
40
38
  class List
41
- include Enumerable
42
39
 
43
- DEFAULT_LIST_PATH = File.join(File.dirname(__FILE__), "..", "..", "data", "list.txt")
40
+ DEFAULT_LIST_PATH = File.expand_path("../../data/list.txt", __dir__)
44
41
 
45
42
  # Gets the default rule list.
46
43
  #
@@ -62,22 +59,12 @@ module PublicSuffix
62
59
  @default = value
63
60
  end
64
61
 
65
- # Sets the default rule list to +nil+.
66
- #
67
- # @return [self]
68
- def self.clear
69
- self.default = nil
70
- self
71
- end
72
-
73
- # rubocop:disable Metrics/MethodLength
74
-
75
62
  # Parse given +input+ treating the content as Public Suffix List.
76
63
  #
77
64
  # See http://publicsuffix.org/format/ for more details about input format.
78
65
  #
79
66
  # @param input [#each_line] The list to parse.
80
- # @param private_domain [Boolean] whether to ignore the private domains section.
67
+ # @param private_domains [Boolean] whether to include the private domains section (pass false to skip it).
81
68
  # @return [Array<PublicSuffix::Rule::*>]
82
69
  def self.parse(input, private_domains: true)
83
70
  comment_token = "//".freeze
@@ -103,53 +90,21 @@ module PublicSuffix
103
90
  next
104
91
 
105
92
  else
106
- list.add(Rule.factory(line, private: section == 2), reindex: false)
93
+ list.add(Rule.factory(line, private: section == 2))
107
94
 
108
95
  end
109
96
  end
110
97
  end
111
98
  end
112
- # rubocop:enable Metrics/MethodLength
113
-
114
-
115
- # Gets the array of rules.
116
- #
117
- # @return [Array<PublicSuffix::Rule::*>]
118
- attr_reader :rules
119
99
 
120
100
 
121
101
  # Initializes an empty {PublicSuffix::List}.
122
102
  #
123
103
  # @yield [self] Yields on self.
124
104
  # @yieldparam [PublicSuffix::List] self The newly created instance.
125
- #
126
105
  def initialize
127
- @rules = []
106
+ @rules = {}
128
107
  yield(self) if block_given?
129
- reindex!
130
- end
131
-
132
-
133
- # Creates a naive index for +@rules+. Just a hash that will tell
134
- # us where the elements of +@rules+ are relative to its first
135
- # {PublicSuffix::Rule::Base#labels} element.
136
- #
137
- # For instance if @rules[5] and @rules[4] are the only elements of the list
138
- # where Rule#labels.first is 'us' @indexes['us'] #=> [5,4], that way in
139
- # select we can avoid mapping every single rule against the candidate domain.
140
- def reindex!
141
- @indexes = {}
142
- @rules.each_with_index do |rule, index|
143
- tld = Domain.name_to_labels(rule.value).last
144
- @indexes[tld] ||= []
145
- @indexes[tld] << index
146
- end
147
- end
148
-
149
- # Gets the naive index, a hash that with the keys being the first label of
150
- # every rule pointing to an array of integers (indexes of the rules in @rules).
151
- def indexes
152
- @indexes.dup
153
108
  end
154
109
 
155
110
 
@@ -159,42 +114,35 @@ module PublicSuffix
159
114
  # {PublicSuffix::List} and each +PublicSuffix::Rule::*+
160
115
  # in list <tt>one</tt> is available in list <tt>two</tt> (rule order is not significant).
161
116
  #
162
- # @param [PublicSuffix::List] other
163
- # The List to compare.
164
- #
117
+ # @param other [PublicSuffix::List] the List to compare
165
118
  # @return [Boolean]
166
119
  def ==(other)
167
120
  return false unless other.is_a?(List)
168
- equal?(other) || rules == other.rules
121
+ equal?(other) || @rules == other.rules
169
122
  end
170
123
  alias eql? ==
171
124
 
172
125
  # Iterates each rule in the list.
173
- def each(*args, &block)
174
- @rules.each(*args, &block)
126
+ def each(&block)
127
+ Enumerator.new do |y|
128
+ @rules.each do |key, node|
129
+ y << entry_to_rule(node, key)
130
+ end
131
+ end.each(&block)
175
132
  end
176
133
 
177
134
 
178
135
  # Adds the given rule to the list, replacing any existing rule with the same value.
179
136
  #
180
- # @param [PublicSuffix::Rule::*] rule
181
- # The rule to add to the list.
182
- # @param [Boolean] reindex
183
- # Set to true to recreate the rule index
184
- # after the rule has been added to the list.
185
- #
137
+ # @param rule [PublicSuffix::Rule::*] the rule to add to the list
186
138
  # @return [self]
187
- #
188
- # @see #reindex!
189
- #
190
- def add(rule, reindex: true)
191
- @rules << rule
192
- reindex! if reindex
139
+ def add(rule)
140
+ @rules[rule.value] = rule_to_entry(rule)
193
141
  self
194
142
  end
195
143
  alias << add
196
144
 
197
- # Gets the number of elements in the list.
145
+ # Gets the number of rules in the list.
198
146
  #
199
147
  # @return [Integer]
200
148
  def size
@@ -208,37 +156,18 @@ module PublicSuffix
208
156
  @rules.empty?
209
157
  end
210
158
 
211
- # Removes all elements.
159
+ # Removes all rules.
212
160
  #
213
161
  # @return [self]
214
162
  def clear
215
163
  @rules.clear
216
- reindex!
217
164
  self
218
165
  end
219
166
 
220
- # Finds and returns the most appropriate rule for the domain name.
221
- #
222
- # From the Public Suffix List documentation:
223
- #
224
- # - If a hostname matches more than one rule in the file,
225
- # the longest matching rule (the one with the most levels) will be used.
226
- # - An exclamation mark (!) at the start of a rule marks an exception to a previous wildcard rule.
227
- # An exception rule takes priority over any other matching rule.
228
- #
229
- # ## Algorithm description
230
- #
231
- # 1. Match domain against all rules and take note of the matching ones.
232
- # 2. If no rules match, the prevailing rule is "*".
233
- # 3. If more than one rule matches, the prevailing rule is the one which is an exception rule.
234
- # 4. If there is no matching exception rule, the prevailing rule is the one with the most labels.
235
- # 5. If the prevailing rule is a exception rule, modify it by removing the leftmost label.
236
- # 6. The public suffix is the set of labels from the domain
237
- # which directly match the labels of the prevailing rule (joined by dots).
238
- # 7. The registered domain is the public suffix plus one additional label.
167
+ # Finds and returns the rule corresponding to the longest public suffix for the hostname.
239
168
  #
240
- # @param name [String, #to_s] The domain name.
241
- # @param [PublicSuffix::Rule::*] default The default rule to return in case no rule matches.
169
+ # @param name [#to_s] the hostname
170
+ # @param default [PublicSuffix::Rule::*] the default rule to return in case no rule matches
242
171
  # @return [PublicSuffix::Rule::*]
243
172
  def find(name, default: default_rule, **options)
244
173
  rule = select(name, **options).inject do |l, r|
@@ -248,30 +177,44 @@ module PublicSuffix
248
177
  rule || default
249
178
  end
250
179
 
251
- # Selects all the rules matching given domain.
180
+ # Selects all the rules matching the given hostname.
252
181
  #
253
- # Internally, the lookup heavily rely on the `@indexes`. The input is split into labels,
254
- # and we retriever from the index only the rules that end with the input label. After that,
255
- # a sequential scan is performed. In most cases, where the number of rules for the same label
256
- # is limited, this algorithm is efficient enough.
257
- #
258
- # If `ignore_private` is set to true, the algorithm will skip the rules that are flagged as private domain.
259
- # Note that the rules will still be part of the loop. If you frequently need to access lists
260
- # ignoring the private domains, you should create a list that doesn't include these domains setting the
182
+ # If `ignore_private` is set to true, the algorithm will skip the rules that are flagged as
183
+ # private domain. Note that the rules will still be part of the loop.
184
+ # If you frequently need to access lists ignoring the private domains,
185
+ # you should create a list that doesn't include these domains setting the
261
186
  # `private_domains: false` option when calling {.parse}.
262
187
  #
263
- # @param [String, #to_s] name The domain name.
264
- # @param [Boolean] ignore_private
188
+ # Note that this method is currently private, as you should not rely on it. Instead,
189
+ # the public interface is {#find}. The current internal algorithm is able to return all
190
+ # matching rules, but different data structures may not be able to do it, and instead would
191
+ # return only the best match. For this reason, you should rely on {#find}.
192
+ #
193
+ # @param name [#to_s] the hostname
194
+ # @param ignore_private [Boolean]
265
195
  # @return [Array<PublicSuffix::Rule::*>]
266
196
  def select(name, ignore_private: false)
267
197
  name = name.to_s
268
- indices = (@indexes[Domain.name_to_labels(name).last] || [])
269
198
 
270
- finder = @rules.values_at(*indices).lazy
271
- finder = finder.select { |rule| rule.match?(name) }
272
- finder = finder.select { |rule| !rule.private } if ignore_private
273
- finder.to_a
199
+ parts = name.split(DOT).reverse!
200
+ index = 0
201
+ query = parts[index]
202
+ rules = []
203
+
204
+ loop do
205
+ match = @rules[query]
206
+ if !match.nil? && (ignore_private == false || match.private == false)
207
+ rules << entry_to_rule(match, query)
208
+ end
209
+
210
+ index += 1
211
+ break if index >= parts.size
212
+ query = parts[index] + DOT + query
213
+ end
214
+
215
+ rules
274
216
  end
217
+ private :select
275
218
 
276
219
  # Gets the default rule.
277
220
  #
@@ -282,5 +225,21 @@ module PublicSuffix
282
225
  PublicSuffix::Rule.default
283
226
  end
284
227
 
228
+
229
+ protected
230
+
231
+ attr_reader :rules
232
+
233
+
234
+ private
235
+
236
+ def entry_to_rule(entry, value)
237
+ entry.type.new(value: value, length: entry.length, private: entry.private)
238
+ end
239
+
240
+ def rule_to_entry(rule)
241
+ Rule::Entry.new(rule.class, rule.length, rule.private)
242
+ end
243
+
285
244
  end
286
245
  end
@@ -19,6 +19,9 @@ module PublicSuffix
19
19
  #
20
20
  module Rule
21
21
 
22
+ # @api internal
23
+ Entry = Struct.new(:type, :length, :private)
24
+
22
25
  # = Abstract rule class
23
26
  #
24
27
  # This represent the base class for a Rule definition
@@ -99,16 +102,28 @@ module PublicSuffix
99
102
  # @return [String] the rule definition
100
103
  attr_reader :value
101
104
 
105
+ # @return [Integer] the length of the rule, as a number of labels
106
+ attr_reader :length
107
+
102
108
  # @return [Boolean] true if the rule is a private domain
103
109
  attr_reader :private
104
110
 
105
111
 
106
- # Initializes a new rule with name and value.
107
- # If value is +nil+, name also becomes the value for this rule.
112
+ # Initializes a new rule from the content.
113
+ #
114
+ # @param content [String] the content of the rule
115
+ # @param private [Boolean]
116
+ def self.build(content, private: false)
117
+ new(value: content, private: private)
118
+ end
119
+
120
+ # Initializes a new rule.
108
121
  #
109
- # @param value [String] the value of the rule
110
- def initialize(value, private: false)
122
+ # @param value [String]
123
+ # @param private [Boolean]
124
+ def initialize(value:, length: nil, private: false)
111
125
  @value = value.to_s
126
+ @length = length || @value.count(DOT) + 1
112
127
  @private = private
113
128
  end
114
129
 
@@ -137,12 +152,12 @@ module PublicSuffix
137
152
  # @see https://publicsuffix.org/list/
138
153
  #
139
154
  # @example
140
- # Rule.factory("com").match?("example.com")
155
+ # PublicSuffix::Rule.factory("com").match?("example.com")
141
156
  # # => true
142
- # Rule.factory("com").match?("example.net")
157
+ # PublicSuffix::Rule.factory("com").match?("example.net")
143
158
  # # => false
144
159
  #
145
- # @param name [String, #to_s] The domain name to check.
160
+ # @param name [String] the domain name to check
146
161
  # @return [Boolean]
147
162
  def match?(name)
148
163
  # Note: it works because of the assumption there are no
@@ -150,7 +165,7 @@ module PublicSuffix
150
165
  # we need to properly walk the input and skip parts according
151
166
  # to wildcard component.
152
167
  diff = name.chomp(value)
153
- diff.empty? || diff[-1] == "."
168
+ diff.empty? || diff[-1] == DOT
154
169
  end
155
170
 
156
171
  # @abstract
@@ -158,11 +173,6 @@ module PublicSuffix
158
173
  raise NotImplementedError
159
174
  end
160
175
 
161
- # @abstract
162
- def length
163
- raise NotImplementedError
164
- end
165
-
166
176
  # @abstract
167
177
  # @param [String, #to_s] name The domain name to decompose
168
178
  # @return [Array<String, nil>]
@@ -200,27 +210,26 @@ module PublicSuffix
200
210
  @value.split(DOT)
201
211
  end
202
212
 
203
- # Gets the length of this rule for comparison,
204
- # represented by the number of dot-separated parts in the rule.
205
- #
206
- # @return [Integer] The length of the rule.
207
- def length
208
- @length ||= parts.length
209
- end
210
-
211
213
  end
212
214
 
213
215
  # Wildcard represents a wildcard rule (e.g. *.co.uk).
214
216
  class Wildcard < Base
215
217
 
216
- # Initializes a new rule from +definition+.
218
+ # Initializes a new rule from the content.
217
219
  #
218
- # The wildcard "*" is removed from the value, as it's common
219
- # for each wildcard rule.
220
+ # @param content [String] the content of the rule
221
+ # @param private [Boolean]
222
+ def self.build(content, private: false)
223
+ new(value: content.to_s[2..-1], private: private)
224
+ end
225
+
226
+ # Initializes a new rule.
220
227
  #
221
- # @param definition [String] the rule as defined in the PSL
222
- def initialize(definition, private: false)
223
- super(definition.to_s[2..-1], private: private)
228
+ # @param value [String]
229
+ # @param private [Boolean]
230
+ def initialize(value:, length: nil, private: false)
231
+ super(value: value, length: length, private: private)
232
+ length or @length += 1 # * counts as 1
224
233
  end
225
234
 
226
235
  # Gets the original rule definition.
@@ -248,28 +257,17 @@ module PublicSuffix
248
257
  @value.split(DOT)
249
258
  end
250
259
 
251
- # Gets the length of this rule for comparison,
252
- # represented by the number of dot-separated parts in the rule
253
- # plus 1 for the *.
254
- #
255
- # @return [Integer] The length of the rule.
256
- def length
257
- @length ||= parts.length + 1 # * counts as 1
258
- end
259
-
260
260
  end
261
261
 
262
262
  # Exception represents an exception rule (e.g. !parliament.uk).
263
263
  class Exception < Base
264
264
 
265
- # Initializes a new rule from +definition+.
266
- #
267
- # The bang ! is removed from the value, as it's common
268
- # for each wildcard rule.
265
+ # Initializes a new rule from the content.
269
266
  #
270
- # @param definition [String] the rule as defined in the PSL
271
- def initialize(definition, private: false)
272
- super(definition.to_s[1..-1], private: private)
267
+ # @param content [String] the content of the rule
268
+ # @param private [Boolean]
269
+ def self.build(content, private: false)
270
+ new(value: content.to_s[1..-1], private: private)
273
271
  end
274
272
 
275
273
  # Gets the original rule definition.
@@ -302,14 +300,6 @@ module PublicSuffix
302
300
  @value.split(DOT)[1..-1]
303
301
  end
304
302
 
305
- # Gets the length of this rule for comparison,
306
- # represented by the number of dot-separated parts in the rule.
307
- #
308
- # @return [Integer] The length of the rule.
309
- def length
310
- @length ||= parts.length
311
- end
312
-
313
303
  end
314
304
 
315
305
 
@@ -339,7 +329,7 @@ module PublicSuffix
339
329
  Exception
340
330
  else
341
331
  Normal
342
- end.new(content, private: private)
332
+ end.build(content, private: private)
343
333
  end
344
334
 
345
335
  # The default rule to use if no rule match.