public_suffix 2.0.5 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -4,11 +4,11 @@
4
4
  #
5
5
  # Copyright (c) 2009-2017 Simone Carletti <weppos@weppos.net>
6
6
 
7
- require "public_suffix/domain"
8
- require "public_suffix/version"
9
- require "public_suffix/errors"
10
- require "public_suffix/rule"
11
- require "public_suffix/list"
7
+ require_relative "public_suffix/domain"
8
+ require_relative "public_suffix/version"
9
+ require_relative "public_suffix/errors"
10
+ require_relative "public_suffix/rule"
11
+ require_relative "public_suffix/list"
12
12
 
13
13
  # PublicSuffix is a Ruby domain name parser based on the Public Suffix List.
14
14
  #
@@ -28,27 +28,31 @@ module PublicSuffix
28
28
  #
29
29
  # @example Parse a valid domain
30
30
  # PublicSuffix.parse("google.com")
31
- # # => #<PublicSuffix::Domain ...>
31
+ # # => #<PublicSuffix::Domain:0x007fec2e51e588 @sld="google", @tld="com", @trd=nil>
32
32
  #
33
33
  # @example Parse a valid subdomain
34
34
  # PublicSuffix.parse("www.google.com")
35
- # # => #<PublicSuffix::Domain ...>
35
+ # # => #<PublicSuffix::Domain:0x007fec276d4cf8 @sld="google", @tld="com", @trd="www">
36
36
  #
37
37
  # @example Parse a fully qualified domain
38
38
  # PublicSuffix.parse("google.com.")
39
- # # => #<PublicSuffix::Domain ...>
39
+ # # => #<PublicSuffix::Domain:0x007fec257caf38 @sld="google", @tld="com", @trd=nil>
40
40
  #
41
41
  # @example Parse a fully qualified domain (subdomain)
42
42
  # PublicSuffix.parse("www.google.com.")
43
- # # => #<PublicSuffix::Domain ...>
43
+ # # => #<PublicSuffix::Domain:0x007fec27b6bca8 @sld="google", @tld="com", @trd="www">
44
44
  #
45
- # @example Parse an invalid domain
45
+ # @example Parse an invalid (unlisted) domain
46
46
  # PublicSuffix.parse("x.yz")
47
- # # => PublicSuffix::DomainInvalid
47
+ # # => #<PublicSuffix::Domain:0x007fec2f49bec0 @sld="x", @tld="yz", @trd=nil>
48
+ #
49
+ # @example Parse an invalid (unlisted) domain with strict checking (without applying the default * rule)
50
+ # PublicSuffix.parse("x.yz", default_rule: nil)
51
+ # # => PublicSuffix::DomainInvalid: `x.yz` is not a valid domain
48
52
  #
49
53
  # @example Parse a URL (not supported, only domains)
50
54
  # PublicSuffix.parse("http://www.google.com")
51
- # # => PublicSuffix::DomainInvalid
55
+ # # => PublicSuffix::DomainInvalid: http://www.google.com is not expected to contain a scheme
52
56
  #
53
57
  #
54
58
  # @param [String, #to_s] name The domain name or fully qualified domain name to parse.
@@ -95,11 +99,11 @@ module PublicSuffix
95
99
  # PublicSuffix.valid?("example.tldnotlisted")
96
100
  # # => true
97
101
  #
98
- # @example Validate a not-allowed domain
99
- # PublicSuffix.valid?("example.do")
100
- # # => false
101
- # PublicSuffix.valid?("www.example.do")
102
+ # @example Validate a not-listed domain with strict checking (without applying the default * rule)
103
+ # PublicSuffix.valid?("example.tldnotlisted")
102
104
  # # => true
105
+ # PublicSuffix.valid?("example.tldnotlisted", default_rule: nil)
106
+ # # => false
103
107
  #
104
108
  # @example Validate a fully qualified domain
105
109
  # PublicSuffix.valid?("google.com.")
@@ -35,12 +35,9 @@ module PublicSuffix
35
35
  # The {PublicSuffix::List.default} rule list is used
36
36
  # to tokenize and validate a domain.
37
37
  #
38
- # {PublicSuffix::List} implements +Enumerable+ module.
39
- #
40
38
  class List
41
- include Enumerable
42
39
 
43
- DEFAULT_LIST_PATH = File.join(File.dirname(__FILE__), "..", "..", "data", "list.txt")
40
+ DEFAULT_LIST_PATH = File.expand_path("../../data/list.txt", __dir__)
44
41
 
45
42
  # Gets the default rule list.
46
43
  #
@@ -62,22 +59,12 @@ module PublicSuffix
62
59
  @default = value
63
60
  end
64
61
 
65
- # Sets the default rule list to +nil+.
66
- #
67
- # @return [self]
68
- def self.clear
69
- self.default = nil
70
- self
71
- end
72
-
73
- # rubocop:disable Metrics/MethodLength
74
-
75
62
  # Parse given +input+ treating the content as Public Suffix List.
76
63
  #
77
64
  # See http://publicsuffix.org/format/ for more details about input format.
78
65
  #
79
66
  # @param input [#each_line] The list to parse.
80
- # @param private_domain [Boolean] whether to ignore the private domains section.
67
+ # @param private_domains [Boolean] whether to ignore the private domains section.
81
68
  # @return [Array<PublicSuffix::Rule::*>]
82
69
  def self.parse(input, private_domains: true)
83
70
  comment_token = "//".freeze
@@ -103,53 +90,21 @@ module PublicSuffix
103
90
  next
104
91
 
105
92
  else
106
- list.add(Rule.factory(line, private: section == 2), reindex: false)
93
+ list.add(Rule.factory(line, private: section == 2))
107
94
 
108
95
  end
109
96
  end
110
97
  end
111
98
  end
112
- # rubocop:enable Metrics/MethodLength
113
-
114
-
115
- # Gets the array of rules.
116
- #
117
- # @return [Array<PublicSuffix::Rule::*>]
118
- attr_reader :rules
119
99
 
120
100
 
121
101
  # Initializes an empty {PublicSuffix::List}.
122
102
  #
123
103
  # @yield [self] Yields on self.
124
104
  # @yieldparam [PublicSuffix::List] self The newly created instance.
125
- #
126
105
  def initialize
127
- @rules = []
106
+ @rules = {}
128
107
  yield(self) if block_given?
129
- reindex!
130
- end
131
-
132
-
133
- # Creates a naive index for +@rules+. Just a hash that will tell
134
- # us where the elements of +@rules+ are relative to its first
135
- # {PublicSuffix::Rule::Base#labels} element.
136
- #
137
- # For instance if @rules[5] and @rules[4] are the only elements of the list
138
- # where Rule#labels.first is 'us' @indexes['us'] #=> [5,4], that way in
139
- # select we can avoid mapping every single rule against the candidate domain.
140
- def reindex!
141
- @indexes = {}
142
- @rules.each_with_index do |rule, index|
143
- tld = Domain.name_to_labels(rule.value).last
144
- @indexes[tld] ||= []
145
- @indexes[tld] << index
146
- end
147
- end
148
-
149
- # Gets the naive index, a hash that with the keys being the first label of
150
- # every rule pointing to an array of integers (indexes of the rules in @rules).
151
- def indexes
152
- @indexes.dup
153
108
  end
154
109
 
155
110
 
@@ -159,42 +114,35 @@ module PublicSuffix
159
114
  # {PublicSuffix::List} and each +PublicSuffix::Rule::*+
160
115
  # in list <tt>one</tt> is available in list <tt>two</tt>, in the same order.
161
116
  #
162
- # @param [PublicSuffix::List] other
163
- # The List to compare.
164
- #
117
+ # @param other [PublicSuffix::List] the List to compare
165
118
  # @return [Boolean]
166
119
  def ==(other)
167
120
  return false unless other.is_a?(List)
168
- equal?(other) || rules == other.rules
121
+ equal?(other) || @rules == other.rules
169
122
  end
170
123
  alias eql? ==
171
124
 
172
125
  # Iterates each rule in the list.
173
- def each(*args, &block)
174
- @rules.each(*args, &block)
126
+ def each(&block)
127
+ Enumerator.new do |y|
128
+ @rules.each do |key, node|
129
+ y << entry_to_rule(node, key)
130
+ end
131
+ end.each(&block)
175
132
  end
176
133
 
177
134
 
178
135
  # Adds the given object to the list and optionally refreshes the rule index.
179
136
  #
180
- # @param [PublicSuffix::Rule::*] rule
181
- # The rule to add to the list.
182
- # @param [Boolean] reindex
183
- # Set to true to recreate the rule index
184
- # after the rule has been added to the list.
185
- #
137
+ # @param rule [PublicSuffix::Rule::*] the rule to add to the list
186
138
  # @return [self]
187
- #
188
- # @see #reindex!
189
- #
190
- def add(rule, reindex: true)
191
- @rules << rule
192
- reindex! if reindex
139
+ def add(rule)
140
+ @rules[rule.value] = rule_to_entry(rule)
193
141
  self
194
142
  end
195
143
  alias << add
196
144
 
197
- # Gets the number of elements in the list.
145
+ # Gets the number of rules in the list.
198
146
  #
199
147
  # @return [Integer]
200
148
  def size
@@ -208,37 +156,18 @@ module PublicSuffix
208
156
  @rules.empty?
209
157
  end
210
158
 
211
- # Removes all elements.
159
+ # Removes all rules.
212
160
  #
213
161
  # @return [self]
214
162
  def clear
215
163
  @rules.clear
216
- reindex!
217
164
  self
218
165
  end
219
166
 
220
- # Finds and returns the most appropriate rule for the domain name.
221
- #
222
- # From the Public Suffix List documentation:
223
- #
224
- # - If a hostname matches more than one rule in the file,
225
- # the longest matching rule (the one with the most levels) will be used.
226
- # - An exclamation mark (!) at the start of a rule marks an exception to a previous wildcard rule.
227
- # An exception rule takes priority over any other matching rule.
228
- #
229
- # ## Algorithm description
230
- #
231
- # 1. Match domain against all rules and take note of the matching ones.
232
- # 2. If no rules match, the prevailing rule is "*".
233
- # 3. If more than one rule matches, the prevailing rule is the one which is an exception rule.
234
- # 4. If there is no matching exception rule, the prevailing rule is the one with the most labels.
235
- # 5. If the prevailing rule is a exception rule, modify it by removing the leftmost label.
236
- # 6. The public suffix is the set of labels from the domain
237
- # which directly match the labels of the prevailing rule (joined by dots).
238
- # 7. The registered domain is the public suffix plus one additional label.
167
+ # Finds and returns the rule corresponding to the longest public suffix for the hostname.
239
168
  #
240
- # @param name [String, #to_s] The domain name.
241
- # @param [PublicSuffix::Rule::*] default The default rule to return in case no rule matches.
169
+ # @param name [#to_s] the hostname
170
+ # @param default [PublicSuffix::Rule::*] the default rule to return in case no rule matches
242
171
  # @return [PublicSuffix::Rule::*]
243
172
  def find(name, default: default_rule, **options)
244
173
  rule = select(name, **options).inject do |l, r|
@@ -248,30 +177,44 @@ module PublicSuffix
248
177
  rule || default
249
178
  end
250
179
 
251
- # Selects all the rules matching given domain.
180
+ # Selects all the rules matching given hostname.
252
181
  #
253
- # Internally, the lookup heavily rely on the `@indexes`. The input is split into labels,
254
- # and we retriever from the index only the rules that end with the input label. After that,
255
- # a sequential scan is performed. In most cases, where the number of rules for the same label
256
- # is limited, this algorithm is efficient enough.
257
- #
258
- # If `ignore_private` is set to true, the algorithm will skip the rules that are flagged as private domain.
259
- # Note that the rules will still be part of the loop. If you frequently need to access lists
260
- # ignoring the private domains, you should create a list that doesn't include these domains setting the
182
+ # If `ignore_private` is set to true, the algorithm will skip the rules that are flagged as
183
+ # private domain. Note that the rules will still be part of the loop.
184
+ # If you frequently need to access lists ignoring the private domains,
185
+ # you should create a list that doesn't include these domains setting the
261
186
  # `private_domains: false` option when calling {.parse}.
262
187
  #
263
- # @param [String, #to_s] name The domain name.
264
- # @param [Boolean] ignore_private
188
+ # Note that this method is currently private, as you should not rely on it. Instead,
189
+ # the public interface is {#find}. The current internal algorithm allows to return all
190
+ # matching rules, but different data structures may not be able to do it, and instead would
191
+ # return only the match. For this reason, you should rely on {#find}.
192
+ #
193
+ # @param name [#to_s] the hostname
194
+ # @param ignore_private [Boolean]
265
195
  # @return [Array<PublicSuffix::Rule::*>]
266
196
  def select(name, ignore_private: false)
267
197
  name = name.to_s
268
- indices = (@indexes[Domain.name_to_labels(name).last] || [])
269
198
 
270
- finder = @rules.values_at(*indices).lazy
271
- finder = finder.select { |rule| rule.match?(name) }
272
- finder = finder.select { |rule| !rule.private } if ignore_private
273
- finder.to_a
199
+ parts = name.split(DOT).reverse!
200
+ index = 0
201
+ query = parts[index]
202
+ rules = []
203
+
204
+ loop do
205
+ match = @rules[query]
206
+ if !match.nil? && (ignore_private == false || match.private == false)
207
+ rules << entry_to_rule(match, query)
208
+ end
209
+
210
+ index += 1
211
+ break if index >= parts.size
212
+ query = parts[index] + DOT + query
213
+ end
214
+
215
+ rules
274
216
  end
217
+ private :select
275
218
 
276
219
  # Gets the default rule.
277
220
  #
@@ -282,5 +225,21 @@ module PublicSuffix
282
225
  PublicSuffix::Rule.default
283
226
  end
284
227
 
228
+
229
+ protected
230
+
231
+ attr_reader :rules
232
+
233
+
234
+ private
235
+
236
+ def entry_to_rule(entry, value)
237
+ entry.type.new(value: value, length: entry.length, private: entry.private)
238
+ end
239
+
240
+ def rule_to_entry(rule)
241
+ Rule::Entry.new(rule.class, rule.length, rule.private)
242
+ end
243
+
285
244
  end
286
245
  end
@@ -19,6 +19,9 @@ module PublicSuffix
19
19
  #
20
20
  module Rule
21
21
 
22
+ # @api internal
23
+ Entry = Struct.new(:type, :length, :private)
24
+
22
25
  # = Abstract rule class
23
26
  #
24
27
  # This represents the base class for a Rule definition
@@ -99,16 +102,28 @@ module PublicSuffix
99
102
  # @return [String] the rule definition
100
103
  attr_reader :value
101
104
 
105
+ # @return [Integer] the length of the rule
106
+ attr_reader :length
107
+
102
108
  # @return [Boolean] true if the rule is a private domain
103
109
  attr_reader :private
104
110
 
105
111
 
106
- # Initializes a new rule with name and value.
107
- # If value is +nil+, name also becomes the value for this rule.
112
+ # Initializes a new rule from the content.
113
+ #
114
+ # @param content [String] the content of the rule
115
+ # @param private [Boolean]
116
+ def self.build(content, private: false)
117
+ new(value: content, private: private)
118
+ end
119
+
120
+ # Initializes a new rule.
108
121
  #
109
- # @param value [String] the value of the rule
110
- def initialize(value, private: false)
122
+ # @param value [String]
123
+ # @param private [Boolean]
124
+ def initialize(value:, length: nil, private: false)
111
125
  @value = value.to_s
126
+ @length = length || @value.count(DOT) + 1
112
127
  @private = private
113
128
  end
114
129
 
@@ -137,12 +152,12 @@ module PublicSuffix
137
152
  # @see https://publicsuffix.org/list/
138
153
  #
139
154
  # @example
140
- # Rule.factory("com").match?("example.com")
155
+ # PublicSuffix::Rule.factory("com").match?("example.com")
141
156
  # # => true
142
- # Rule.factory("com").match?("example.net")
157
+ # PublicSuffix::Rule.factory("com").match?("example.net")
143
158
  # # => false
144
159
  #
145
- # @param name [String, #to_s] The domain name to check.
160
+ # @param name [String] the domain name to check
146
161
  # @return [Boolean]
147
162
  def match?(name)
148
163
  # Note: it works because of the assumption there are no
@@ -150,7 +165,7 @@ module PublicSuffix
150
165
  # we need to properly walk the input and skip parts according
151
166
  # to wildcard component.
152
167
  diff = name.chomp(value)
153
- diff.empty? || diff[-1] == "."
168
+ diff.empty? || diff[-1] == DOT
154
169
  end
155
170
 
156
171
  # @abstract
@@ -158,11 +173,6 @@ module PublicSuffix
158
173
  raise NotImplementedError
159
174
  end
160
175
 
161
- # @abstract
162
- def length
163
- raise NotImplementedError
164
- end
165
-
166
176
  # @abstract
167
177
  # @param [String, #to_s] name The domain name to decompose
168
178
  # @return [Array<String, nil>]
@@ -200,27 +210,26 @@ module PublicSuffix
200
210
  @value.split(DOT)
201
211
  end
202
212
 
203
- # Gets the length of this rule for comparison,
204
- # represented by the number of dot-separated parts in the rule.
205
- #
206
- # @return [Integer] The length of the rule.
207
- def length
208
- @length ||= parts.length
209
- end
210
-
211
213
  end
212
214
 
213
215
  # Wildcard represents a wildcard rule (e.g. *.co.uk).
214
216
  class Wildcard < Base
215
217
 
216
- # Initializes a new rule from +definition+.
218
+ # Initializes a new rule from the content.
217
219
  #
218
- # The wildcard "*" is removed from the value, as it's common
219
- # for each wildcard rule.
220
+ # @param content [String] the content of the rule
221
+ # @param private [Boolean]
222
+ def self.build(content, private: false)
223
+ new(value: content.to_s[2..-1], private: private)
224
+ end
225
+
226
+ # Initializes a new rule.
220
227
  #
221
- # @param definition [String] the rule as defined in the PSL
222
- def initialize(definition, private: false)
223
- super(definition.to_s[2..-1], private: private)
228
+ # @param value [String]
229
+ # @param private [Boolean]
230
+ def initialize(value:, length: nil, private: false)
231
+ super(value: value, length: length, private: private)
232
+ length or @length += 1 # * counts as 1
224
233
  end
225
234
 
226
235
  # Gets the original rule definition.
@@ -248,28 +257,17 @@ module PublicSuffix
248
257
  @value.split(DOT)
249
258
  end
250
259
 
251
- # Gets the length of this rule for comparison,
252
- # represented by the number of dot-separated parts in the rule
253
- # plus 1 for the *.
254
- #
255
- # @return [Integer] The length of the rule.
256
- def length
257
- @length ||= parts.length + 1 # * counts as 1
258
- end
259
-
260
260
  end
261
261
 
262
262
  # Exception represents an exception rule (e.g. !parliament.uk).
263
263
  class Exception < Base
264
264
 
265
- # Initializes a new rule from +definition+.
266
- #
267
- # The bang ! is removed from the value, as it's common
268
- # for each wildcard rule.
265
+ # Initializes a new rule from the content.
269
266
  #
270
- # @param definition [String] the rule as defined in the PSL
271
- def initialize(definition, private: false)
272
- super(definition.to_s[1..-1], private: private)
267
+ # @param content [String] the content of the rule
268
+ # @param private [Boolean]
269
+ def self.build(content, private: false)
270
+ new(value: content.to_s[1..-1], private: private)
273
271
  end
274
272
 
275
273
  # Gets the original rule definition.
@@ -302,14 +300,6 @@ module PublicSuffix
302
300
  @value.split(DOT)[1..-1]
303
301
  end
304
302
 
305
- # Gets the length of this rule for comparison,
306
- # represented by the number of dot-separated parts in the rule.
307
- #
308
- # @return [Integer] The length of the rule.
309
- def length
310
- @length ||= parts.length
311
- end
312
-
313
303
  end
314
304
 
315
305
 
@@ -339,7 +329,7 @@ module PublicSuffix
339
329
  Exception
340
330
  else
341
331
  Normal
342
- end.new(content, private: private)
332
+ end.build(content, private: private)
343
333
  end
344
334
 
345
335
  # The default rule to use if no rule match.