public_suffix 3.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.rubocop.yml +36 -0
  4. data/.rubocop_defaults.yml +179 -0
  5. data/.ruby-gemset +1 -0
  6. data/.travis.yml +31 -0
  7. data/.yardopts +1 -0
  8. data/2.0-Upgrade.md +52 -0
  9. data/CHANGELOG.md +353 -0
  10. data/Gemfile +12 -0
  11. data/LICENSE.txt +22 -0
  12. data/README.md +202 -0
  13. data/Rakefile +51 -0
  14. data/bin/console +15 -0
  15. data/data/list.txt +12966 -0
  16. data/lib/public_suffix.rb +179 -0
  17. data/lib/public_suffix/domain.rb +235 -0
  18. data/lib/public_suffix/errors.rb +41 -0
  19. data/lib/public_suffix/list.rb +247 -0
  20. data/lib/public_suffix/rule.rb +350 -0
  21. data/lib/public_suffix/version.rb +13 -0
  22. data/public_suffix.gemspec +25 -0
  23. data/test/.empty +2 -0
  24. data/test/acceptance_test.rb +129 -0
  25. data/test/benchmarks/bm_find.rb +66 -0
  26. data/test/benchmarks/bm_find_all.rb +102 -0
  27. data/test/benchmarks/bm_names.rb +91 -0
  28. data/test/benchmarks/bm_select.rb +26 -0
  29. data/test/benchmarks/bm_select_incremental.rb +25 -0
  30. data/test/benchmarks/bm_valid.rb +101 -0
  31. data/test/profilers/domain_profiler.rb +12 -0
  32. data/test/profilers/find_profiler.rb +12 -0
  33. data/test/profilers/find_profiler_jp.rb +12 -0
  34. data/test/profilers/initialization_profiler.rb +11 -0
  35. data/test/profilers/list_profsize.rb +11 -0
  36. data/test/profilers/object_binsize.rb +57 -0
  37. data/test/psl_test.rb +52 -0
  38. data/test/test_helper.rb +18 -0
  39. data/test/tests.txt +98 -0
  40. data/test/unit/domain_test.rb +106 -0
  41. data/test/unit/errors_test.rb +25 -0
  42. data/test/unit/list_test.rb +241 -0
  43. data/test/unit/public_suffix_test.rb +188 -0
  44. data/test/unit/rule_test.rb +222 -0
  45. metadata +151 -0
@@ -0,0 +1,247 @@
1
+ # frozen_string_literal: true
2
+
3
+ # = Public Suffix
4
+ #
5
+ # Domain name parser based on the Public Suffix List.
6
+ #
7
+ # Copyright (c) 2009-2019 Simone Carletti <weppos@weppos.net>
8
+
9
+ module PublicSuffix
10
+
11
+ # A {PublicSuffix::List} is a collection of one
12
+ # or more {PublicSuffix::Rule}.
13
+ #
14
+ # Given a {PublicSuffix::List},
15
+ # you can add or remove {PublicSuffix::Rule},
16
+ # iterate all items in the list or search for the first rule
17
+ # which matches a specific domain name.
18
+ #
19
+ # # Create a new list
20
+ # list = PublicSuffix::List.new
21
+ #
22
+ # # Push two rules to the list
23
+ # list << PublicSuffix::Rule.factory("it")
24
+ # list << PublicSuffix::Rule.factory("com")
25
+ #
26
+ # # Get the size of the list
27
+ # list.size
28
+ # # => 2
29
+ #
30
+ # # Search for the rule matching given domain
31
+ # list.find("example.com")
32
+ # # => #<PublicSuffix::Rule::Normal>
33
+ # list.find("example.org")
34
+ # # => nil
35
+ #
36
+ # You can create as many {PublicSuffix::List} as you want.
37
+ # The {PublicSuffix::List.default} rule list is used
38
+ # to tokenize and validate a domain.
39
+ #
40
+ class List
41
+
42
+ DEFAULT_LIST_PATH = File.expand_path("../../data/list.txt", __dir__)
43
+
44
# Gets the default rule list.
#
# The list is built on first use by parsing the content of the file at
# {DEFAULT_LIST_PATH}, and then memoized: +options+ are only honoured by
# the call that actually builds the list.
#
# @param options [Hash] keyword options forwarded to {.parse}
#   (e.g. +private_domains: false+)
# @return [PublicSuffix::List]
def self.default(**options)
  # Forward the options as keywords. Passing the Hash positionally relies on
  # the implicit Hash-to-keywords conversion that Ruby 3 removed, and raises
  # ArgumentError there.
  @default ||= parse(File.read(DEFAULT_LIST_PATH), **options)
end
53
+
54
+ # Sets the default rule list to +value+.
55
+ #
56
+ # @param value [PublicSuffix::List] the new list
57
+ # @return [PublicSuffix::List]
58
+ def self.default=(value)
59
+ @default = value
60
+ end
61
+
62
+ # Parse given +input+ treating the content as Public Suffix List.
63
+ #
64
+ # See http://publicsuffix.org/format/ for more details about input format.
65
+ #
66
+ # @param input [#each_line] the list to parse
67
+ # @param private_domains [Boolean] whether to include the private domains section (when false, parsing stops at its marker)
68
+ # @return [PublicSuffix::List]
69
+ def self.parse(input, private_domains: true)
70
+ comment_token = "//"
71
+ private_token = "===BEGIN PRIVATE DOMAINS==="
72
+ section = nil # 1 == ICANN, 2 == PRIVATE
73
+
74
+ new do |list|
75
+ input.each_line do |line|
76
+ line.strip!
77
+ case # rubocop:disable Style/EmptyCaseCondition
78
+
79
+ # skip blank lines
80
+ when line.empty?
81
+ next
82
+
83
+ # include private domains or stop scanner
84
+ when line.include?(private_token)
85
+ break if !private_domains
86
+
87
+ section = 2
88
+
89
+ # skip comments
90
+ when line.start_with?(comment_token)
91
+ next
92
+
93
+ else
94
+ list.add(Rule.factory(line, private: section == 2))
95
+
96
+ end
97
+ end
98
+ end
99
+ end
100
+
101
+
102
+ # Initializes an empty {PublicSuffix::List}.
103
+ #
104
+ # @yield [self] Yields on self.
105
+ # @yieldparam [PublicSuffix::List] self The newly created instance.
106
+ def initialize
107
+ @rules = {}
108
+ yield(self) if block_given?
109
+ end
110
+
111
+
112
+ # Checks whether two lists are equal.
113
+ #
114
+ # List <tt>one</tt> is equal to <tt>two</tt>, if <tt>two</tt> is an instance of
115
+ # {PublicSuffix::List} and each +PublicSuffix::Rule::*+
116
+ # in list <tt>one</tt> is available in list <tt>two</tt> and vice versa (order is not considered).
117
+ #
118
+ # @param other [PublicSuffix::List] the List to compare
119
+ # @return [Boolean]
120
+ def ==(other)
121
+ return false unless other.is_a?(List)
122
+
123
+ equal?(other) || @rules == other.rules
124
+ end
125
+ alias eql? ==
126
+
127
+ # Iterates each rule in the list.
128
+ def each(&block)
129
+ Enumerator.new do |y|
130
+ @rules.each do |key, node|
131
+ y << entry_to_rule(node, key)
132
+ end
133
+ end.each(&block)
134
+ end
135
+
136
+
137
+ # Adds the given rule to the list, replacing any rule already stored with the same value.
138
+ #
139
+ # @param rule [PublicSuffix::Rule::*] the rule to add to the list
140
+ # @return [self]
141
+ def add(rule)
142
+ @rules[rule.value] = rule_to_entry(rule)
143
+ self
144
+ end
145
+ alias << add
146
+
147
+ # Gets the number of rules in the list.
148
+ #
149
+ # @return [Integer]
150
+ def size
151
+ @rules.size
152
+ end
153
+
154
+ # Checks whether the list is empty.
155
+ #
156
+ # @return [Boolean]
157
+ def empty?
158
+ @rules.empty?
159
+ end
160
+
161
+ # Removes all rules.
162
+ #
163
+ # @return [self]
164
+ def clear
165
+ @rules.clear
166
+ self
167
+ end
168
+
169
+ # Finds and returns the rule corresponding to the longest public suffix for the hostname.
170
+ #
171
+ # @param name [#to_s] the hostname
172
+ # @param default [PublicSuffix::Rule::*] the default rule to return in case no rule matches
173
+ # @return [PublicSuffix::Rule::*]
174
+ def find(name, default: default_rule, **options)
175
+ rule = select(name, **options).inject do |l, r|
176
+ return r if r.class == Rule::Exception
177
+
178
+ l.length > r.length ? l : r
179
+ end
180
+ rule || default
181
+ end
182
+
183
+ # Selects all the rules matching given hostname.
184
+ #
185
+ # If `ignore_private` is set to true, the algorithm will skip the rules that are flagged as
186
+ # private domain. Note that the rules will still be part of the loop.
187
+ # If you frequently need to access lists ignoring the private domains,
188
+ # you should create a list that doesn't include these domains setting the
189
+ # `private_domains: false` option when calling {.parse}.
190
+ #
191
+ # Note that this method is currently private, as you should not rely on it. Instead,
192
+ # the public interface is {#find}. The current internal algorithm allows to return all
193
+ # matching rules, but different data structures may not be able to do it, and instead would
194
+ # return only the match. For this reason, you should rely on {#find}.
195
+ #
196
+ # @param name [#to_s] the hostname
197
+ # @param ignore_private [Boolean]
198
+ # @return [Array<PublicSuffix::Rule::*>]
199
+ def select(name, ignore_private: false)
200
+ name = name.to_s
201
+
202
+ parts = name.split(DOT).reverse!
203
+ index = 0
204
+ query = parts[index]
205
+ rules = []
206
+
207
+ loop do
208
+ match = @rules[query]
209
+ rules << entry_to_rule(match, query) if !match.nil? && (ignore_private == false || match.private == false)
210
+
211
+ index += 1
212
+ break if index >= parts.size
213
+
214
+ query = parts[index] + DOT + query
215
+ end
216
+
217
+ rules
218
+ end
219
+ private :select # rubocop:disable Style/AccessModifierDeclarations
220
+
221
+ # Gets the default rule.
222
+ #
223
+ # @see PublicSuffix::Rule.default
224
+ #
225
+ # @return [PublicSuffix::Rule::*]
226
+ def default_rule
227
+ PublicSuffix::Rule.default
228
+ end
229
+
230
+
231
+ protected
232
+
233
+ attr_reader :rules
234
+
235
+
236
+ private
237
+
238
+ def entry_to_rule(entry, value)
239
+ entry.type.new(value: value, length: entry.length, private: entry.private)
240
+ end
241
+
242
+ def rule_to_entry(rule)
243
+ Rule::Entry.new(rule.class, rule.length, rule.private)
244
+ end
245
+
246
+ end
247
+ end
@@ -0,0 +1,350 @@
1
+ # frozen_string_literal: true
2
+
3
+ # = Public Suffix
4
+ #
5
+ # Domain name parser based on the Public Suffix List.
6
+ #
7
+ # Copyright (c) 2009-2019 Simone Carletti <weppos@weppos.net>
8
+
9
+ module PublicSuffix
10
+
11
+ # A Rule is a special object which holds a single definition
12
+ # of the Public Suffix List.
13
+ #
14
+ # There are 3 types of rules, each one represented by a specific
15
+ # subclass within the +PublicSuffix::Rule+ namespace.
16
+ #
17
+ # To create a new Rule, use the {PublicSuffix::Rule.factory} method.
18
+ #
19
+ # PublicSuffix::Rule.factory("ar")
20
+ # # => #<PublicSuffix::Rule::Normal>
21
+ #
22
+ module Rule
23
+
24
+ # @api internal
25
+ Entry = Struct.new(:type, :length, :private)
26
+
27
+ # = Abstract rule class
28
+ #
29
+ # This represents the base class for a Rule definition
30
+ # in the {Public Suffix List}[https://publicsuffix.org].
31
+ #
32
+ # This is intended to be an Abstract class
33
+ # and you shouldn't create a direct instance. The only purpose
34
+ # of this class is to expose a common interface
35
+ # for all the available subclasses.
36
+ #
37
+ # * {PublicSuffix::Rule::Normal}
38
+ # * {PublicSuffix::Rule::Exception}
39
+ # * {PublicSuffix::Rule::Wildcard}
40
+ #
41
+ # ## Properties
42
+ #
43
+ # A rule is composed by 4 properties:
44
+ #
45
+ # value - A normalized version of the rule name.
46
+ # The normalization process depends on rule type.
47
+ #
48
+ # Here's an example
49
+ #
50
+ # PublicSuffix::Rule.factory("*.google.com")
51
+ # #<PublicSuffix::Rule::Wildcard:0x1015c14b0
52
+ # @value="google.com"
53
+ # >
54
+ #
55
+ # ## Rule Creation
56
+ #
57
+ # The best way to create a new rule is passing the rule name
58
+ # to the <tt>PublicSuffix::Rule.factory</tt> method.
59
+ #
60
+ # PublicSuffix::Rule.factory("com")
61
+ # # => PublicSuffix::Rule::Normal
62
+ #
63
+ # PublicSuffix::Rule.factory("*.com")
64
+ # # => PublicSuffix::Rule::Wildcard
65
+ #
66
+ # This method will detect the rule type and create an instance
67
+ # from the proper rule class.
68
+ #
69
+ # ## Rule Usage
70
+ #
71
+ # A rule describes the composition of a domain name and explains how to tokenize
72
+ # the name into tld, sld and trd.
73
+ #
74
+ # To use a rule, you first need to be sure the name you want to tokenize
75
+ # can be handled by the current rule.
76
+ # You can use the <tt>#match?</tt> method.
77
+ #
78
+ # rule = PublicSuffix::Rule.factory("com")
79
+ #
80
+ # rule.match?("google.com")
81
+ # # => true
82
+ #
83
+ # rule.match?("google.net")
84
+ # # => false
85
+ #
86
+ # Rule order is significant. A name can match more than one rule.
87
+ # See the {Public Suffix Documentation}[http://publicsuffix.org/format/]
88
+ # to learn more about rule priority.
89
+ #
90
+ # When you have the right rule, you can use it to tokenize the domain name.
91
+ #
92
+ # rule = PublicSuffix::Rule.factory("com")
93
+ #
94
+ # rule.decompose("google.com")
95
+ # # => ["google", "com"]
96
+ #
97
+ # rule.decompose("www.google.com")
98
+ # # => ["www.google", "com"]
99
+ #
100
+ # @abstract
101
+ #
102
+ class Base
103
+
104
+ # @return [String] the rule definition
105
+ attr_reader :value
106
+
107
+ # @return [Integer] the length of the rule
108
+ attr_reader :length
109
+
110
+ # @return [Boolean] true if the rule is a private domain
111
+ attr_reader :private
112
+
113
+
114
+ # Initializes a new rule from the content.
115
+ #
116
+ # @param content [String] the content of the rule
117
+ # @param private [Boolean]
118
+ def self.build(content, private: false)
119
+ new(value: content, private: private)
120
+ end
121
+
122
+ # Initializes a new rule.
123
+ #
124
+ # @param value [String]
125
+ # @param private [Boolean]
126
+ def initialize(value:, length: nil, private: false)
127
+ @value = value.to_s
128
+ @length = length || @value.count(DOT) + 1
129
+ @private = private
130
+ end
131
+
132
+ # Checks whether this rule is equal to <tt>other</tt>.
133
+ #
134
+ # @param [PublicSuffix::Rule::*] other The rule to compare
135
+ # @return [Boolean]
136
+ # Returns true if this rule and other are instances of the same class
137
+ # and have the same value, false otherwise.
138
+ def ==(other)
139
+ equal?(other) || (self.class == other.class && value == other.value)
140
+ end
141
+ alias eql? ==
142
+
143
+ # Checks if this rule matches +name+.
144
+ #
145
+ # A domain name is said to match a rule if and only if
146
+ # all of the following conditions are met:
147
+ #
148
+ # - When the domain and rule are split into corresponding labels,
149
+ # that the domain contains as many or more labels than the rule.
150
+ # - Beginning with the right-most labels of both the domain and the rule,
151
+ # and continuing for all labels in the rule, one finds that for every pair,
152
+ # either they are identical, or that the label from the rule is "*".
153
+ #
154
+ # @see https://publicsuffix.org/list/
155
+ #
156
+ # @example
157
+ # PublicSuffix::Rule.factory("com").match?("example.com")
158
+ # # => true
159
+ # PublicSuffix::Rule.factory("com").match?("example.net")
160
+ # # => false
161
+ #
162
+ # @param name [String] the domain name to check
163
+ # @return [Boolean]
164
+ def match?(name)
165
+ # Note: it works because of the assumption there are no
166
+ # rules like foo.*.com. If the assumption is incorrect,
167
+ # we need to properly walk the input and skip parts according
168
+ # to wildcard component.
169
+ diff = name.chomp(value)
170
+ diff.empty? || diff.end_with?(DOT)
171
+ end
172
+
173
+ # @abstract
174
+ def parts
175
+ raise NotImplementedError
176
+ end
177
+
178
+ # @abstract
179
+ # @param [String, #to_s] name The domain name to decompose
180
+ # @return [Array<String, nil>]
181
+ def decompose(*)
182
+ raise NotImplementedError
183
+ end
184
+
185
+ end
186
+
187
+ # Normal represents a standard rule (e.g. com).
188
+ class Normal < Base
189
+
190
+ # Gets the original rule definition.
191
+ #
192
+ # @return [String] The rule definition.
193
+ def rule
194
+ value
195
+ end
196
+
197
# Decomposes the domain name according to rule properties.
#
# The whole +domain+ has to match: the pattern is anchored with \A and \z
# (not ^ and $, which match per line), so a name embedding a newline is
# rejected instead of being matched on its last line only.
#
# @param domain [String, #to_s] The domain name to decompose
# @return [Array<String>, Array<nil>] The array with [trd + sld, tld],
#   or [nil, nil] when the domain does not end with this rule.
def decompose(domain)
  # Rule labels are matched literally, hence the escaping.
  suffix = parts.map { |label| Regexp.escape(label) }.join('\.')
  matches = domain.to_s.match(/\A(.*)\.(#{suffix})\z/)
  matches ? matches[1..2] : [nil, nil]
end
206
+
207
+ # dot-split rule value and returns all rule parts
208
+ # in the order they appear in the value.
209
+ #
210
+ # @return [Array<String>]
211
+ def parts
212
+ @value.split(DOT)
213
+ end
214
+
215
+ end
216
+
217
+ # Wildcard represents a wildcard rule (e.g. *.co.uk).
218
+ class Wildcard < Base
219
+
220
+ # Initializes a new rule from the content.
221
+ #
222
+ # @param content [String] the content of the rule
223
+ # @param private [Boolean]
224
+ def self.build(content, private: false)
225
+ new(value: content.to_s[2..-1], private: private)
226
+ end
227
+
228
+ # Initializes a new rule.
229
+ #
230
+ # @param value [String]
231
+ # @param private [Boolean]
232
+ def initialize(value:, length: nil, private: false)
233
+ super(value: value, length: length, private: private)
234
+ length or @length += 1 # * counts as 1
235
+ end
236
+
237
+ # Gets the original rule definition.
238
+ #
239
+ # @return [String] The rule definition.
240
+ def rule
241
+ value == "" ? STAR : STAR + DOT + value
242
+ end
243
+
244
# Decomposes the domain name according to rule properties.
#
# The label covered by the wildcard is part of the returned suffix.
# The whole +domain+ has to match: the pattern is anchored with \A and \z
# (not ^ and $, which match per line), so a name embedding a newline is
# rejected instead of being matched on its last line only.
#
# @param domain [String, #to_s] The domain name to decompose
# @return [Array<String>, Array<nil>] The array with [trd + sld, tld],
#   or [nil, nil] when the domain does not match this rule.
def decompose(domain)
  # ".*?" is a regexp fragment standing for the wildcard label; the labels
  # coming from the rule value are matched literally, hence the escaping.
  suffix = ([".*?"] + parts.map { |label| Regexp.escape(label) }).join('\.')
  matches = domain.to_s.match(/\A(.*)\.(#{suffix})\z/)
  matches ? matches[1..2] : [nil, nil]
end
253
+
254
+ # dot-split rule value and returns all rule parts
255
+ # in the order they appear in the value.
256
+ #
257
+ # @return [Array<String>]
258
+ def parts
259
+ @value.split(DOT)
260
+ end
261
+
262
+ end
263
+
264
+ # Exception represents an exception rule (e.g. !parliament.uk).
265
+ class Exception < Base
266
+
267
+ # Initializes a new rule from the content.
268
+ #
269
+ # @param content [String] the content of the rule
270
+ # @param private [Boolean]
271
+ def self.build(content, private: false)
272
+ new(value: content.to_s[1..-1], private: private)
273
+ end
274
+
275
+ # Gets the original rule definition.
276
+ #
277
+ # @return [String] The rule definition.
278
+ def rule
279
+ BANG + value
280
+ end
281
+
282
# Decomposes the domain name according to rule properties.
#
# The suffix is built from {#parts}, i.e. the rule value without its
# leftmost label. The whole +domain+ has to match: the pattern is anchored
# with \A and \z (not ^ and $, which match per line), so a name embedding
# a newline is rejected instead of being matched on its last line only.
#
# @param domain [String, #to_s] The domain name to decompose
# @return [Array<String>, Array<nil>] The array with [trd + sld, tld],
#   or [nil, nil] when the domain does not match this rule.
def decompose(domain)
  # Rule labels are matched literally, hence the escaping.
  suffix = parts.map { |label| Regexp.escape(label) }.join('\.')
  matches = domain.to_s.match(/\A(.*)\.(#{suffix})\z/)
  matches ? matches[1..2] : [nil, nil]
end
291
+
292
+ # dot-split rule value and returns all rule parts
293
+ # in the order they appear in the value.
294
+ # The leftmost label is not considered a label.
295
+ #
296
+ # See http://publicsuffix.org/format/:
297
+ # If the prevailing rule is an exception rule,
298
+ # modify it by removing the leftmost label.
299
+ #
300
+ # @return [Array<String>]
301
+ def parts
302
+ @value.split(DOT)[1..-1]
303
+ end
304
+
305
+ end
306
+
307
+
308
+ # Takes the +name+ of the rule, detects the specific rule class
309
+ # and creates a new instance of that class.
310
+ # The +name+ becomes the rule +value+.
311
+ #
312
+ # @example Creates a Normal rule
313
+ # PublicSuffix::Rule.factory("ar")
314
+ # # => #<PublicSuffix::Rule::Normal>
315
+ #
316
+ # @example Creates a Wildcard rule
317
+ # PublicSuffix::Rule.factory("*.ar")
318
+ # # => #<PublicSuffix::Rule::Wildcard>
319
+ #
320
+ # @example Creates an Exception rule
321
+ # PublicSuffix::Rule.factory("!congresodelalengua3.ar")
322
+ # # => #<PublicSuffix::Rule::Exception>
323
+ #
324
+ # @param [String] content The rule content.
325
+ # @return [PublicSuffix::Rule::*] A rule instance.
326
+ def self.factory(content, private: false)
327
+ case content.to_s[0, 1]
328
+ when STAR
329
+ Wildcard
330
+ when BANG
331
+ Exception
332
+ else
333
+ Normal
334
+ end.build(content, private: private)
335
+ end
336
+
337
+ # The default rule to use if no rule matches.
338
+ #
339
+ # The default rule is "*". From https://publicsuffix.org/list/:
340
+ #
341
+ # > If no rules match, the prevailing rule is "*".
342
+ #
343
+ # @return [PublicSuffix::Rule::Wildcard] The default rule.
344
+ def self.default
345
+ factory(STAR)
346
+ end
347
+
348
+ end
349
+
350
+ end