RubyGems - public_suffix - Versions diffs - 2.0.5 → 3.0.0 - Mend

public_suffix 2.0.5 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

checksums.yaml +4 -4
data/.rubocop.yml +14 -0
data/.rubocop_defaults.yml +54 -15
data/.travis.yml +6 -3
data/CHANGELOG.md +18 -9
data/Gemfile +2 -2
data/README.md +14 -0
data/Rakefile +5 -3
data/bin/console +14 -0
data/data/list.txt +462 -82
data/lib/public_suffix.rb +20 -16
data/lib/public_suffix/list.rb +66 -107
data/lib/public_suffix/rule.rb +42 -52
data/lib/public_suffix/version.rb +1 -1
data/public_suffix.gemspec +1 -1
data/test/.empty +2 -0
data/test/acceptance_test.rb +10 -2
data/test/benchmarks/bm_find.rb +66 -0
data/test/benchmarks/bm_find_all.rb +102 -0
data/test/benchmarks/bm_names.rb +91 -0
data/test/benchmarks/bm_select.rb +26 -0
data/test/benchmarks/bm_select_incremental.rb +25 -0
data/test/benchmarks/bm_valid.rb +101 -0
data/test/profilers/domain_profiler.rb +12 -0
data/test/profilers/find_profiler.rb +12 -0
data/test/profilers/find_profiler_jp.rb +12 -0
data/test/{initialization_profiler.rb → profilers/initialization_profiler.rb} +1 -1
data/test/profilers/list_profsize.rb +11 -0
data/test/profilers/object_binsize.rb +57 -0
data/test/psl_test.rb +1 -1
data/test/test_helper.rb +0 -8
data/test/unit/domain_test.rb +15 -15
data/test/unit/list_test.rb +46 -66
data/test/unit/public_suffix_test.rb +5 -5
data/test/unit/rule_test.rb +45 -49
metadata +30 -12
data/test/benchmark_helper.rb +0 -4
data/test/execution_profiler.rb +0 -14
data/test/performance_benchmark.rb +0 -38

data/lib/public_suffix.rb CHANGED

@@ -4,11 +4,11 @@
 #
 # Copyright (c) 2009-2017 Simone Carletti <weppos@weppos.net>
-require "public_suffix/domain"
-require "public_suffix/version"
-require "public_suffix/errors"
-require "public_suffix/rule"
-require "public_suffix/list"
+require_relative "public_suffix/domain"
+require_relative "public_suffix/version"
+require_relative "public_suffix/errors"
+require_relative "public_suffix/rule"
+require_relative "public_suffix/list"
 # PublicSuffix is a Ruby domain name parser based on the Public Suffix List.
 #
@@ -28,27 +28,31 @@ module PublicSuffix
   #
   # @example Parse a valid domain
   #   PublicSuffix.parse("google.com")
-  #   # => #<PublicSuffix::Domain ...>
+  #   # => #<PublicSuffix::Domain:0x007fec2e51e588 @sld="google", @tld="com", @trd=nil>
   #
   # @example Parse a valid subdomain
   #   PublicSuffix.parse("www.google.com")
-  #   # => #<PublicSuffix::Domain ...>
+  #   # => #<PublicSuffix::Domain:0x007fec276d4cf8 @sld="google", @tld="com", @trd="www">
   #
   # @example Parse a fully qualified domain
   #   PublicSuffix.parse("google.com.")
-  #   # => #<PublicSuffix::Domain ...>
+  #   # => #<PublicSuffix::Domain:0x007fec257caf38 @sld="google", @tld="com", @trd=nil>
   #
   # @example Parse a fully qualified domain (subdomain)
   #   PublicSuffix.parse("www.google.com.")
-  #   # => #<PublicSuffix::Domain ...>
+  #   # => #<PublicSuffix::Domain:0x007fec27b6bca8 @sld="google", @tld="com", @trd="www">
   #
-  # @example Parse an invalid domain
+  # @example Parse an invalid (unlisted) domain
   #   PublicSuffix.parse("x.yz")
-  #   # => PublicSuffix::DomainInvalid
+  #   # => #<PublicSuffix::Domain:0x007fec2f49bec0 @sld="x", @tld="yz", @trd=nil>
+  #
+  # @example Parse an invalid (unlisted) domain with strict checking (without applying the default * rule)
+  #   PublicSuffix.parse("x.yz", default_rule: nil)
+  #   # => PublicSuffix::DomainInvalid: `x.yz` is not a valid domain
   #
   # @example Parse an URL (not supported, only domains)
   #   PublicSuffix.parse("http://www.google.com")
-  #   # => PublicSuffix::DomainInvalid
+  #   # => PublicSuffix::DomainInvalid: http://www.google.com is not expected to contain a scheme
   #
   #
   # @param  [String, #to_s] name The domain name or fully qualified domain name to parse.
@@ -95,11 +99,11 @@ module PublicSuffix
   #   PublicSuffix.valid?("example.tldnotlisted")
   #   # => true
   #
-  # @example Validate a not-allowed domain
-  #   PublicSuffix.valid?("example.do")
-  #   # => false
-  #   PublicSuffix.valid?("www.example.do")
+  # @example Validate a not-listed domain with strict checking (without applying the default * rule)
+  #   PublicSuffix.valid?("example.tldnotlisted")
   #   # => true
+  #   PublicSuffix.valid?("example.tldnotlisted", default_rule: nil)
+  #   # => false
   #
   # @example Validate a fully qualified domain
   #   PublicSuffix.valid?("google.com.")

data/lib/public_suffix/list.rb CHANGED

@@ -35,12 +35,9 @@ module PublicSuffix
   # The {PublicSuffix::List.default} rule list is used
   # to tokenize and validate a domain.
   #
-  # {PublicSuffix::List} implements +Enumerable+ module.
-  #
   class List
-    include Enumerable
-    DEFAULT_LIST_PATH = File.join(File.dirname(__FILE__), "..", "..", "data", "list.txt")
+    DEFAULT_LIST_PATH = File.expand_path("../../data/list.txt", __dir__)
     # Gets the default rule list.
     #
@@ -62,22 +59,12 @@ module PublicSuffix
       @default = value
     end
-    # Sets the default rule list to +nil+.
-    #
-    # @return [self]
-    def self.clear
-      self.default = nil
-      self
-    end
-    # rubocop:disable Metrics/MethodLength
     # Parse given +input+ treating the content as Public Suffix List.
     #
     # See http://publicsuffix.org/format/ for more details about input format.
     #
     # @param  string [#each_line] The list to parse.
-    # @param  private_domain [Boolean] whether to ignore the private domains section.
+    # @param  private_domains [Boolean] whether to ignore the private domains section.
     # @return [Array<PublicSuffix::Rule::*>]
     def self.parse(input, private_domains: true)
       comment_token = "//".freeze
@@ -103,53 +90,21 @@ module PublicSuffix
             next
           else
-            list.add(Rule.factory(line, private: section == 2), reindex: false)
+            list.add(Rule.factory(line, private: section == 2))
           end
         end
       end
     end
-    # rubocop:enable Metrics/MethodLength
-    # Gets the array of rules.
-    #
-    # @return [Array<PublicSuffix::Rule::*>]
-    attr_reader :rules
     # Initializes an empty {PublicSuffix::List}.
     #
     # @yield [self] Yields on self.
     # @yieldparam [PublicSuffix::List] self The newly created instance.
-    #
     def initialize
-      @rules = []
+      @rules = {}
       yield(self) if block_given?
-      reindex!
-    end
-    # Creates a naive index for +@rules+. Just a hash that will tell
-    # us where the elements of +@rules+ are relative to its first
-    # {PublicSuffix::Rule::Base#labels} element.
-    #
-    # For instance if @rules[5] and @rules[4] are the only elements of the list
-    # where Rule#labels.first is 'us' @indexes['us'] #=> [5,4], that way in
-    # select we can avoid mapping every single rule against the candidate domain.
-    def reindex!
-      @indexes = {}
-      @rules.each_with_index do |rule, index|
-        tld = Domain.name_to_labels(rule.value).last
-        @indexes[tld] ||= []
-        @indexes[tld] << index
-      end
-    end
-    # Gets the naive index, a hash that with the keys being the first label of
-    # every rule pointing to an array of integers (indexes of the rules in @rules).
-    def indexes
-      @indexes.dup
     end
@@ -159,42 +114,35 @@ module PublicSuffix
     # {PublicSuffix::List} and each +PublicSuffix::Rule::*+
     # in list <tt>one</tt> is available in list <tt>two</tt>, in the same order.
     #
-    # @param [PublicSuffix::List] other
-    #   The List to compare.
-    #
+    # @param  other [PublicSuffix::List] the List to compare
     # @return [Boolean]
     def ==(other)
       return false unless other.is_a?(List)
-      equal?(other) || rules == other.rules
+      equal?(other) || @rules == other.rules
     end
     alias eql? ==
     # Iterates each rule in the list.
-    def each(*args, &block)
-      @rules.each(*args, &block)
+    def each(&block)
+      Enumerator.new do |y|
+        @rules.each do |key, node|
+          y << entry_to_rule(node, key)
+        end
+      end.each(&block)
     end
     # Adds the given object to the list and optionally refreshes the rule index.
     #
-    # @param [PublicSuffix::Rule::*] rule
-    #   The rule to add to the list.
-    # @param [Boolean] reindex
-    #   Set to true to recreate the rule index
-    #   after the rule has been added to the list.
-    #
+    # @param  rule [PublicSuffix::Rule::*] the rule to add to the list
     # @return [self]
-    #
-    # @see #reindex!
-    #
-    def add(rule, reindex: true)
-      @rules << rule
-      reindex! if reindex
+    def add(rule)
+      @rules[rule.value] = rule_to_entry(rule)
       self
     end
     alias << add
-    # Gets the number of elements in the list.
+    # Gets the number of rules in the list.
     #
     # @return [Integer]
     def size
@@ -208,37 +156,18 @@ module PublicSuffix
       @rules.empty?
     end
-    # Removes all elements.
+    # Removes all rules.
     #
     # @return [self]
     def clear
       @rules.clear
-      reindex!
       self
     end
-    # Finds and returns the most appropriate rule for the domain name.
-    #
-    # From the Public Suffix List documentation:
-    #
-    # - If a hostname matches more than one rule in the file,
-    #   the longest matching rule (the one with the most levels) will be used.
-    # - An exclamation mark (!) at the start of a rule marks an exception to a previous wildcard rule.
-    #   An exception rule takes priority over any other matching rule.
-    #
-    # ## Algorithm description
-    #
-    # 1. Match domain against all rules and take note of the matching ones.
-    # 2. If no rules match, the prevailing rule is "*".
-    # 3. If more than one rule matches, the prevailing rule is the one which is an exception rule.
-    # 4. If there is no matching exception rule, the prevailing rule is the one with the most labels.
-    # 5. If the prevailing rule is a exception rule, modify it by removing the leftmost label.
-    # 6. The public suffix is the set of labels from the domain
-    #    which directly match the labels of the prevailing rule (joined by dots).
-    # 7. The registered domain is the public suffix plus one additional label.
+    # Finds and returns the rule corresponding to the longest public suffix for the hostname.
     #
-    # @param  name [String, #to_s] The domain name.
-    # @param  [PublicSuffix::Rule::*] default The default rule to return in case no rule matches.
+    # @param  name [#to_s] the hostname
+    # @param  default [PublicSuffix::Rule::*] the default rule to return in case no rule matches
     # @return [PublicSuffix::Rule::*]
     def find(name, default: default_rule, **options)
       rule = select(name, **options).inject do |l, r|
@@ -248,30 +177,44 @@ module PublicSuffix
       rule || default
     end
-    # Selects all the rules matching given domain.
+    # Selects all the rules matching the given hostname.
     #
-    # Internally, the lookup heavily rely on the `@indexes`. The input is split into labels,
-    # and we retriever from the index only the rules that end with the input label. After that,
-    # a sequential scan is performed. In most cases, where the number of rules for the same label
-    # is limited, this algorithm is efficient enough.
-    #
-    # If `ignore_private` is set to true, the algorithm will skip the rules that are flagged as private domain.
-    # Note that the rules will still be part of the loop. If you frequently need to access lists
-    # ignoring the private domains, you should create a list that doesn't include these domains setting the
+    # If `ignore_private` is set to true, the algorithm will skip the rules that are flagged as
+    # private domain. Note that the rules will still be part of the loop.
+    # If you frequently need to access lists ignoring the private domains,
+    # you should create a list that doesn't include these domains setting the
     # `private_domains: false` option when calling {.parse}.
     #
-    # @param  [String, #to_s] name The domain name.
-    # @param  [Boolean] ignore_private
+    # Note that this method is currently private, as you should not rely on it. Instead,
+    # the public interface is {#find}. The current internal algorithm is able to return all
+    # matching rules, but a different data structure may not be able to do so, and would
+    # instead return only a single match. For this reason, you should rely on {#find}.
+    #
+    # @param  name [#to_s] the hostname
+    # @param  ignore_private [Boolean]
     # @return [Array<PublicSuffix::Rule::*>]
     def select(name, ignore_private: false)
       name = name.to_s
-      indices = (@indexes[Domain.name_to_labels(name).last] || [])
-      finder = @rules.values_at(*indices).lazy
-      finder = finder.select { |rule| rule.match?(name) }
-      finder = finder.select { |rule| !rule.private } if ignore_private
-      finder.to_a
+      parts = name.split(DOT).reverse!
+      index = 0
+      query = parts[index]
+      rules = []
+      loop do
+        match = @rules[query]
+        if !match.nil? && (ignore_private == false || match.private == false)
+          rules << entry_to_rule(match, query)
+        end
+        index += 1
+        break if index >= parts.size
+        query = parts[index] + DOT + query
+      end
+      rules
     end
+    private :select
     # Gets the default rule.
     #
@@ -282,5 +225,21 @@ module PublicSuffix
       PublicSuffix::Rule.default
     end
+    protected
+    attr_reader :rules
+    private
+    def entry_to_rule(entry, value)
+      entry.type.new(value: value, length: entry.length, private: entry.private)
+    end
+    def rule_to_entry(rule)
+      Rule::Entry.new(rule.class, rule.length, rule.private)
+    end
   end
 end

data/lib/public_suffix/rule.rb CHANGED

@@ -19,6 +19,9 @@ module PublicSuffix
   #
   module Rule
+    # @api internal
+    Entry = Struct.new(:type, :length, :private)
     # = Abstract rule class
     #
     # This represent the base class for a Rule definition
@@ -99,16 +102,28 @@ module PublicSuffix
       # @return [String] the rule definition
       attr_reader :value
+      # @return [Integer] the length of the rule
+      attr_reader :length
       # @return [Boolean] true if the rule is a private domain
       attr_reader :private
-      # Initializes a new rule with name and value.
-      # If value is +nil+, name also becomes the value for this rule.
+      # Initializes a new rule from the content.
+      #
+      # @param  content [String] the content of the rule
+      # @param  private [Boolean]
+      def self.build(content, private: false)
+        new(value: content, private: private)
+      end
+      # Initializes a new rule.
       #
-      # @param value [String] the value of the rule
-      def initialize(value, private: false)
+      # @param  value [String]
+      # @param  private [Boolean]
+      def initialize(value:, length: nil, private: false)
         @value    = value.to_s
+        @length   = length || @value.count(DOT) + 1
         @private  = private
       end
@@ -137,12 +152,12 @@ module PublicSuffix
       # @see https://publicsuffix.org/list/
       #
       # @example
-      #   Rule.factory("com").match?("example.com")
+      #   PublicSuffix::Rule.factory("com").match?("example.com")
       #   # => true
-      #   Rule.factory("com").match?("example.net")
+      #   PublicSuffix::Rule.factory("com").match?("example.net")
       #   # => false
       #
-      # @param  name [String, #to_s] The domain name to check.
+      # @param  name [String] the domain name to check
       # @return [Boolean]
       def match?(name)
         # Note: it works because of the assumption there are no
@@ -150,7 +165,7 @@ module PublicSuffix
         # we need to properly walk the input and skip parts according
         # to wildcard component.
         diff = name.chomp(value)
-        diff.empty? || diff[-1] == "."
+        diff.empty? || diff[-1] == DOT
       end
       # @abstract
@@ -158,11 +173,6 @@ module PublicSuffix
         raise NotImplementedError
       end
-      # @abstract
-      def length
-        raise NotImplementedError
-      end
       # @abstract
       # @param  [String, #to_s] name The domain name to decompose
       # @return [Array<String, nil>]
@@ -200,27 +210,26 @@ module PublicSuffix
         @value.split(DOT)
       end
-      # Gets the length of this rule for comparison,
-      # represented by the number of dot-separated parts in the rule.
-      #
-      # @return [Integer] The length of the rule.
-      def length
-        @length ||= parts.length
-      end
     end
     # Wildcard represents a wildcard rule (e.g. *.co.uk).
     class Wildcard < Base
-      # Initializes a new rule from +definition+.
+      # Initializes a new rule from the content.
       #
-      # The wildcard "*" is removed from the value, as it's common
-      # for each wildcard rule.
+      # @param  content [String] the content of the rule
+      # @param  private [Boolean]
+      def self.build(content, private: false)
+        new(value: content.to_s[2..-1], private: private)
+      end
+      # Initializes a new rule.
       #
-      # @param definition [String] the rule as defined in the PSL
-      def initialize(definition, private: false)
-        super(definition.to_s[2..-1], private: private)
+      # @param  value [String]
+      # @param  private [Boolean]
+      def initialize(value:, length: nil, private: false)
+        super(value: value, length: length, private: private)
+        length or @length += 1 # * counts as 1
       end
       # Gets the original rule definition.
@@ -248,28 +257,17 @@ module PublicSuffix
         @value.split(DOT)
       end
-      # Gets the length of this rule for comparison,
-      # represented by the number of dot-separated parts in the rule
-      # plus 1 for the *.
-      #
-      # @return [Integer] The length of the rule.
-      def length
-        @length ||= parts.length + 1 # * counts as 1
-      end
     end
     # Exception represents an exception rule (e.g. !parliament.uk).
     class Exception < Base
-      # Initializes a new rule from +definition+.
-      #
-      # The bang ! is removed from the value, as it's common
-      # for each wildcard rule.
+      # Initializes a new rule from the content.
       #
-      # @param definition [String] the rule as defined in the PSL
-      def initialize(definition, private: false)
-        super(definition.to_s[1..-1], private: private)
+      # @param  content [String] the content of the rule
+      # @param  private [Boolean]
+      def self.build(content, private: false)
+        new(value: content.to_s[1..-1], private: private)
       end
       # Gets the original rule definition.
@@ -302,14 +300,6 @@ module PublicSuffix
         @value.split(DOT)[1..-1]
       end
-      # Gets the length of this rule for comparison,
-      # represented by the number of dot-separated parts in the rule.
-      #
-      # @return [Integer] The length of the rule.
-      def length
-        @length ||= parts.length
-      end
     end
@@ -339,7 +329,7 @@ module PublicSuffix
         Exception
       else
         Normal
-      end.new(content, private: private)
+      end.build(content, private: private)
     end
     # The default rule to use if no rule match.