twitter_cldr 2.4.3 → 3.0.0.beta1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e7f44e818f4e5166f45c42579d91ecc27f28a3d5
4
- data.tar.gz: ae6815b54c99d68d8b1cf1fec574d5f0ae09097c
3
+ metadata.gz: 937a3b56e72068594f42c3c09d38da78c33e2a2c
4
+ data.tar.gz: d2f86005a0d9c50ba0ac90f4553e779544bd4ddf
5
5
  SHA512:
6
- metadata.gz: 8573dcbf880ff7da628f00a0cfbb03e60f358cb9ee1aec72edf2ec5a0e2bf30a259355a7827fb48ca6591faecb114e0e0b4363a2928e879b22cf26ac3a9028f7
7
- data.tar.gz: c2ccdcac7d53d42ff5901f832578e42ac7ba31bd6ff47ba26ac8939d68938532427cd442ca799fdce72aaaa5fff42602b4d78d1371a7570ca7990b19045c4c38
6
+ metadata.gz: eab5fae457ea2a42ac4a9943763eadda90c9505874f454a6fcf35b94af94f11aa09197d087965e3b06f1e1dffe7d389a59fb3c09c9792369d004ac35590d132d
7
+ data.tar.gz: 803107aec40257f79a901a02e1252e58b80f539590053641b579442d142d82a5ab78b4a0f6a0e298d4c2cf3512c29ac2dced848410b5c4ae0d6ef4a33d4ec0eb
data/History.txt CHANGED
@@ -1,6 +1,7 @@
1
- == 2.4.3
1
+ == 3.0.0
2
2
 
3
- * Fixing abbreviated timespan formats for en-GB.
3
+ * Adding maximum_level option to SortKeyBuilder to limit the size of collation sort keys (@jrochkind).
4
+ * Significant performance enhancements for normalization, estimated ~70% speed improvement.
4
5
 
5
6
  == 2.4.2
6
7
 
data/README.md CHANGED
@@ -86,7 +86,7 @@ In addition to formatting regular decimals, TwitterCLDR supports short and long
86
86
 
87
87
  ### Dates and Times
88
88
 
89
- `Date`, `Time`, and `DateTime` objects are supported:
89
+ `Time` and `DateTime` objects are supported. `Date` objects are supported transitively (via conversion with `to_date`):
90
90
 
91
91
  ```ruby
92
92
  DateTime.now.localize(:es).to_full_s # "lunes, 12 de diciembre de 2011 21:44:57 UTC -08:00"
@@ -94,15 +94,15 @@ DateTime.now.localize(:es).to_long_s # "12 de diciembre de 2011 21
94
94
  DateTime.now.localize(:es).to_medium_s # "12/12/2011 21:44:57"
95
95
  DateTime.now.localize(:es).to_short_s # "12/12/11 21:44"
96
96
 
97
- Date.today.localize(:es).to_full_s # "lunes 12 de diciembre de 2011"
98
- Date.today.localize(:es).to_long_s # "12 de diciembre de 2011"
99
- Date.today.localize(:es).to_medium_s # "12/12/2011"
100
- Date.today.localize(:es).to_short_s # "12/12/11"
101
-
102
97
  Time.now.localize(:es).to_full_s # "21:44:57 UTC -0800"
103
98
  Time.now.localize(:es).to_long_s # "21:44:57 UTC"
104
99
  Time.now.localize(:es).to_medium_s # "21:44:57"
105
100
  Time.now.localize(:es).to_short_s # "21:44"
101
+
102
+ DateTime.now.localize(:es).to_date.to_full_s # "lunes 12 de diciembre de 2011"
103
+ DateTime.now.localize(:es).to_date.to_long_s # "12 de diciembre de 2011"
104
+ DateTime.now.localize(:es).to_date.to_medium_s # "12/12/2011"
105
+ DateTime.now.localize(:es).to_date.to_short_s # "12/12/11"
106
106
  ```
107
107
 
108
108
  The default CLDR data set only includes 4 date formats, full, long, medium, and short. See below for a list of additional formats.
data/Rakefile CHANGED
@@ -133,4 +133,12 @@ namespace :update do
133
133
  task :canonical_compositions do
134
134
  TwitterCldr::Resources::CanonicalCompositionsUpdater.new('./resources/unicode_data').update
135
135
  end
136
+
137
+ desc 'Import normalization quick check data'
138
+ task :normalization_quick_check do
139
+ TwitterCldr::Resources::NormalizationQuickCheckImporter.new(
140
+ './vendor',
141
+ './resources/unicode_data'
142
+ ).import
143
+ end
136
144
  end
@@ -32,8 +32,10 @@ module TwitterCldr
32
32
  string_a == string_b ? 0 : get_sort_key(string_a) <=> get_sort_key(string_b)
33
33
  end
34
34
 
35
- def get_sort_key(string_or_code_points)
36
- TwitterCldr::Collation::SortKeyBuilder.build(get_collation_elements(string_or_code_points), @options[:case_first])
35
+ # Second arg options, supports an option :maximum_level, to
36
+ # pass on to SortKeyBuilder :maximum_level.
37
+ def get_sort_key(string_or_code_points, method_options = {})
38
+ TwitterCldr::Collation::SortKeyBuilder.build(get_collation_elements(string_or_code_points), :case_first => @options[:case_first], :maximum_level => method_options[:maximum_level])
37
39
  end
38
40
 
39
41
  def get_collation_elements(string_or_code_points)
@@ -17,7 +17,8 @@ module TwitterCldr
17
17
 
18
18
  LEVEL_SEPARATOR = 1 # separate levels in a sort key '01' bytes
19
19
 
20
- VALID_CASE_FIRST_OPTIONS = [nil, :lower, :upper]
20
+ VALID_CASE_FIRST_OPTIONS = [nil, :lower, :upper]
21
+ VALID_MAXIMUM_LEVEL_OPTIONS = [nil, 1, 2, 3]
21
22
 
22
23
  attr_reader :collation_elements, :case_first
23
24
 
@@ -26,25 +27,36 @@ module TwitterCldr
26
27
  # Arguments:
27
28
  #
28
29
  # collation_elements - an array of collation elements, represented as arrays of integer weights.
29
- # case_first - case-first sorting order setting.
30
+ # options - hash of options:
31
+ # case_first - optional case-first sorting order setting: :upper, :lower, nil (discard case bits).
32
+ #   maximum_level - only append weights up to the maximum level specified (1, 2, or 3), can be useful for searching/matching applications
30
33
  #
31
34
  # An instance of the class is created only to prevent passing of @collation_elements and @bytes_array from one
32
35
  # method into another while forming the sort key.
33
36
  #
34
- def self.build(collation_elements, case_first = nil)
35
- new(collation_elements, case_first).bytes_array
37
+ def self.build(collation_elements, options = nil)
38
+ new(collation_elements, options).bytes_array
36
39
  end
37
40
 
38
41
  # Arguments:
39
42
  #
40
43
  # collation_elements - an array of collation elements, represented as arrays of integer weights.
41
- # case_first - optional case-first sorting order setting: :upper, :lower, nil (discard case bits).
44
+ # options - hash of options:
45
+ # case_first - optional case-first sorting order setting: :upper, :lower, nil (discard case bits).
46
+ #   maximum_level - only append weights up to the maximum level specified (1, 2, or 3), can be useful for searching/matching applications
42
47
  #
43
- def initialize(collation_elements, case_first = nil)
48
+ def initialize(collation_elements, options = {})
49
+ raise ArgumentError, "second argument should be an options hash, not `#{options}`. Do you mean `:case_first => #{options}`?" unless options.kind_of? Hash
50
+
51
+ case_first = options[:case_first]
44
52
  raise ArgumentError, "invalid case-first options '#{case_first.inspect}'" unless VALID_CASE_FIRST_OPTIONS.include?(case_first)
45
53
 
54
+ maximum_level = options[:maximum_level]
55
+ raise ArgumentError, "invalid maximum_level option '#{maximum_level.inspect}'" unless VALID_MAXIMUM_LEVEL_OPTIONS.include?(maximum_level)
56
+
46
57
  @collation_elements = collation_elements
47
58
  @case_first = case_first
59
+ @maximum_level = maximum_level
48
60
 
49
61
  init_tertiary_constants
50
62
  end
@@ -59,8 +71,8 @@ module TwitterCldr
59
71
  @bytes_array = []
60
72
 
61
73
  append_primary_bytes
62
- append_secondary_bytes
63
- append_tertiary_bytes
74
+ append_secondary_bytes unless @maximum_level && (@maximum_level < 2)
75
+ append_tertiary_bytes unless @maximum_level && (@maximum_level < 3)
64
76
 
65
77
  @bytes_array
66
78
  end
@@ -5,12 +5,13 @@
5
5
 
6
6
  module TwitterCldr
7
7
  module Normalization
8
- autoload :Base, 'twitter_cldr/normalization/base'
9
- autoload :Hangul, 'twitter_cldr/normalization/hangul'
10
- autoload :NFC, 'twitter_cldr/normalization/nfc'
11
- autoload :NFD, 'twitter_cldr/normalization/nfd'
12
- autoload :NFKC, 'twitter_cldr/normalization/nfkc'
13
- autoload :NFKD, 'twitter_cldr/normalization/nfkd'
8
+ autoload :Base, 'twitter_cldr/normalization/base'
9
+ autoload :Hangul, 'twitter_cldr/normalization/hangul'
10
+ autoload :QuickCheck, 'twitter_cldr/normalization/quick_check'
11
+ autoload :NFC, 'twitter_cldr/normalization/nfc'
12
+ autoload :NFD, 'twitter_cldr/normalization/nfd'
13
+ autoload :NFKC, 'twitter_cldr/normalization/nfkc'
14
+ autoload :NFKD, 'twitter_cldr/normalization/nfkd'
14
15
 
15
16
  VALID_NORMALIZERS = [:NFD, :NFKD, :NFC, :NFKC]
16
17
  DEFAULT_NORMALIZER = :NFD
@@ -3,6 +3,8 @@
3
3
  # Copyright 2012 Twitter, Inc
4
4
  # http://www.apache.org/licenses/LICENSE-2.0
5
5
 
6
+ require 'hamster'
7
+
6
8
  module TwitterCldr
7
9
  module Normalization
8
10
  class Base
@@ -16,11 +18,18 @@ module TwitterCldr
16
18
  end
17
19
 
18
20
  def combining_class_for(code_point)
19
- TwitterCldr::Shared::CodePoint.find(code_point).combining_class.to_i
21
+ combining_class_cache[code_point] ||=
22
+ TwitterCldr::Shared::CodePoint.find(code_point).combining_class.to_i
20
23
  rescue NoMethodError
21
24
  0
22
25
  end
23
26
 
27
+ protected
28
+
29
+ def combining_class_cache
30
+ @combining_class_cache ||= {}
31
+ end
32
+
24
33
  end
25
34
 
26
35
  end
@@ -9,6 +9,23 @@ module TwitterCldr
9
9
 
10
10
  class << self
11
11
 
12
+ SBASE = 0xAC00
13
+ LBASE = 0x1100
14
+ VBASE = 0x1161
15
+ TBASE = 0x11A7
16
+
17
+ LCOUNT = 19
18
+ VCOUNT = 21
19
+ TCOUNT = 28
20
+
21
+ NCOUNT = VCOUNT * TCOUNT # 588
22
+ SCOUNT = LCOUNT * NCOUNT # 11172
23
+
24
+ LLIMIT = LBASE + LCOUNT # 0x1113 = 4371
25
+ VLIMIT = VBASE + VCOUNT # 0x1176 = 4470
26
+ TLIMIT = TBASE + TCOUNT # 0x11C3 = 4547
27
+ SLIMIT = SBASE + SCOUNT # 0xD7A4 = 55204
28
+
12
29
  # Special composition for Hangul syllables. Documented in Section 3.12 at
13
30
  # http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf
14
31
  #
@@ -24,45 +41,39 @@ module TwitterCldr
24
41
  # Also see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm#Hangul_Implicit_CEs
25
42
  #
26
43
  def decompose(code_point)
27
- l = code_point - SBASE
44
+ decomposition_cache[code_point] ||= begin
45
+ l = code_point - SBASE
28
46
 
29
- t = l % TCOUNT
30
- l /= TCOUNT
31
- v = l % VCOUNT
32
- l /= VCOUNT
47
+ t = l % TCOUNT
48
+ l /= TCOUNT
49
+ v = l % VCOUNT
50
+ l /= VCOUNT
33
51
 
34
- result = []
52
+ result = []
35
53
 
36
- result << LBASE + l
37
- result << VBASE + v
38
- result << TBASE + t if t > 0
54
+ result << LBASE + l
55
+ result << VBASE + v
56
+ result << TBASE + t if t > 0
39
57
 
40
- result
58
+ result
59
+ end
41
60
  end
42
61
 
43
62
  def hangul_syllable?(code_point)
44
63
  (SBASE...SLIMIT).include?(code_point)
45
64
  end
46
65
 
47
- SBASE = 0xAC00
48
- LBASE = 0x1100
49
- VBASE = 0x1161
50
- TBASE = 0x11A7
66
+ private
51
67
 
52
- LCOUNT = 19
53
- VCOUNT = 21
54
- TCOUNT = 28
55
-
56
- NCOUNT = VCOUNT * TCOUNT # 588
57
- SCOUNT = LCOUNT * NCOUNT # 11172
68
+ def syllable_cache
69
+ @syllable_cache ||= {}
70
+ end
58
71
 
59
- LLIMIT = LBASE + LCOUNT # 0x1113 = 4371
60
- VLIMIT = VBASE + VCOUNT # 0x1176 = 4470
61
- TLIMIT = TBASE + TCOUNT # 0x11C3 = 4547
62
- SLIMIT = SBASE + SCOUNT # 0xD7A4 = 55204
72
+ def decomposition_cache
73
+ @decomposition_cache ||= {}
74
+ end
63
75
 
64
76
  end
65
-
66
77
  end
67
78
  end
68
79
  end
@@ -13,6 +13,12 @@ module TwitterCldr
13
13
 
14
14
  class << self
15
15
 
16
+ VALID_HANGUL_SEQUENCES = [
17
+ [0, :lparts],
18
+ [1, :vparts],
19
+ [2, :tparts]
20
+ ]
21
+
16
22
  def normalize_code_points(code_points)
17
23
  compose(TwitterCldr::Normalization::NFKD.normalize_code_points(code_points))
18
24
  end
@@ -49,7 +55,7 @@ module TwitterCldr
49
55
  end
50
56
 
51
57
  def valid_hangul_sequence?(buffer_size, hangul_type)
52
- [[0, :lparts], [1, :vparts], [2, :tparts]].include?([buffer_size, hangul_type])
58
+ VALID_HANGUL_SEQUENCES.include?([buffer_size, hangul_type])
53
59
  end
54
60
 
55
61
  def compose_hangul(code_points)
@@ -16,7 +16,6 @@ module TwitterCldr
16
16
  #
17
17
  class NFKD < Base
18
18
 
19
-
20
19
  class << self
21
20
 
22
21
  def normalize_code_points(code_points)
@@ -26,14 +25,19 @@ module TwitterCldr
26
25
  protected
27
26
 
28
27
  def decompose(code_points)
29
- code_points.map { |code_point| decompose_recursively(code_point) }.flatten
28
+ code_points.inject(Hamster.list) do |ret, code_point|
29
+ decompose_recursively(code_point).each do |decomp_cp|
30
+ ret = ret.cons(decomp_cp)
31
+ end
32
+ ret
33
+ end.reverse.to_a
30
34
  end
31
35
 
32
36
  # Recursively decomposes a given code point with the values in its Decomposition Mapping property.
33
37
  #
34
38
  def decompose_recursively(code_point)
35
39
  unicode_data = TwitterCldr::Shared::CodePoint.find(code_point)
36
- return code_point unless unicode_data
40
+ return [code_point] unless unicode_data
37
41
 
38
42
  if unicode_data.hangul_type == :compositions
39
43
  decompose_hangul(code_point)
@@ -48,7 +52,7 @@ module TwitterCldr
48
52
  if decompose?(unicode_data)
49
53
  unicode_data.decomposition.map { |code_point| decompose_recursively(code_point) }.flatten
50
54
  else
51
- unicode_data.code_point
55
+ [unicode_data.code_point]
52
56
  end
53
57
  end
54
58
 
@@ -82,7 +86,6 @@ module TwitterCldr
82
86
  end
83
87
 
84
88
  result.concat(stable_sort(accum)) unless accum.empty?
85
-
86
89
  result.map { |cp_with_cc| cp_with_cc[0] }
87
90
  end
88
91
 
@@ -0,0 +1,41 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ module TwitterCldr
7
+ module Normalization
8
+
9
+ # This class isn't used anywhere because it was found that it negatively
10
+ # affects normalization performance.
11
+ module QuickCheck
12
+
13
+ class << self
14
+
15
+ def requires_normalization?(code_point, algorithm)
16
+ key = TwitterCldr::Utils.compute_cache_key(code_point, algorithm)
17
+ requires_cache[key] = if requires_cache[key].nil?
18
+ resource_for(algorithm).any? do |range|
19
+ range.include?(code_point)
20
+ end
21
+ else
22
+ requires_cache[key]
23
+ end
24
+ end
25
+
26
+ protected
27
+
28
+ def requires_cache
29
+ @requires_cache ||= {}
30
+ end
31
+
32
+ def resource_for(algorithm)
33
+ @resources ||= {}
34
+ @resources[algorithm] ||= TwitterCldr.get_resource("unicode_data", "#{algorithm.to_s.downcase}_quick_check")
35
+ end
36
+
37
+ end
38
+
39
+ end
40
+ end
41
+ end
@@ -18,5 +18,6 @@ module TwitterCldr
18
18
  autoload :TailoringImporter, 'twitter_cldr/resources/tailoring_importer'
19
19
  autoload :UnicodeDataImporter, 'twitter_cldr/resources/unicode_data_importer'
20
20
  autoload :BidiTestImporter, 'twitter_cldr/resources/bidi_test_importer'
21
+ autoload :NormalizationQuickCheckImporter, 'twitter_cldr/resources/normalization_quick_check_importer'
21
22
  end
22
23
  end
@@ -0,0 +1,86 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'twitter_cldr/resources/download'
7
+
8
+ module TwitterCldr
9
+ module Resources
10
+
11
+ class NormalizationQuickCheckImporter
12
+
13
+ PROPS_FILE_URL = "ftp://ftp.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt"
14
+
15
+ # Arguments:
16
+ #
17
+ # input_path - path to a directory containing DerivedNormalizationProps.txt
18
+ # output_path - output directory for imported YAML files
19
+ #
20
+ def initialize(input_path, output_path)
21
+ @input_path = input_path
22
+ @output_path = output_path
23
+ end
24
+
25
+ def import
26
+ parse_props_file.each_pair do |algorithm, code_point_list|
27
+ File.open(File.join(@output_path, "#{algorithm.downcase}_quick_check.yml"), "w+") do |f|
28
+ f.write(YAML.dump(rangify(partition_prop_list(code_point_list))))
29
+ end
30
+ end
31
+ end
32
+
33
+ private
34
+
35
+ def rangify(lists)
36
+ lists.map { |list| (list.first..list.last) }
37
+ end
38
+
39
+ def partition_prop_list(list)
40
+ last_item = 0
41
+ list.inject([]) do |ret, item|
42
+ (item - last_item == 1) ? ret[-1] << item : ret << [item]
43
+ last_item = item
44
+ ret
45
+ end
46
+ end
47
+
48
+ def parse_props_file
49
+ check_table = {}
50
+ cur_type = nil
51
+
52
+ File.open(props_file) do |input|
53
+ input.each_line do |line|
54
+ cur_type = nil if line =~ /=Maybe/
55
+ type = line.scan(/#\s*Property:\s*(NF[KDC]+)_Quick_Check/).flatten
56
+
57
+ if type.size > 0
58
+ cur_type = type.first
59
+ check_table[cur_type] = []
60
+ end
61
+
62
+ if check_table.size > 0 && line[0...1] != "#" && !line.strip.empty? && cur_type
63
+ start, finish = line.scan(/(\h+(\.\.\h+)?)/).first.first.split("..").map { |num| num.to_i(16) }
64
+
65
+ if finish
66
+ check_table[cur_type] += (start..finish).to_a
67
+ else
68
+ check_table[cur_type] << start
69
+ end
70
+ end
71
+
72
+ break if line =~ /={5,}/ && check_table.size >= 4 && check_table.all? { |key, val| val.size > 0 }
73
+ end
74
+ end
75
+
76
+ check_table
77
+ end
78
+
79
+ def props_file
80
+ TwitterCldr::Resources.download_if_necessary(File.join(@input_path, 'DerivedNormalizationProps.txt'), PROPS_FILE_URL)
81
+ end
82
+
83
+ end
84
+
85
+ end
86
+ end