RubyGems - twitter_cldr - Versions diffs - 5.2.0 → 5.3.0 - Mend

twitter_cldr 5.2.0 → 5.3.0

Files changed (110)

checksums.yaml +4 -4
data/Gemfile +0 -4
data/Rakefile +19 -8
data/lib/twitter_cldr/normalization.rb +18 -5
data/lib/twitter_cldr/resources.rb +3 -1
data/lib/twitter_cldr/resources/import_resolver.rb +11 -3
data/lib/twitter_cldr/resources/loader.rb +22 -1
data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -9
data/lib/twitter_cldr/resources/postal_codes_importer.rb +19 -23
data/lib/twitter_cldr/resources/segment_dictionaries_importer.rb +75 -0
data/lib/twitter_cldr/resources/segment_tests_importer.rb +130 -13
data/lib/twitter_cldr/segmentation.rb +25 -10
data/lib/twitter_cldr/segmentation/brahmic_break_engine.rb +200 -0
data/lib/twitter_cldr/segmentation/break_iterator.rb +22 -22
data/lib/twitter_cldr/segmentation/burmese_break_engine.rb +83 -0
data/lib/twitter_cldr/segmentation/category_table.rb +5 -1
data/lib/twitter_cldr/segmentation/cj_break_engine.rb +163 -0
data/lib/twitter_cldr/segmentation/cursor.rb +1 -1
data/lib/twitter_cldr/segmentation/dictionary.rb +84 -0
data/lib/twitter_cldr/segmentation/dictionary_break_engine.rb +34 -0
data/lib/twitter_cldr/segmentation/khmer_break_engine.rb +83 -0
data/lib/twitter_cldr/segmentation/korean_break_engine.rb +30 -0
data/lib/twitter_cldr/segmentation/lao_break_engine.rb +85 -0
data/lib/twitter_cldr/segmentation/line_iterator.rb +23 -0
data/lib/twitter_cldr/segmentation/possible_word.rb +74 -0
data/lib/twitter_cldr/segmentation/possible_word_list.rb +23 -0
data/lib/twitter_cldr/segmentation/rule_set.rb +3 -12
data/lib/twitter_cldr/segmentation/segment_iterator.rb +40 -0
data/lib/twitter_cldr/segmentation/state_machine.rb +2 -8
data/lib/twitter_cldr/segmentation/thai_break_engine.rb +141 -0
data/lib/twitter_cldr/segmentation/unhandled_break_engine.rb +21 -0
data/lib/twitter_cldr/segmentation/word_iterator.rb +170 -0
data/lib/twitter_cldr/shared.rb +1 -0
data/lib/twitter_cldr/shared/caser.rb +3 -3
data/lib/twitter_cldr/shared/unicode_set.rb +77 -0
data/lib/twitter_cldr/utils/range_set.rb +10 -1
data/lib/twitter_cldr/version.rb +1 -1
data/resources/collation/tailoring/km.yml +82 -0
data/resources/collation/tailoring/lo.yml +4 -0
data/resources/collation/tailoring/my.yml +940 -0
data/resources/collation/tries/km.dump +0 -0
data/resources/collation/tries/lo.dump +0 -0
data/resources/collation/tries/my.dump +0 -0
data/resources/locales/km/calendars.yml +373 -0
data/resources/locales/km/currencies.yml +654 -0
data/resources/locales/km/day_periods.yml +96 -0
data/resources/locales/km/fields.yml +495 -0
data/resources/locales/km/languages.yml +397 -0
data/resources/locales/km/layout.yml +5 -0
data/resources/locales/km/lists.yml +37 -0
data/resources/locales/km/numbers.yml +402 -0
data/resources/locales/km/plural_rules.yml +6 -0
data/resources/locales/km/plurals.yml +12 -0
data/resources/locales/km/rbnf.yml +131 -0
data/resources/locales/km/territories.yml +267 -0
data/resources/locales/km/timezones.yml +1471 -0
data/resources/locales/km/units.yml +721 -0
data/resources/locales/lo/calendars.yml +368 -0
data/resources/locales/lo/currencies.yml +918 -0
data/resources/locales/lo/day_periods.yml +96 -0
data/resources/locales/lo/fields.yml +437 -0
data/resources/locales/lo/languages.yml +529 -0
data/resources/locales/lo/layout.yml +5 -0
data/resources/locales/lo/lists.yml +42 -0
data/resources/locales/lo/numbers.yml +476 -0
data/resources/locales/lo/plural_rules.yml +7 -0
data/resources/locales/lo/plurals.yml +14 -0
data/resources/locales/lo/rbnf.yml +119 -0
data/resources/locales/lo/territories.yml +265 -0
data/resources/locales/lo/timezones.yml +1513 -0
data/resources/locales/lo/units.yml +750 -0
data/resources/locales/my/calendars.yml +374 -0
data/resources/locales/my/currencies.yml +697 -0
data/resources/locales/my/day_periods.yml +96 -0
data/resources/locales/my/fields.yml +459 -0
data/resources/locales/my/languages.yml +420 -0
data/resources/locales/my/layout.yml +5 -0
data/resources/locales/my/lists.yml +43 -0
data/resources/locales/my/numbers.yml +417 -0
data/resources/locales/my/plural_rules.yml +6 -0
data/resources/locales/my/plurals.yml +12 -0
data/resources/locales/my/rbnf.yml +145 -0
data/resources/locales/my/territories.yml +265 -0
data/resources/locales/my/timezones.yml +1479 -0
data/resources/locales/my/units.yml +759 -0
data/resources/locales/th/plurals.yml +1 -1
data/resources/shared/segments/dictionaries/burmesedict.dump +0 -0
data/resources/shared/segments/dictionaries/cjdict.dump +0 -0
data/resources/shared/segments/dictionaries/khmerdict.dump +0 -0
data/resources/shared/segments/dictionaries/laodict.dump +0 -0
data/resources/shared/segments/dictionaries/thaidict.dump +0 -0
data/resources/shared/segments/tests/dictionary_tests/combined.yml +1253 -0
data/resources/shared/segments/tests/dictionary_tests/km.yml +204 -0
data/resources/shared/segments/tests/dictionary_tests/ko.yml +171 -0
data/resources/shared/segments/tests/dictionary_tests/lo.yml +236 -0
data/resources/shared/segments/tests/dictionary_tests/my.yml +249 -0
data/resources/shared/segments/tests/dictionary_tests/th.yml +201 -0
data/resources/shared/segments/tests/dictionary_tests/zh.yml +206 -0
data/resources/shared/segments/tests/line_break_test.yml +68 -68
data/resources/shared/segments/tests/sentence_break_test.yml +52 -52
data/resources/supported_locales.yml +3 -0
data/spec/formatters/numbers/rbnf/locales/km/rbnf_test.yml +706 -0
data/spec/formatters/numbers/rbnf/locales/lo/rbnf_test.yml +706 -0
data/spec/formatters/numbers/rbnf/locales/my/rbnf_test.yml +706 -0
data/spec/segmentation/dictionary_break_spec.rb +42 -0
data/spec/segmentation/rule_set_spec.rb +3 -1
data/spec/timezones/tests/km.yml +12475 -0
data/spec/timezones/tests/lo.yml +12475 -0
data/spec/timezones/tests/my.yml +12475 -0
metadata +87 -3

data/lib/twitter_cldr/segmentation/cursor.rb CHANGED

@@ -27,7 +27,7 @@ module TwitterCldr
         position >= text.size
       end
-      def codepoint(pos = position)
+      def codepoint(pos = @position)
         codepoints[pos]
       end

data/lib/twitter_cldr/segmentation/dictionary.rb ADDED

@@ -0,0 +1,84 @@
+# encoding: UTF-8
+# Copyright 2012 Twitter, Inc
+# http://www.apache.org/licenses/LICENSE-2.0
+module TwitterCldr
+  module Segmentation
+    # Wraps a trie of dictionary words and reports every word that starts at a
+    # cursor's current position. Instances are built lazily from bundled
+    # resources by the class-level accessors and cached per name.
+    class Dictionary
+      class << self
+        # Convenience accessors, one per bundled dictionary. Each simply
+        # delegates to .get with the matching name.
+        def burmese
+          get('burmese')
+        end
+        def cj
+          get('cj')
+        end
+        def khmer
+          get('khmer')
+        end
+        def lao
+          get('lao')
+        end
+        def thai
+          get('thai')
+        end
+        # Returns the dictionary called +name+. On first use the resource
+        # identified by 'shared', 'segments', 'dictionaries' and
+        # "<name>dict.dump" is fetched through TwitterCldr.get_resource and
+        # wrapped in a new instance; later calls reuse the cached instance.
+        def get(name)
+          dictionary_cache[name] ||= begin
+            resource = TwitterCldr.get_resource(
+              'shared', 'segments', 'dictionaries', "#{name}dict.dump"
+            )
+            new(resource)
+          end
+        end
+        private
+        # Memo of already-built dictionaries, keyed by name.
+        def dictionary_cache
+          @dictionary_cache ||= {}
+        end
+      end
+      attr_reader :trie
+      # trie - object answering +root+; its nodes must answer +child+,
+      #        +has_value?+ and +value+ (these are the calls #matches makes).
+      def initialize(trie)
+        @trie = trie
+      end
+      # Walks the trie one codepoint at a time, starting at the cursor's
+      # current position, and records each node that carries a value.
+      #
+      # cursor            - answers +length+, +position+ and +codepoint(pos)+.
+      # max_search_length - maximum number of codepoints to feed to the trie.
+      # limit             - maximum number of matches to record.
+      #
+      # Returns [count, values, lengths, num_chars] where
+      #   count     - number of matches recorded (never more than +limit+),
+      #   values    - the values stored at the matching trie nodes,
+      #   lengths   - the length, in codepoints, of each match, in increasing
+      #               order,
+      #   num_chars - how many codepoints were fed to the trie before the walk
+      #               stopped (including one that failed to match, if any).
+      #
+      # The cursor itself is not moved.
+      def matches(cursor, max_search_length, limit)
+        # Answer with the same four-element shape as the normal path so that
+        # callers destructuring the result never receive nil for the lengths
+        # or the prefix count. The leading count is still 0.
+        return [0, [], [], 0] if cursor.length == 0
+        count = 0
+        num_chars = 1
+        current = trie.root.child(cursor.codepoint)
+        values = []
+        lengths = []
+        until current.nil?
+          if current.has_value? && count < limit
+            values << current.value
+            lengths << num_chars
+            count += 1
+          end
+          break if num_chars >= max_search_length
+          # Step to the next codepoint; a nil child ends the walk.
+          current = current.child(
+            cursor.codepoint(cursor.position + num_chars)
+          )
+          num_chars += 1
+        end
+        [count, values, lengths, num_chars]
+      end
+    end
+  end
+end

data/lib/twitter_cldr/segmentation/dictionary_break_engine.rb ADDED

@@ -0,0 +1,34 @@
+# encoding: UTF-8
+# Copyright 2012 Twitter, Inc
+# http://www.apache.org/licenses/LICENSE-2.0
+module TwitterCldr
+  module Segmentation
+    # Abstract base class for break engines that rely on a dictionary.
+    # Subclasses must provide #word_set and #divide_up_dictionary_range.
+    class DictionaryBreakEngine
+      # Finds the run of codepoints, beginning at the cursor's position, that
+      # all belong to #word_set, then passes the cursor, the index just past
+      # that run and the caller's block to #divide_up_dictionary_range.
+      #
+      # Returns an Enumerator when called without a block.
+      def each_boundary(cursor, &block)
+        return to_enum(__method__, cursor) unless block_given?
+        stop = cursor.position
+        # The cursor is never advanced while scanning, so its end-of-text
+        # state cannot change: check it once instead of on every iteration.
+        unless cursor.eos?
+          codepoints = cursor.codepoints
+          loop do
+            codepoint = codepoints[stop]
+            # A missing codepoint means the scan ran off the end of the text.
+            # Stop here rather than asking word_set whether it contains nil.
+            break if codepoint.nil? || !word_set.include?(codepoint)
+            stop += 1
+          end
+        end
+        divide_up_dictionary_range(cursor, stop, &block)
+      end
+      # The collection of codepoints this engine handles; only +include?+ is
+      # called on it here. Must be overridden.
+      #
+      # Raises NotImplementedError unless a derived class defines it.
+      def word_set(*args)
+        raise NotImplementedError, "#{__method__} must be defined in derived classes"
+      end
+      private
+      # Splits the range found by #each_boundary; called with the cursor, the
+      # end index of the range and the caller's block. Must be overridden.
+      #
+      # Raises NotImplementedError unless a derived class defines it.
+      def divide_up_dictionary_range(*args)
+        raise NotImplementedError, "#{__method__} must be defined in derived classes"
+      end
+    end
+  end
+end

data/lib/twitter_cldr/segmentation/khmer_break_engine.rb ADDED

@@ -0,0 +1,83 @@
+# encoding: UTF-8
+# Copyright 2012 Twitter, Inc
+# http://www.apache.org/licenses/LICENSE-2.0
+require 'singleton'
+require 'forwardable'
+module TwitterCldr
+  module Segmentation
+    # Singleton break engine for Khmer text.
+    # See: https://github.com/unicode-org/icu/blob/release-65-1/icu4j/main/classes/core/src/com/ibm/icu/text/KhmerBreakEngine.java
+    class KhmerBreakEngine
+      include Singleton
+      extend Forwardable
+      def_delegators :engine, :each_boundary
+      # The codepoints matched by '[[:Khmer:]&[:Line_Break=SA:]]', converted
+      # with +to_set+ and memoized on the class.
+      def self.word_set
+        return @word_set if @word_set
+        khmer = TwitterCldr::Shared::UnicodeSet.new
+        khmer.apply_pattern('[[:Khmer:]&[:Line_Break=SA:]]')
+        @word_set = khmer.to_set
+      end
+      private
+      # Khmer is a Brahmic script and all Brahmic scripts share the same break
+      # logic, so this class composes a BrahmicBreakEngine configured for
+      # Khmer rather than implementing the algorithm again.
+      def engine
+        @engine ||= begin
+          options = {
+            # Number of consecutive words considered "good enough"
+            lookahead: 3,
+            # A non-word is never combined with a preceding dictionary word
+            # longer than this
+            root_combine_threshold: 3,
+            # A non-word sharing at least this much prefix with a dictionary
+            # word is never combined with a preceding word
+            prefix_combine_threshold: 3,
+            # Smallest word size
+            min_word: 4,
+            # Smallest number of characters for two words (same as min_word
+            # for Khmer)
+            min_word_span: 4,
+            word_set: self.class.word_set,
+            mark_set: mark_set,
+            end_word_set: end_word_set,
+            begin_word_set: begin_word_set,
+            dictionary: Dictionary.khmer,
+            # Suffix handling is not applicable to Khmer
+            advance_past_suffix: ->(*) { 0 }
+          }
+          BrahmicBreakEngine.new(**options)
+        end
+      end
+      # Set matched by '[[:Khmer:]&[:Line_Break=SA:]&[:M:]]' plus U+0020.
+      def mark_set
+        return @mark_set if @mark_set
+        marks = TwitterCldr::Shared::UnicodeSet.new
+        marks.apply_pattern('[[:Khmer:]&[:Line_Break=SA:]&[:M:]]')
+        marks.add(0x0020)
+        @mark_set = marks
+      end
+      # Everything in the class-level word set except U+17D2.
+      def end_word_set
+        return @end_word_set if @end_word_set
+        enders = TwitterCldr::Shared::UnicodeSet.new
+        enders.add_list(self.class.word_set)
+        # U+17D2 is KHMER SIGN COENG, which combines some characters
+        enders.subtract(0x17D2)
+        @end_word_set = enders
+      end
+      # The range U+1780..U+17B3.
+      def begin_word_set
+        return @begin_word_set if @begin_word_set
+        starters = TwitterCldr::Shared::UnicodeSet.new
+        starters.add_range(0x1780..0x17B3)
+        @begin_word_set = starters
+      end
+    end
+  end
+end

data/lib/twitter_cldr/segmentation/korean_break_engine.rb ADDED

@@ -0,0 +1,30 @@
+# encoding: UTF-8
+# Copyright 2012 Twitter, Inc
+# http://www.apache.org/licenses/LICENSE-2.0
+require 'singleton'
+module TwitterCldr
+  module Segmentation
+    # Singleton break engine for Korean text. All behaviour comes from
+    # CjBreakEngine; this class only supplies the set of codepoints.
+    class KoreanBreakEngine < CjBreakEngine
+      include Singleton
+      # The range U+AC00..U+D7A3, converted with +to_set+ and memoized on
+      # the class.
+      def self.word_set
+        @word_set ||= TwitterCldr::Shared::UnicodeSet.new.tap { |hangul|
+          hangul.add_range(0xAC00..0xD7A3)
+        }.to_set
+      end
+      private
+      # Instance-level accessor that hands back the class-wide set.
+      def word_set
+        self.class.word_set
+      end
+    end
+  end
+end

data/lib/twitter_cldr/segmentation/lao_break_engine.rb ADDED

@@ -0,0 +1,85 @@
+# encoding: UTF-8
+# Copyright 2012 Twitter, Inc
+# http://www.apache.org/licenses/LICENSE-2.0
+require 'singleton'
+require 'forwardable'
+module TwitterCldr
+  module Segmentation
+    # Singleton break engine for Lao text.
+    # See: https://github.com/unicode-org/icu/blob/release-65-1/icu4j/main/classes/core/src/com/ibm/icu/text/LaoBreakEngine.java
+    class LaoBreakEngine
+      include Singleton
+      extend Forwardable
+      def_delegators :engine, :each_boundary
+      # The codepoints matched by '[[:Laoo:]&[:Line_Break=SA:]]', converted
+      # with +to_set+ and memoized on the class.
+      def self.word_set
+        return @word_set if @word_set
+        lao = TwitterCldr::Shared::UnicodeSet.new
+        lao.apply_pattern('[[:Laoo:]&[:Line_Break=SA:]]')
+        @word_set = lao.to_set
+      end
+      private
+      # Lao is a Brahmic script and all Brahmic scripts share the same break
+      # logic, so this class composes a BrahmicBreakEngine configured for Lao
+      # rather than implementing the algorithm again.
+      def engine
+        @engine ||= begin
+          options = {
+            # Number of consecutive words considered "good enough"
+            lookahead: 3,
+            # A non-word is never combined with a preceding dictionary word
+            # longer than this
+            root_combine_threshold: 3,
+            # A non-word sharing at least this much prefix with a dictionary
+            # word is never combined with a preceding word
+            prefix_combine_threshold: 3,
+            # Smallest word size
+            min_word: 2,
+            # Smallest number of characters for two words (same as min_word
+            # for Lao)
+            min_word_span: 2,
+            word_set: self.class.word_set,
+            mark_set: mark_set,
+            end_word_set: end_word_set,
+            begin_word_set: begin_word_set,
+            dictionary: Dictionary.lao,
+            # Suffix handling is not applicable to Lao
+            advance_past_suffix: ->(*) { 0 }
+          }
+          BrahmicBreakEngine.new(**options)
+        end
+      end
+      # Set matched by '[[:Laoo:]&[:Line_Break=SA:]&[:M:]]' plus U+0020.
+      def mark_set
+        return @mark_set if @mark_set
+        marks = TwitterCldr::Shared::UnicodeSet.new
+        marks.apply_pattern('[[:Laoo:]&[:Line_Break=SA:]&[:M:]]')
+        marks.add(0x0020)
+        @mark_set = marks
+      end
+      # Everything in the class-level word set except U+0EC0..U+0EC4.
+      def end_word_set
+        return @end_word_set if @end_word_set
+        enders = TwitterCldr::Shared::UnicodeSet.new
+        enders.add_list(self.class.word_set)
+        # U+0EC0..U+0EC4 are prefix vowels
+        enders.subtract_range(0x0EC0..0x0EC4)
+        @end_word_set = enders
+      end
+      # The three ranges of codepoints added below.
+      def begin_word_set
+        return @begin_word_set if @begin_word_set
+        starters = TwitterCldr::Shared::UnicodeSet.new
+        # basic consonants (including holes for corresponding Thai characters)
+        starters.add_range(0x0E81..0x0EAE)
+        # digraph consonants (no Thai equivalent)
+        starters.add_range(0x0EDC..0x0EDD)
+        # prefix vowels
+        starters.add_range(0x0EC0..0x0EC4)
+        @begin_word_set = starters
+      end
+    end
+  end
+end

data/lib/twitter_cldr/segmentation/line_iterator.rb ADDED

@@ -0,0 +1,23 @@
+# encoding: UTF-8
+# Copyright 2012 Twitter, Inc
+# http://www.apache.org/licenses/LICENSE-2.0
+module TwitterCldr
+  module Segmentation
+    # Segment iterator for line boundaries.
+    class LineIterator < SegmentIterator
+      # Yields each boundary the rule set finds in +str+, or returns an
+      # Enumerator over them when no block is given.
+      def each_boundary(str, &block)
+        if block_given?
+          # Unlike the usual implicit start-of-text boundary, 0 is
+          # deliberately NOT yielded here: the state machine is left to find
+          # the first boundary by itself. This passes nearly all of the
+          # Unicode segmentation tests, so it is taken to be the right
+          # behaviour for the line rules, though the reason is not certain.
+          cursor = create_cursor(str)
+          rule_set.each_boundary(cursor, &block)
+        else
+          to_enum(:each_boundary, str)
+        end
+      end
+    end
+  end
+end

data/lib/twitter_cldr/segmentation/possible_word.rb ADDED

@@ -0,0 +1,74 @@
+# encoding: UTF-8
+# Copyright 2012 Twitter, Inc
+# http://www.apache.org/licenses/LICENSE-2.0
+module TwitterCldr
+  module Segmentation
+    # Holds the dictionary words found at one text position, and tracks
+    # which of those candidates is current and which is marked.
+    class PossibleWord
+      # Upper bound on the candidates kept per position; limited by the
+      # maximum number of dictionary words that form a nested sequence.
+      POSSIBLE_WORD_LIST_MAX = 20
+      def initialize
+        @offset = -1
+        @count = nil
+        @lengths = []
+      end
+      # Looks up the candidates at the cursor's position (only when this
+      # position was not the last one looked up), moves the cursor to just
+      # after the longest candidate, makes that candidate both current and
+      # marked, and returns how many candidates there are.
+      def candidates(cursor, dictionary, end_pos)
+        origin = cursor.position
+        unless origin == @offset
+          @offset = origin
+          @count, _values, @lengths, @prefix = dictionary.matches(
+            cursor, end_pos - origin, POSSIBLE_WORD_LIST_MAX
+          )
+          # The dictionary leaves the text after the longest prefix rather
+          # than the longest word, so rewind when nothing matched.
+          cursor.position = origin if @count <= 0
+        end
+        cursor.position = origin + @lengths[@count - 1] if @count > 0
+        @mark = @current = @count - 1
+        @count
+      end
+      # Moves the cursor to just after the marked candidate and returns that
+      # candidate's length.
+      def accept_marked(cursor)
+        marked_length = @lengths[@mark]
+        cursor.position = @offset + marked_length
+        marked_length
+      end
+      # Steps from the current candidate to the next shorter one. Returns
+      # true and moves the cursor to just after it when one exists; returns
+      # false and leaves everything untouched otherwise.
+      def back_up(cursor)
+        return false unless @current > 0
+        @current -= 1
+        cursor.position = @offset + @lengths[@current]
+        true
+      end
+      # The longest prefix this position shares with a dictionary word, as
+      # reported by the most recent dictionary lookup.
+      def longest_prefix
+        @prefix
+      end
+      # Remembers the current candidate as the preferred one.
+      def mark_current
+        @mark = @current
+      end
+    end
+  end
+end

data/lib/twitter_cldr/segmentation/possible_word_list.rb ADDED

@@ -0,0 +1,23 @@
+# encoding: UTF-8
+# Copyright 2012 Twitter, Inc
+# http://www.apache.org/licenses/LICENSE-2.0
+module TwitterCldr
+  module Segmentation
+    # Fixed-size collection of PossibleWord objects whose index wraps
+    # around, so any integer index maps onto one of the +length+ slots.
+    class PossibleWordList
+      attr_reader :length, :items
+      # Creates +length+ fresh PossibleWord instances.
+      def initialize(length)
+        @length = length
+        @items = Array.new(length) { PossibleWord.new }
+      end
+      # Returns the item in slot +idx+ modulo the list's length.
+      def [](idx)
+        slot = idx % length
+        items[slot]
+      end
+    end
+  end
+end