RubyGems - twitter_cldr - Versions diffs - 5.2.0 → 5.3.0 - Mend

twitter_cldr 5.2.0 → 5.3.0

Files changed (110) hide show

checksums.yaml +4 -4
data/Gemfile +0 -4
data/Rakefile +19 -8
data/lib/twitter_cldr/normalization.rb +18 -5
data/lib/twitter_cldr/resources.rb +3 -1
data/lib/twitter_cldr/resources/import_resolver.rb +11 -3
data/lib/twitter_cldr/resources/loader.rb +22 -1
data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -9
data/lib/twitter_cldr/resources/postal_codes_importer.rb +19 -23
data/lib/twitter_cldr/resources/segment_dictionaries_importer.rb +75 -0
data/lib/twitter_cldr/resources/segment_tests_importer.rb +130 -13
data/lib/twitter_cldr/segmentation.rb +25 -10
data/lib/twitter_cldr/segmentation/brahmic_break_engine.rb +200 -0
data/lib/twitter_cldr/segmentation/break_iterator.rb +22 -22
data/lib/twitter_cldr/segmentation/burmese_break_engine.rb +83 -0
data/lib/twitter_cldr/segmentation/category_table.rb +5 -1
data/lib/twitter_cldr/segmentation/cj_break_engine.rb +163 -0
data/lib/twitter_cldr/segmentation/cursor.rb +1 -1
data/lib/twitter_cldr/segmentation/dictionary.rb +84 -0
data/lib/twitter_cldr/segmentation/dictionary_break_engine.rb +34 -0
data/lib/twitter_cldr/segmentation/khmer_break_engine.rb +83 -0
data/lib/twitter_cldr/segmentation/korean_break_engine.rb +30 -0
data/lib/twitter_cldr/segmentation/lao_break_engine.rb +85 -0
data/lib/twitter_cldr/segmentation/line_iterator.rb +23 -0
data/lib/twitter_cldr/segmentation/possible_word.rb +74 -0
data/lib/twitter_cldr/segmentation/possible_word_list.rb +23 -0
data/lib/twitter_cldr/segmentation/rule_set.rb +3 -12
data/lib/twitter_cldr/segmentation/segment_iterator.rb +40 -0
data/lib/twitter_cldr/segmentation/state_machine.rb +2 -8
data/lib/twitter_cldr/segmentation/thai_break_engine.rb +141 -0
data/lib/twitter_cldr/segmentation/unhandled_break_engine.rb +21 -0
data/lib/twitter_cldr/segmentation/word_iterator.rb +170 -0
data/lib/twitter_cldr/shared.rb +1 -0
data/lib/twitter_cldr/shared/caser.rb +3 -3
data/lib/twitter_cldr/shared/unicode_set.rb +77 -0
data/lib/twitter_cldr/utils/range_set.rb +10 -1
data/lib/twitter_cldr/version.rb +1 -1
data/resources/collation/tailoring/km.yml +82 -0
data/resources/collation/tailoring/lo.yml +4 -0
data/resources/collation/tailoring/my.yml +940 -0
data/resources/collation/tries/km.dump +0 -0
data/resources/collation/tries/lo.dump +0 -0
data/resources/collation/tries/my.dump +0 -0
data/resources/locales/km/calendars.yml +373 -0
data/resources/locales/km/currencies.yml +654 -0
data/resources/locales/km/day_periods.yml +96 -0
data/resources/locales/km/fields.yml +495 -0
data/resources/locales/km/languages.yml +397 -0
data/resources/locales/km/layout.yml +5 -0
data/resources/locales/km/lists.yml +37 -0
data/resources/locales/km/numbers.yml +402 -0
data/resources/locales/km/plural_rules.yml +6 -0
data/resources/locales/km/plurals.yml +12 -0
data/resources/locales/km/rbnf.yml +131 -0
data/resources/locales/km/territories.yml +267 -0
data/resources/locales/km/timezones.yml +1471 -0
data/resources/locales/km/units.yml +721 -0
data/resources/locales/lo/calendars.yml +368 -0
data/resources/locales/lo/currencies.yml +918 -0
data/resources/locales/lo/day_periods.yml +96 -0
data/resources/locales/lo/fields.yml +437 -0
data/resources/locales/lo/languages.yml +529 -0
data/resources/locales/lo/layout.yml +5 -0
data/resources/locales/lo/lists.yml +42 -0
data/resources/locales/lo/numbers.yml +476 -0
data/resources/locales/lo/plural_rules.yml +7 -0
data/resources/locales/lo/plurals.yml +14 -0
data/resources/locales/lo/rbnf.yml +119 -0
data/resources/locales/lo/territories.yml +265 -0
data/resources/locales/lo/timezones.yml +1513 -0
data/resources/locales/lo/units.yml +750 -0
data/resources/locales/my/calendars.yml +374 -0
data/resources/locales/my/currencies.yml +697 -0
data/resources/locales/my/day_periods.yml +96 -0
data/resources/locales/my/fields.yml +459 -0
data/resources/locales/my/languages.yml +420 -0
data/resources/locales/my/layout.yml +5 -0
data/resources/locales/my/lists.yml +43 -0
data/resources/locales/my/numbers.yml +417 -0
data/resources/locales/my/plural_rules.yml +6 -0
data/resources/locales/my/plurals.yml +12 -0
data/resources/locales/my/rbnf.yml +145 -0
data/resources/locales/my/territories.yml +265 -0
data/resources/locales/my/timezones.yml +1479 -0
data/resources/locales/my/units.yml +759 -0
data/resources/locales/th/plurals.yml +1 -1
data/resources/shared/segments/dictionaries/burmesedict.dump +0 -0
data/resources/shared/segments/dictionaries/cjdict.dump +0 -0
data/resources/shared/segments/dictionaries/khmerdict.dump +0 -0
data/resources/shared/segments/dictionaries/laodict.dump +0 -0
data/resources/shared/segments/dictionaries/thaidict.dump +0 -0
data/resources/shared/segments/tests/dictionary_tests/combined.yml +1253 -0
data/resources/shared/segments/tests/dictionary_tests/km.yml +204 -0
data/resources/shared/segments/tests/dictionary_tests/ko.yml +171 -0
data/resources/shared/segments/tests/dictionary_tests/lo.yml +236 -0
data/resources/shared/segments/tests/dictionary_tests/my.yml +249 -0
data/resources/shared/segments/tests/dictionary_tests/th.yml +201 -0
data/resources/shared/segments/tests/dictionary_tests/zh.yml +206 -0
data/resources/shared/segments/tests/line_break_test.yml +68 -68
data/resources/shared/segments/tests/sentence_break_test.yml +52 -52
data/resources/supported_locales.yml +3 -0
data/spec/formatters/numbers/rbnf/locales/km/rbnf_test.yml +706 -0
data/spec/formatters/numbers/rbnf/locales/lo/rbnf_test.yml +706 -0
data/spec/formatters/numbers/rbnf/locales/my/rbnf_test.yml +706 -0
data/spec/segmentation/dictionary_break_spec.rb +42 -0
data/spec/segmentation/rule_set_spec.rb +3 -1
data/spec/timezones/tests/km.yml +12475 -0
data/spec/timezones/tests/lo.yml +12475 -0
data/spec/timezones/tests/my.yml +12475 -0
metadata +87 -3

data/lib/twitter_cldr/segmentation/rule_set.rb CHANGED

@@ -26,19 +26,10 @@ module TwitterCldr
         )
       end
-      def each_boundary(str)
-        return to_enum(__method__, str) unless block_given?
+      def each_boundary(cursor, stop = cursor.length)
+        return to_enum(__method__, cursor, stop) unless block_given?
-        cursor = Cursor.new(str)
-        # Let the state machine find the first boundary for the line
-        # boundary type. This helps pass nearly all the Unicode
-        # segmentation tests, so it must be the right thing to do.
-        # Normally the first boundary is the implicit start of text
-        # boundary, but potentially not for the line rules?
-        yield 0 unless state_machine.boundary_type == 'line'
-        until cursor.eos?
+        until cursor.position >= stop || cursor.eos?
           state_machine.handle_next(cursor)
           yield cursor.position if suppressions.should_break?(cursor)
         end

data/lib/twitter_cldr/segmentation/segment_iterator.rb ADDED

@@ -0,0 +1,40 @@
+# encoding: UTF-8
+# Copyright 2012 Twitter, Inc
+# http://www.apache.org/licenses/LICENSE-2.0
+module TwitterCldr
+  module Segmentation
+    class SegmentIterator
+      attr_reader :rule_set
+      def initialize(rule_set)
+        @rule_set = rule_set
+      end
+      def each_segment(str)
+        return to_enum(__method__, str) unless block_given?
+        each_boundary(str).each_cons(2) do |start, stop|
+          yield str[start...stop], start, stop
+        end
+      end
+      def each_boundary(str, &block)
+        return to_enum(__method__, str) unless block_given?
+        # implicit start of text boundary
+        yield 0
+        cursor = create_cursor(str)
+        rule_set.each_boundary(cursor, &block)
+      end
+      private
+      def create_cursor(str)
+        Cursor.new(str)
+      end
+    end
+  end
+end

data/lib/twitter_cldr/segmentation/state_machine.rb CHANGED

@@ -70,7 +70,7 @@ module TwitterCldr
       def handle_next(cursor)
         result = initial_position = cursor.position
         state = START_STATE
-        row = row_index_for(state)
+        row = state * (metadata.category_count + 4)
         category = 3
         mode = :run
@@ -97,7 +97,7 @@ module TwitterCldr
           end
           state = ftable[row + NEXT_STATES + category]
-          row = row_index_for(state)
+          row = state * (metadata.category_count + 4)
           if ftable[row + ACCEPTING] == -1
             # match found
@@ -114,12 +114,6 @@ module TwitterCldr
         result
       end
-      private
-      def row_index_for(state)
-        state * (metadata.category_count + 4)
-      end
     end
   end
 end

data/lib/twitter_cldr/segmentation/thai_break_engine.rb ADDED

@@ -0,0 +1,141 @@
+# encoding: UTF-8
+# Copyright 2012 Twitter, Inc
+# http://www.apache.org/licenses/LICENSE-2.0
+require 'singleton'
+require 'forwardable'
+module TwitterCldr
+  module Segmentation
+    # See: https://github.com/unicode-org/icu/blob/release-65-1/icu4j/main/classes/core/src/com/ibm/icu/text/ThaiBreakEngine.java
+    class ThaiBreakEngine
+      include Singleton
+      extend Forwardable
+      def_delegators :engine, :each_boundary
+      def self.word_set
+        @word_set ||= begin
+          uset = TwitterCldr::Shared::UnicodeSet.new
+          uset.apply_pattern('[[:Thai:]&[:Line_Break=SA:]]')
+          uset.to_set
+        end
+      end
+      # elision character
+      THAI_PAIYANNOI = 0x0E2F
+      # repeat character
+      THAI_MAIYAMOK = 0x0E46
+      def each_boundary(*args, &block)
+        engine.each_boundary(*args, &block)
+      end
+      private
+      def engine
+        @engine ||= BrahmicBreakEngine.new(
+          # How many words in a row are "good enough"?
+          lookahead: 3,
+          # Will not combine a non-word with a preceding dictionary word longer than this
+          root_combine_threshold: 3,
+          # Will not combine a non-word that shares at least this much prefix with a
+          # dictionary word with a preceding word
+          prefix_combine_threshold: 3,
+          # Minimum word size
+          min_word: 2,
+          # Minimum number of characters for two words (min_word * 2)
+          min_word_span: 4,
+          word_set: self.class.word_set,
+          mark_set: mark_set,
+          end_word_set: end_word_set,
+          begin_word_set: begin_word_set,
+          dictionary: Dictionary.thai,
+          advance_past_suffix: -> (*args) do
+            advance_past_suffix(*args)
+          end
+        )
+      end
+      def advance_past_suffix(cursor, end_pos, state)
+        suffix_length = 0
+        if cursor.position < end_pos && state.word_length > 0
+          uc = cursor.codepoint
+          candidates = state.words[state.words_found].candidates(
+            cursor, engine.dictionary, end_pos
+          )
+          if candidates <= 0 && suffix_set.include?(uc)
+            if uc == THAI_PAIYANNOI
+              unless suffix_set.include?(cursor.previous)
+                # skip over previous end and PAIYANNOI
+                cursor.advance(2)
+                suffix_length += 1
+                uc = cursor.codepoint
+              else
+                # restore prior position
+                cursor.advance
+              end
+            end
+            if uc == THAI_MAIYAMOK
+              if cursor.previous != THAI_MAIYAMOK
+                # skip over previous end and MAIYAMOK
+                cursor.advance(2)
+                suffix_length += 1
+              else
+                # restore prior position
+                cursor.advance
+              end
+            end
+          else
+            cursor.position = state.current + state.word_length
+          end
+        end
+        suffix_length
+      end
+      def mark_set
+        @mark_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
+          set.apply_pattern('[[:Thai:]&[:Line_Break=SA:]&[:M:]]')
+          set.add(0x0020)
+        end
+      end
+      def end_word_set
+        @end_word_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
+          set.add_list(self.class.word_set)
+          set.subtract(0x0E31)  # MAI HAN-AKAT
+          set.subtract_range(0x0E40..0x0E44)  # SARA E through SARA AI MAIMALAI
+        end
+      end
+      def begin_word_set
+        @begin_word_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
+          set.add_range(0x0E01..0x0E2E)  # KO KAI through HO NOKHUK
+          set.add_range(0x0E40..0x0E44)  # SARA E through SARA AI MAIMALAI
+        end
+      end
+      def suffix_set
+        @suffix_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
+          set.add(THAI_PAIYANNOI)
+          set.add(THAI_MAIYAMOK)
+        end
+      end
+    end
+  end
+end

data/lib/twitter_cldr/segmentation/unhandled_break_engine.rb ADDED

@@ -0,0 +1,21 @@
+# encoding: UTF-8
+# Copyright 2012 Twitter, Inc
+# http://www.apache.org/licenses/LICENSE-2.0
+require 'singleton'
+module TwitterCldr
+  module Segmentation
+    class UnhandledBreakEngine
+      include Singleton
+      def each_boundary(cursor, &block)
+        return to_enum(__method__, cursor) unless block_given?
+        cursor.advance
+      end
+    end
+  end
+end

data/lib/twitter_cldr/segmentation/word_iterator.rb ADDED

@@ -0,0 +1,170 @@
+# encoding: UTF-8
+# Copyright 2012 Twitter, Inc
+# http://www.apache.org/licenses/LICENSE-2.0
+require 'set'
+module TwitterCldr
+  module Segmentation
+    class DeormalizedStringError < StandardError; end
+    class WordIterator < SegmentIterator
+      DICTIONARY_BREAK_ENGINES = [
+        CjBreakEngine,
+        KoreanBreakEngine,
+        BurmeseBreakEngine,
+        KhmerBreakEngine,
+        LaoBreakEngine,
+        ThaiBreakEngine
+      ]
+      def each_boundary(str, &block)
+        return to_enum(__method__, str) unless block_given?
+        # Rather than put a bunch of duplicate logic in
+        # each_boundary_helper to make sure we don't yield the same
+        # boundary twice in a row, we wrap it in this additional
+        # de-duping block, which skips any boundary equal to the one
+        # reported immediately before it.
+        last_boundary = nil
+        each_boundary_helper(str) do |boundary|
+          yield boundary if boundary != last_boundary
+          last_boundary = boundary
+        end
+      end
+      private
+      def each_boundary_helper(str, &block)
+        # Set up two independent cursors so the algorithm can iterate
+        # over those portions of the input string that require a
+        # dictionary-based break iterator independently from those that
+        # only need the normal, rule-based break iterator. Cursors
+        # hold references to the input text and a list of all the
+        # corresponding Unicode codepoints, meaning they are fairly
+        # expensive to create. The duplication below should only
+        # produce a shallow copy however. The text and codepoint list
+        # are not duplicated, but the cursor's integer position can
+        # be varied independently.
+        dict_cursor = create_cursor(str)
+        rule_cursor = dict_cursor.dup
+        # implicit start of text boundary
+        yield 0
+        until dict_cursor.eos? || rule_cursor.eos?
+          # We use a regex to identify the beginnings of potential runs
+          # of dictionary characters. This regex was benchmarked and
+          # found to be pretty fast, but could become a bottleneck if
+          # other parts of the algorithm are improved in the future.
+          m = dictionary_re.match(dict_cursor.text, dict_cursor.position)
+          break unless m
+          dict_cursor.position = m.begin(0)
+          dict_break_engine = dictionary_break_engine_for(dict_cursor.codepoint)
+          # It's possible to encounter a dictionary character that can't
+          # be handled by any of the dictionary-based break engines
+          # because it's too short to make up an actual word. The
+          # break engine will simply yield no breaks in such a case, which
+          # we test for below by peeking for the first boundary value and
+          # rescuing a StopIteration error. Since the run of dictionary
+          # characters may be arbitrarily long, peeking should be more
+          # performant than attempting to calculate all the boundary
+          # positions for the run at once.
+          #
+          # It should be noted that, despite our best efforts here in
+          # WordIterator, certain dictionary-based break engines (e.g.
+          # CjBreakEngine) cannot yield word boundaries without first
+          # examining the entire run of dictionary characters. In practice
+          # this shouldn't be too big an issue, since Chinese text often
+          # contains punctuation that should limit the average run length.
+          dict_enum = dict_break_engine.each_boundary(dict_cursor)
+          dict_boundary = begin
+            dict_enum.peek
+          rescue StopIteration
+            nil
+          end
+          # If a dictionary boundary was found, attempt to use the rule-based
+          # break iterator to find breaks in the text immediately before it.
+          # Otherwise, since none of the dictionary-based break engines could
+          # find any boundaries in the current run, we advance the dictionary
+          # cursor in an attempt to find the next dictionary boundary. Doing
+          # so effectively causes the algorithm to fall back to the rule-based
+          # break engine.
+          if dict_boundary
+            # Only use the rule-based break engine if there are characters to
+            # process.
+            if rule_cursor.position < m.begin(0)
+              rule_set.each_boundary(rule_cursor, m.begin(0), &block)
+            end
+            # Yield all the dictionary breaks from the enum. We can't use .each
+            # here because that will restart the iteration. Ruby's loop
+            # construct automatically rescues StopIteration.
+            loop do
+              yield dict_enum.next
+            end
+            # We've reached the end of a dictionary character run, so yield
+            # the boundary where the run ends (the dictionary cursor's position).
+            yield dict_cursor.position
+            # These should be the same after a successful dictionary run, i.e.
+            # they should both be positioned at the end of the current rule-based
+            # and dictionary-based portions of the run, ready for the next one.
+            rule_cursor.position = dict_cursor.position
+          else
+            dict_cursor.advance
+          end
+        end
+        # Find boundaries in the straggler, non-dictionary run at the end of
+        # the input text.
+        unless rule_cursor.eos?
+          rule_set.each_boundary(rule_cursor, &block)
+        end
+        # implicit end of text boundary
+        yield rule_cursor.length
+      end
+      # all dictionary characters, i.e. characters that must be handled
+      # by one of the dictionary-based break engines
+      def dictionary_set
+        @@dictionary_set ||= Set.new.tap do |set|
+          DICTIONARY_BREAK_ENGINES.each do |break_engine|
+            set.merge(break_engine.word_set)
+          end
+        end
+      end
+      def dictionary_break_engine_for(codepoint)
+        codepoint_to_engine_cache[codepoint] ||= begin
+          engine = DICTIONARY_BREAK_ENGINES.find do |break_engine|
+            break_engine.word_set.include?(codepoint)
+          end
+          (engine || UnhandledBreakEngine).instance
+        end
+      end
+      def dictionary_re
+        @@dictionary_re ||= begin
+          ranges = TwitterCldr::Utils::RangeSet.from_array(dictionary_set).ranges.map do |r|
+            "\\u{#{r.first.to_s(16)}}-\\u{#{r.last.to_s(16)}}"
+          end
+          /[#{ranges.join}]/
+        end
+      end
+      def codepoint_to_engine_cache
+        @@codepoint_to_engine_cache ||= {}
+      end
+    end
+  end
+end

data/lib/twitter_cldr/shared.rb CHANGED

@@ -32,6 +32,7 @@ module TwitterCldr
     autoload :TerritoriesContainment, 'twitter_cldr/shared/territories_containment'
     autoload :Territory,              'twitter_cldr/shared/territory'
     autoload :UnicodeRegex,           'twitter_cldr/shared/unicode_regex'
+    autoload :UnicodeSet,             'twitter_cldr/shared/unicode_set'
     autoload :Unit,                   'twitter_cldr/shared/unit'
   end
 end

data/lib/twitter_cldr/shared/caser.rb CHANGED

@@ -25,7 +25,7 @@ module TwitterCldr
         # and the following word boundary to Lowercase_Mapping(C).
         def titlecase(string)
           string.dup.tap do |result|
-            boundary_rule_set.each_boundary(result).each_cons(2) do |boundary_pair|
+            word_iterator.each_word(result) do |_, *boundary_pair|
               if cased_pos = first_cased(string, *boundary_pair)
                 result[cased_pos] = titlecasing_hash[result[cased_pos]]
@@ -47,8 +47,8 @@ module TwitterCldr
           end
         end
-        def boundary_rule_set
-          @boundary_rule_set ||= Segmentation::RuleSet.create(:en, 'word')
+        def word_iterator
+          @word_iterator ||= Segmentation::BreakIterator.new(:en)
         end
         def cased?(char)