twitter_cldr 5.2.0 → 5.3.0
- checksums.yaml +4 -4
- data/Gemfile +0 -4
- data/Rakefile +19 -8
- data/lib/twitter_cldr/normalization.rb +18 -5
- data/lib/twitter_cldr/resources.rb +3 -1
- data/lib/twitter_cldr/resources/import_resolver.rb +11 -3
- data/lib/twitter_cldr/resources/loader.rb +22 -1
- data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -9
- data/lib/twitter_cldr/resources/postal_codes_importer.rb +19 -23
- data/lib/twitter_cldr/resources/segment_dictionaries_importer.rb +75 -0
- data/lib/twitter_cldr/resources/segment_tests_importer.rb +130 -13
- data/lib/twitter_cldr/segmentation.rb +25 -10
- data/lib/twitter_cldr/segmentation/brahmic_break_engine.rb +200 -0
- data/lib/twitter_cldr/segmentation/break_iterator.rb +22 -22
- data/lib/twitter_cldr/segmentation/burmese_break_engine.rb +83 -0
- data/lib/twitter_cldr/segmentation/category_table.rb +5 -1
- data/lib/twitter_cldr/segmentation/cj_break_engine.rb +163 -0
- data/lib/twitter_cldr/segmentation/cursor.rb +1 -1
- data/lib/twitter_cldr/segmentation/dictionary.rb +84 -0
- data/lib/twitter_cldr/segmentation/dictionary_break_engine.rb +34 -0
- data/lib/twitter_cldr/segmentation/khmer_break_engine.rb +83 -0
- data/lib/twitter_cldr/segmentation/korean_break_engine.rb +30 -0
- data/lib/twitter_cldr/segmentation/lao_break_engine.rb +85 -0
- data/lib/twitter_cldr/segmentation/line_iterator.rb +23 -0
- data/lib/twitter_cldr/segmentation/possible_word.rb +74 -0
- data/lib/twitter_cldr/segmentation/possible_word_list.rb +23 -0
- data/lib/twitter_cldr/segmentation/rule_set.rb +3 -12
- data/lib/twitter_cldr/segmentation/segment_iterator.rb +40 -0
- data/lib/twitter_cldr/segmentation/state_machine.rb +2 -8
- data/lib/twitter_cldr/segmentation/thai_break_engine.rb +141 -0
- data/lib/twitter_cldr/segmentation/unhandled_break_engine.rb +21 -0
- data/lib/twitter_cldr/segmentation/word_iterator.rb +170 -0
- data/lib/twitter_cldr/shared.rb +1 -0
- data/lib/twitter_cldr/shared/caser.rb +3 -3
- data/lib/twitter_cldr/shared/unicode_set.rb +77 -0
- data/lib/twitter_cldr/utils/range_set.rb +10 -1
- data/lib/twitter_cldr/version.rb +1 -1
- data/resources/collation/tailoring/km.yml +82 -0
- data/resources/collation/tailoring/lo.yml +4 -0
- data/resources/collation/tailoring/my.yml +940 -0
- data/resources/collation/tries/km.dump +0 -0
- data/resources/collation/tries/lo.dump +0 -0
- data/resources/collation/tries/my.dump +0 -0
- data/resources/locales/km/calendars.yml +373 -0
- data/resources/locales/km/currencies.yml +654 -0
- data/resources/locales/km/day_periods.yml +96 -0
- data/resources/locales/km/fields.yml +495 -0
- data/resources/locales/km/languages.yml +397 -0
- data/resources/locales/km/layout.yml +5 -0
- data/resources/locales/km/lists.yml +37 -0
- data/resources/locales/km/numbers.yml +402 -0
- data/resources/locales/km/plural_rules.yml +6 -0
- data/resources/locales/km/plurals.yml +12 -0
- data/resources/locales/km/rbnf.yml +131 -0
- data/resources/locales/km/territories.yml +267 -0
- data/resources/locales/km/timezones.yml +1471 -0
- data/resources/locales/km/units.yml +721 -0
- data/resources/locales/lo/calendars.yml +368 -0
- data/resources/locales/lo/currencies.yml +918 -0
- data/resources/locales/lo/day_periods.yml +96 -0
- data/resources/locales/lo/fields.yml +437 -0
- data/resources/locales/lo/languages.yml +529 -0
- data/resources/locales/lo/layout.yml +5 -0
- data/resources/locales/lo/lists.yml +42 -0
- data/resources/locales/lo/numbers.yml +476 -0
- data/resources/locales/lo/plural_rules.yml +7 -0
- data/resources/locales/lo/plurals.yml +14 -0
- data/resources/locales/lo/rbnf.yml +119 -0
- data/resources/locales/lo/territories.yml +265 -0
- data/resources/locales/lo/timezones.yml +1513 -0
- data/resources/locales/lo/units.yml +750 -0
- data/resources/locales/my/calendars.yml +374 -0
- data/resources/locales/my/currencies.yml +697 -0
- data/resources/locales/my/day_periods.yml +96 -0
- data/resources/locales/my/fields.yml +459 -0
- data/resources/locales/my/languages.yml +420 -0
- data/resources/locales/my/layout.yml +5 -0
- data/resources/locales/my/lists.yml +43 -0
- data/resources/locales/my/numbers.yml +417 -0
- data/resources/locales/my/plural_rules.yml +6 -0
- data/resources/locales/my/plurals.yml +12 -0
- data/resources/locales/my/rbnf.yml +145 -0
- data/resources/locales/my/territories.yml +265 -0
- data/resources/locales/my/timezones.yml +1479 -0
- data/resources/locales/my/units.yml +759 -0
- data/resources/locales/th/plurals.yml +1 -1
- data/resources/shared/segments/dictionaries/burmesedict.dump +0 -0
- data/resources/shared/segments/dictionaries/cjdict.dump +0 -0
- data/resources/shared/segments/dictionaries/khmerdict.dump +0 -0
- data/resources/shared/segments/dictionaries/laodict.dump +0 -0
- data/resources/shared/segments/dictionaries/thaidict.dump +0 -0
- data/resources/shared/segments/tests/dictionary_tests/combined.yml +1253 -0
- data/resources/shared/segments/tests/dictionary_tests/km.yml +204 -0
- data/resources/shared/segments/tests/dictionary_tests/ko.yml +171 -0
- data/resources/shared/segments/tests/dictionary_tests/lo.yml +236 -0
- data/resources/shared/segments/tests/dictionary_tests/my.yml +249 -0
- data/resources/shared/segments/tests/dictionary_tests/th.yml +201 -0
- data/resources/shared/segments/tests/dictionary_tests/zh.yml +206 -0
- data/resources/shared/segments/tests/line_break_test.yml +68 -68
- data/resources/shared/segments/tests/sentence_break_test.yml +52 -52
- data/resources/supported_locales.yml +3 -0
- data/spec/formatters/numbers/rbnf/locales/km/rbnf_test.yml +706 -0
- data/spec/formatters/numbers/rbnf/locales/lo/rbnf_test.yml +706 -0
- data/spec/formatters/numbers/rbnf/locales/my/rbnf_test.yml +706 -0
- data/spec/segmentation/dictionary_break_spec.rb +42 -0
- data/spec/segmentation/rule_set_spec.rb +3 -1
- data/spec/timezones/tests/km.yml +12475 -0
- data/spec/timezones/tests/lo.yml +12475 -0
- data/spec/timezones/tests/my.yml +12475 -0
- metadata +87 -3
--- a/data/lib/twitter_cldr/segmentation.rb
+++ b/data/lib/twitter_cldr/segmentation.rb
@@ -5,15 +5,30 @@
 
 module TwitterCldr
   module Segmentation
-    autoload :BreakIterator, 'twitter_cldr/segmentation/break_iterator'
-    autoload :CategoryTable, 'twitter_cldr/segmentation/category_table'
-    autoload :Cursor, 'twitter_cldr/segmentation/cursor'
-    autoload :Metadata, 'twitter_cldr/segmentation/metadata'
-    autoload :NullSuppressions, 'twitter_cldr/segmentation/null_suppressions'
-    autoload :RuleSet, 'twitter_cldr/segmentation/rule_set'
-    autoload :StateMachine, 'twitter_cldr/segmentation/state_machine'
-    autoload :StateTable, 'twitter_cldr/segmentation/state_table'
-    autoload :StatusTable, 'twitter_cldr/segmentation/status_table'
-    autoload :Suppressions, 'twitter_cldr/segmentation/suppressions'
+    autoload :BrahmicBreakEngine, 'twitter_cldr/segmentation/brahmic_break_engine'
+    autoload :BreakIterator, 'twitter_cldr/segmentation/break_iterator'
+    autoload :BurmeseBreakEngine, 'twitter_cldr/segmentation/burmese_break_engine'
+    autoload :CategoryTable, 'twitter_cldr/segmentation/category_table'
+    autoload :CjBreakEngine, 'twitter_cldr/segmentation/cj_break_engine'
+    autoload :Cursor, 'twitter_cldr/segmentation/cursor'
+    autoload :Dictionary, 'twitter_cldr/segmentation/dictionary'
+    autoload :DictionaryBreakEngine, 'twitter_cldr/segmentation/dictionary_break_engine'
+    autoload :KhmerBreakEngine, 'twitter_cldr/segmentation/khmer_break_engine'
+    autoload :KoreanBreakEngine, 'twitter_cldr/segmentation/korean_break_engine'
+    autoload :LaoBreakEngine, 'twitter_cldr/segmentation/lao_break_engine'
+    autoload :LineIterator, 'twitter_cldr/segmentation/line_iterator'
+    autoload :Metadata, 'twitter_cldr/segmentation/metadata'
+    autoload :NullSuppressions, 'twitter_cldr/segmentation/null_suppressions'
+    autoload :PossibleWord, 'twitter_cldr/segmentation/possible_word'
+    autoload :PossibleWordList, 'twitter_cldr/segmentation/possible_word_list'
+    autoload :RuleSet, 'twitter_cldr/segmentation/rule_set'
+    autoload :SegmentIterator, 'twitter_cldr/segmentation/segment_iterator'
+    autoload :StateMachine, 'twitter_cldr/segmentation/state_machine'
+    autoload :StateTable, 'twitter_cldr/segmentation/state_table'
+    autoload :StatusTable, 'twitter_cldr/segmentation/status_table'
+    autoload :Suppressions, 'twitter_cldr/segmentation/suppressions'
+    autoload :ThaiBreakEngine, 'twitter_cldr/segmentation/thai_break_engine'
+    autoload :UnhandledBreakEngine, 'twitter_cldr/segmentation/unhandled_break_engine'
+    autoload :WordIterator, 'twitter_cldr/segmentation/word_iterator'
   end
 end
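
Worth noting for reviewers: autoload keeps this growing list cheap, because Ruby only records a constant-to-file mapping and defers the require until the constant is first referenced, so the fifteen new segmentation files above cost nothing at load time unless a locale actually needs them. A minimal standalone sketch of the mechanism (the Demo/Foo names and temp file are invented for illustration):

require 'tmpdir'

Dir.mktmpdir do |dir|
  path = File.join(dir, 'foo.rb')
  File.write(path, "module Demo\n  class Foo; end\nend\n")

  Demo = Module.new
  Demo.autoload :Foo, path  # records the mapping; foo.rb is not loaded yet

  p Demo.autoload?(:Foo)    # => path (require still pending)
  Demo::Foo                 # first reference triggers the require
  p Demo.autoload?(:Foo)    # => nil (resolved)
end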
--- /dev/null
+++ b/data/lib/twitter_cldr/segmentation/brahmic_break_engine.rb
@@ -0,0 +1,200 @@
+# encoding: UTF-8
+
+# Copyright 2012 Twitter, Inc
+# http://www.apache.org/licenses/LICENSE-2.0
+
+module TwitterCldr
+  module Segmentation
+    # Base class break engine for languages derived from the Brahmic script,
+    # i.e. Lao, Thai, Khmer, and Burmese.
+    #
+    # This class is based on duplicated code found in ICU's BurmeseBreakEngine
+    # and friends, which all make use of the same break logic.
+    class BrahmicBreakEngine < DictionaryBreakEngine
+
+      # ICU keeps track of all these variables inline, but since we've done a
+      # bit of method separating (see below), it's too ugly to pass all of
+      # them around as arguments. Instead we encapsulate them all in this
+      # handy state object.
+      class EngineState
+        attr_accessor :current
+        attr_reader :words
+        attr_accessor :words_found, :word_length
+
+        def initialize(options = {})
+          @current = options.fetch(:current, 0)
+          @words = options.fetch(:words)
+          @words_found = options.fetch(:words_found, 0)
+          @word_length = options.fetch(:word_length, 0)
+        end
+      end
+
+      attr_reader :lookahead, :root_combine_threshold
+      attr_reader :prefix_combine_threshold, :min_word, :min_word_span
+      attr_reader :word_set, :mark_set, :end_word_set, :begin_word_set
+      attr_reader :dictionary, :advance_past_suffix
+
+      def initialize(options = {})
+        @lookahead = options.fetch(:lookahead)
+        @root_combine_threshold = options.fetch(:root_combine_threshold)
+        @prefix_combine_threshold = options.fetch(:prefix_combine_threshold)
+        @min_word = options.fetch(:min_word)
+        @min_word_span = options.fetch(:min_word_span)
+
+        @word_set = options.fetch(:word_set)
+        @mark_set = options.fetch(:mark_set)
+        @end_word_set = options.fetch(:end_word_set)
+        @begin_word_set = options.fetch(:begin_word_set)
+
+        @dictionary = options.fetch(:dictionary)
+        @advance_past_suffix = options.fetch(:advance_past_suffix)
+      end
+
+      private
+
+      # See: https://github.com/unicode-org/icu/blob/release-65-1/icu4j/main/classes/core/src/com/ibm/icu/text/BurmeseBreakEngine.java#L88
+      def divide_up_dictionary_range(cursor, end_pos)
+        return to_enum(__method__, cursor, end_pos) unless block_given?
+        return if (end_pos - cursor.position) < min_word_span
+
+        state = EngineState.new(
+          cursor: cursor,
+          end_pos: end_pos,
+          words: PossibleWordList.new(lookahead)
+        )
+
+        while cursor.position < end_pos
+          state.current = cursor.position
+          state.word_length = 0
+
+          # look for candidate words at the current position
+          candidates = state.words[state.words_found].candidates(
+            cursor, dictionary, end_pos
+          )
+
+          # if we found exactly one, use that
+          if candidates == 1
+            state.word_length = state.words[state.words_found].accept_marked(cursor)
+            state.words_found += 1
+          elsif candidates > 1
+            mark_best_candidate(cursor, end_pos, state)
+            state.word_length = state.words[state.words_found].accept_marked(cursor)
+            state.words_found += 1
+          end
+
+          # We come here after having either found a word or not. We look ahead to the
+          # next word. If it's not a dictionary word, we will combine it with the word we
+          # just found (if there is one), but only if the preceding word does not exceed
+          # the threshold. The cursor should now be positioned at the end of the word we
+          # found.
+          if cursor.position < end_pos && state.word_length < root_combine_threshold
+            # If it is a dictionary word, do nothing. If it isn't, then if there is
+            # no preceding word, or the non-word shares less than the minimum threshold
+            # of characters with a dictionary word, then scan to resynchronize.
+            preceding_words = state.words[state.words_found].candidates(
+              cursor, dictionary, end_pos
+            )
+
+            if preceding_words <= 0 && (state.word_length == 0 || state.words[state.words_found].longest_prefix < prefix_combine_threshold)
+              advance_to_plausible_word_boundary(cursor, end_pos, state)
+            else
+              # backup to where we were for next iteration
+              cursor.position = state.current + state.word_length
+            end
+          end
+
+          # never stop before a combining mark.
+          while cursor.position < end_pos && mark_set.include?(cursor.codepoint)
+            cursor.advance
+            state.word_length += 1
+          end
+
+          # Look ahead for possible suffixes if a dictionary word does not follow.
+          # We do this in code rather than using a rule so that the heuristic
+          # resynch continues to function. For example, one of the suffix characters
+          # could be a typo in the middle of a word.
+          state.word_length += advance_past_suffix.call(
+            cursor, end_pos, state
+          )
+
+          # Did we find a word on this iteration? If so, yield it as a boundary.
+          if state.word_length > 0
+            yield state.current + state.word_length
+          end
+        end
+      end
+
+      private
+
+      # In ICU, this method is part of divide_up_dictionary_range. Extracted here
+      # for readability.
+      def advance_to_plausible_word_boundary(cursor, end_pos, state)
+        remaining = end_pos - (state.current + state.word_length)
+        pc = cursor.codepoint
+        chars = 0
+
+        loop do
+          cursor.advance
+          uc = cursor.codepoint
+          chars += 1
+          remaining -= 1
+
+          break if remaining <= 0
+
+          if end_word_set.include?(pc) && begin_word_set.include?(uc)
+            # Maybe. See if it's in the dictionary.
+            candidate = state.words[state.words_found + 1].candidates(cursor, dictionary, end_pos)
+            cursor.position = state.current + state.word_length + chars
+            break if candidate > 0
+          end
+
+          pc = uc
+        end
+
+        # bump the word count if there wasn't already one
+        state.words_found += 1 if state.word_length <= 0
+
+        # update the length with the passed-over characters
+        state.word_length += chars
+      end
+
+      def mark_best_candidate(cursor, end_pos, state)
+        # if there was more than one, see which one can take us forward the most words
+        found_best = false
+
+        # if we're already at the end of the range, we're done
+        if cursor.position < end_pos
+          loop do
+            words_matched = 1
+
+            if state.words[state.words_found + 1].candidates(cursor, dictionary, end_pos) > 0
+              if words_matched < 2
+                # followed by another dictionary word; mark first word as a good candidate
+                state.words[state.words_found].mark_current
+                words_matched = 2
+              end
+
+              # if we're already at the end of the range, we're done
+              break if cursor.position >= end_pos
+
+              # see if any of the possible second words is followed by a third word
+              loop do
+                # if we find a third word, stop right away
+                if state.words[state.words_found + 2].candidates(cursor, dictionary, end_pos) > 0
+                  state.words[state.words_found].mark_current
+                  found_best = true
+                  break
+                end
+
+                break unless state.words[state.words_found + 1].back_up(cursor)
+              end
+            end
+
+            break unless state.words[state.words_found].back_up(cursor) && !found_best
+          end
+        end
+      end
+
+    end
+  end
+end
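
The "return to_enum(__method__, cursor, end_pos) unless block_given?" guard at the top of divide_up_dictionary_range is a standard Ruby idiom: called with a block, the method yields boundaries directly; called without one, it returns a lazy Enumerator over the same invocation. A tiny self-contained illustration (names invented):

def each_doubled(limit)
  return to_enum(__method__, limit) unless block_given?
  (1..limit).each { |i| yield i * 2 }
end

each_doubled(3) { |n| p n }  # prints 2, 4, 6
p each_doubled(3).to_a       # => [2, 4, 6]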
--- a/data/lib/twitter_cldr/segmentation/break_iterator.rb
+++ b/data/lib/twitter_cldr/segmentation/break_iterator.rb
@@ -6,7 +6,6 @@
 module TwitterCldr
   module Segmentation
     class BreakIterator
-
       attr_reader :locale, :options
 
       def initialize(locale = TwitterCldr.locale, options = {})
@@ -15,43 +14,44 @@ module TwitterCldr
       end
 
       def each_sentence(str, &block)
-
-
+        iter = iterator_for('sentence')
+        iter.each_segment(str, &block)
       end
 
       def each_word(str, &block)
-
-
+        iter = iterator_for('word')
+        iter.each_segment(str, &block)
      end
 
       def each_grapheme_cluster(str, &block)
-
-
+        iter = iterator_for('grapheme')
+        iter.each_segment(str, &block)
       end
 
       def each_line(str, &block)
-
-
+        iter = iterator_for('line')
+        iter.each_segment(str, &block)
       end
 
       private
 
-      def
-
-
-
-
+      def iterator_for(boundary_type)
+        iterator_cache[boundary_type] ||= begin
+          rule_set = RuleSet.create(locale, boundary_type, options)
+
+          case boundary_type
+          when 'line'
+            LineIterator.new(rule_set)
+          when 'word'
+            WordIterator.new(rule_set)
+          else
+            SegmentIterator.new(rule_set)
+          end
         end
       end
 
-      def
-
-        locale, boundary_type, options
-      )
-      end
-
-      def rule_set_cache
-        @rule_set_cache ||= {}
+      def iterator_cache
+        @iterator_cache ||= {}
       end
     end
   end
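
For context, this is roughly how the reworked BreakIterator is driven. A sketch based only on the methods visible in this hunk; the Thai sample string is arbitrary, and each_word is assumed to yield each segmented substring:

require 'twitter_cldr'

# Thai word boundaries now go through the dictionary-backed WordIterator
# rather than the rule-based path alone.
iterator = TwitterCldr::Segmentation::BreakIterator.new(:th)

iterator.each_word('ภาษาไทย') do |word|
  puts word
end

# A second call with the same boundary type reuses the cached
# WordIterator (see iterator_cache above).
iterator.each_word('ภาษาไทย') { |word| puts word }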
--- /dev/null
+++ b/data/lib/twitter_cldr/segmentation/burmese_break_engine.rb
@@ -0,0 +1,83 @@
+# encoding: UTF-8
+
+# Copyright 2012 Twitter, Inc
+# http://www.apache.org/licenses/LICENSE-2.0
+
+require 'singleton'
+require 'forwardable'
+
+module TwitterCldr
+  module Segmentation
+
+    # See: https://github.com/unicode-org/icu/blob/release-65-1/icu4j/main/classes/core/src/com/ibm/icu/text/BurmeseBreakEngine.java
+    class BurmeseBreakEngine
+
+      include Singleton
+      extend Forwardable
+
+      def_delegators :engine, :each_boundary
+
+      def self.word_set
+        @word_set ||= begin
+          uset = TwitterCldr::Shared::UnicodeSet.new
+          uset.apply_pattern('[[:Mymr:]&[:Line_Break=SA:]]')
+          uset.to_set
+        end
+      end
+
+      private
+
+      # All Brahmic scripts (including Burmese) can make use of the same break
+      # logic, so we use composition here and defer to the Brahmic break engine.
+      def engine
+        @engine ||= BrahmicBreakEngine.new(
+          # How many words in a row are "good enough"?
+          lookahead: 3,
+
+          # Will not combine a non-word with a preceding dictionary word longer than this
+          root_combine_threshold: 3,
+
+          # Will not combine a non-word that shares at least this much prefix with a
+          # dictionary word with a preceding word
+          prefix_combine_threshold: 3,
+
+          # Minimum word size
+          min_word: 2,
+
+          # Minimum number of characters for two words (same as min_word for Burmese)
+          min_word_span: 2,
+
+          word_set: self.class.word_set,
+          mark_set: mark_set,
+          end_word_set: end_word_set,
+          begin_word_set: begin_word_set,
+          dictionary: Dictionary.burmese,
+          advance_past_suffix: -> (*) do
+            0 # not applicable to Burmese
+          end
+        )
+      end
+
+      def mark_set
+        @mark_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
+          set.apply_pattern('[[:Mymr:]&[:Line_Break=SA:]&[:M:]]')
+          set.add(0x0020)
+        end
+      end
+
+      def end_word_set
+        @end_word_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
+          set.add_list(self.class.word_set)
+        end
+      end
+
+      def begin_word_set
+        @begin_word_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
+          # basic consonants and independent vowels
+          set.add_range(0x1000..0x102A)
+        end
+      end
+
+    end
+  end
+end
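
The character classes above are built with TwitterCldr::Shared::UnicodeSet, new in this release per the file list. A small sketch using only the calls that appear in this diff; the membership results are expectations based on the Unicode properties involved, not verified output:

require 'twitter_cldr'

# Build the Burmese word set the way BurmeseBreakEngine.word_set does:
# Myanmar-script characters that also carry Line_Break=SA ("complex
# context", i.e. scripts written without spaces between words).
uset = TwitterCldr::Shared::UnicodeSet.new
uset.apply_pattern('[[:Mymr:]&[:Line_Break=SA:]]')
word_set = uset.to_set

word_set.include?(0x1000)  # MYANMAR LETTER KA, expected true
word_set.include?(0x0041)  # LATIN CAPITAL LETTER A, expected false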
--- a/data/lib/twitter_cldr/segmentation/category_table.rb
+++ b/data/lib/twitter_cldr/segmentation/category_table.rb
@@ -45,12 +45,16 @@ module TwitterCldr
       private
 
       def find(codepoint)
-        values.bsearch do |entry|
+        cache[codepoint] ||= values.bsearch do |entry|
           next -1 if codepoint < entry[0]
           next 1 if codepoint > entry[1]
           0
         end
       end
+
+      def cache
+        @cache ||= {}
+      end
     end
   end
 end
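
The lookup above leans on Array#bsearch in find-any mode: the block returns a negative number when the target range lies to the left of the probed entry, a positive number when it lies to the right, and 0 on a hit, which lets a sorted table of disjoint codepoint ranges be probed in O(log n); the new per-codepoint cache then amortizes repeated probes to O(1). A standalone sketch with an invented range table:

# each entry is [first_codepoint, last_codepoint, category]
VALUES = [
  [0x0000, 0x0040, :other],
  [0x0041, 0x005A, :upper],
  [0x0061, 0x007A, :lower]
].freeze

def find(codepoint)
  VALUES.bsearch do |entry|
    next -1 if codepoint < entry[0] # target range lies to the left
    next 1 if codepoint > entry[1]  # target range lies to the right
    0                               # codepoint falls inside this range
  end
end

p find(0x42)  # => [65, 90, :upper]
p find(0x60)  # => nil (no covering range)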
--- /dev/null
+++ b/data/lib/twitter_cldr/segmentation/cj_break_engine.rb
@@ -0,0 +1,163 @@
+# encoding: UTF-8
+
+# Copyright 2012 Twitter, Inc
+# http://www.apache.org/licenses/LICENSE-2.0
+
+require 'singleton'
+
+module TwitterCldr
+  module Segmentation
+    class CjBreakEngine < DictionaryBreakEngine
+
+      include Singleton
+
+      # magic number pulled from ICU's source code, presumably slightly longer
+      # than the longest Chinese/Japanese/Korean word
+      MAX_WORD_SIZE = 20
+
+      # magic number pulled from ICU's source code
+      MAX_SNLP = 255
+
+      # the equivalent of Java's Integer.MAX_VALUE
+      LARGE_NUMBER = 0xFFFFFFFF
+
+      MAX_KATAKANA_LENGTH = 8
+      MAX_KATAKANA_GROUP_LENGTH = 20
+      KATAKANA_COSTS = [8192, 984, 408, 240, 204, 252, 300, 372, 480].freeze
+      MAX_KATAKANA_COST = 8192
+
+      def self.word_set
+        @word_set ||= begin
+          uset = TwitterCldr::Shared::UnicodeSet.new
+          uset.apply_pattern('[:Han:]')
+          uset.apply_pattern('[[:Katakana:]\uff9e\uff9f]')
+          uset.apply_pattern('[:Hiragana:]')
+          uset.add(0xFF70) # HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
+          uset.add(0x30FC) # KATAKANA-HIRAGANA PROLONGED SOUND MARK
+          uset.to_set
+        end
+      end
+
+      private
+
+      def word_set
+        self.class.word_set
+      end
+
+      def divide_up_dictionary_range(cursor, end_pos, &block)
+        return to_enum(__method__, cursor, end_pos) unless block_given?
+
+        input_length = end_pos - cursor.position
+        best_snlp = Array.new(input_length + 1) { LARGE_NUMBER }
+        prev = Array.new(input_length + 1) { -1 }
+
+        best_snlp[0] = 0
+        start_pos = cursor.position
+        is_prev_katakana = false
+
+        until cursor.position >= end_pos
+          idx = cursor.position - start_pos
+
+          if best_snlp[idx] == LARGE_NUMBER
+            cursor.advance
+            next
+          end
+
+          max_search_length = if cursor.position + MAX_WORD_SIZE < end_pos
+            MAX_WORD_SIZE
+          else
+            end_pos - cursor.position
+          end
+
+          count, values, lengths, _ = dictionary.matches(
+            cursor, max_search_length, max_search_length
+          )
+
+          if (count == 0 || lengths[0] != 1) && !hangul_word_set.include?(cursor.codepoint)
+            values[count] = MAX_SNLP
+            lengths[count] = 1
+            count += 1
+          end
+
+          count.times do |j|
+            new_snlp = best_snlp[idx] + values[j]
+
+            if new_snlp < best_snlp[lengths[j] + idx]
+              best_snlp[lengths[j] + idx] = new_snlp
+              prev[lengths[j] + idx] = idx
+            end
+          end
+
+          # In Japanese, single-character Katakana words are pretty rare.
+          # Accordingly, we apply the following heuristic: any continuous
+          # run of Katakana characters is considered a candidate word with
+          # a default cost specified in the katakanaCost table according
+          # to its length.
+          is_katakana = is_katakana?(cursor.codepoint)
+
+          if !is_prev_katakana && is_katakana
+            j = cursor.position + 1
+            cursor.advance
+
+            while j < end_pos && (j - idx) < MAX_KATAKANA_GROUP_LENGTH && is_katakana?(cursor.codepoint)
+              cursor.advance
+              j += 1
+            end
+
+            if (j - idx) < MAX_KATAKANA_GROUP_LENGTH
+              new_snlp = best_snlp[idx] + get_katakana_cost(j - idx)
+
+              if new_snlp < best_snlp[j]
+                best_snlp[j] = new_snlp
+                prev[j] = idx
+              end
+            end
+          end
+
+          is_prev_katakana = is_katakana
+
+          cursor.advance
+        end
+
+        t_boundary = []
+
+        if best_snlp[input_length] == LARGE_NUMBER
+          t_boundary << end_pos
+        else
+          idx = end_pos - start_pos
+
+          while idx > 0
+            t_boundary << idx + start_pos
+            idx = prev[idx]
+          end
+        end
+
+        t_boundary.reverse_each(&block)
+      end
+
+      private
+
+      def hangul_word_set
+        @@hangul_word_set ||= KoreanBreakEngine.word_set
+      end
+
+      def is_katakana?(codepoint)
+        (codepoint >= 0x30A1 && codepoint <= 0x30FE && codepoint != 0x30FB) ||
+          (codepoint >= 0xFF66 && codepoint <= 0xFF9F)
+      end
+
+      def get_katakana_cost(word_length)
+        if word_length > MAX_KATAKANA_LENGTH
+          MAX_KATAKANA_COST
+        else
+          KATAKANA_COSTS[word_length]
+        end
+      end
+
+      def dictionary
+        @dictionary ||= Dictionary.cj
+      end
+
+    end
+  end
+end
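
The core of divide_up_dictionary_range above is a shortest-path dynamic program: best_snlp[i] is the cheapest cost of segmenting the first i characters, prev holds back-pointers, and the boundaries fall out by walking prev back from the end. A compact sketch of the same idea over plain strings (the toy dictionary and costs are invented; the real engine reads both from cjdict.dump):

def best_segmentation(text, dict)
  infinity = Float::INFINITY
  best = Array.new(text.length + 1, infinity)
  prev = Array.new(text.length + 1, -1)
  best[0] = 0

  text.length.times do |i|
    next if best[i] == infinity # position unreachable so far

    dict.each do |word, cost|
      next unless text[i, word.length] == word

      j = i + word.length
      if best[i] + cost < best[j]
        best[j] = best[i] + cost # cheaper way to segment the first j chars
        prev[j] = i
      end
    end
  end

  # walk the back-pointers to recover boundary offsets
  bounds = []
  idx = text.length
  while idx > 0
    bounds << idx
    idx = prev[idx]
  end

  bounds.reverse
end

p best_segmentation('foobar', 'foo' => 10, 'foob' => 30, 'bar' => 12, 'ar' => 25)
# => [3, 6], i.e. "foo" + "bar" (cost 22) beats "foob" + "ar" (cost 55)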