twitter_cldr 5.2.0 → 5.3.0

Files changed (110)
  1. checksums.yaml +4 -4
  2. data/Gemfile +0 -4
  3. data/Rakefile +19 -8
  4. data/lib/twitter_cldr/normalization.rb +18 -5
  5. data/lib/twitter_cldr/resources.rb +3 -1
  6. data/lib/twitter_cldr/resources/import_resolver.rb +11 -3
  7. data/lib/twitter_cldr/resources/loader.rb +22 -1
  8. data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -9
  9. data/lib/twitter_cldr/resources/postal_codes_importer.rb +19 -23
  10. data/lib/twitter_cldr/resources/segment_dictionaries_importer.rb +75 -0
  11. data/lib/twitter_cldr/resources/segment_tests_importer.rb +130 -13
  12. data/lib/twitter_cldr/segmentation.rb +25 -10
  13. data/lib/twitter_cldr/segmentation/brahmic_break_engine.rb +200 -0
  14. data/lib/twitter_cldr/segmentation/break_iterator.rb +22 -22
  15. data/lib/twitter_cldr/segmentation/burmese_break_engine.rb +83 -0
  16. data/lib/twitter_cldr/segmentation/category_table.rb +5 -1
  17. data/lib/twitter_cldr/segmentation/cj_break_engine.rb +163 -0
  18. data/lib/twitter_cldr/segmentation/cursor.rb +1 -1
  19. data/lib/twitter_cldr/segmentation/dictionary.rb +84 -0
  20. data/lib/twitter_cldr/segmentation/dictionary_break_engine.rb +34 -0
  21. data/lib/twitter_cldr/segmentation/khmer_break_engine.rb +83 -0
  22. data/lib/twitter_cldr/segmentation/korean_break_engine.rb +30 -0
  23. data/lib/twitter_cldr/segmentation/lao_break_engine.rb +85 -0
  24. data/lib/twitter_cldr/segmentation/line_iterator.rb +23 -0
  25. data/lib/twitter_cldr/segmentation/possible_word.rb +74 -0
  26. data/lib/twitter_cldr/segmentation/possible_word_list.rb +23 -0
  27. data/lib/twitter_cldr/segmentation/rule_set.rb +3 -12
  28. data/lib/twitter_cldr/segmentation/segment_iterator.rb +40 -0
  29. data/lib/twitter_cldr/segmentation/state_machine.rb +2 -8
  30. data/lib/twitter_cldr/segmentation/thai_break_engine.rb +141 -0
  31. data/lib/twitter_cldr/segmentation/unhandled_break_engine.rb +21 -0
  32. data/lib/twitter_cldr/segmentation/word_iterator.rb +170 -0
  33. data/lib/twitter_cldr/shared.rb +1 -0
  34. data/lib/twitter_cldr/shared/caser.rb +3 -3
  35. data/lib/twitter_cldr/shared/unicode_set.rb +77 -0
  36. data/lib/twitter_cldr/utils/range_set.rb +10 -1
  37. data/lib/twitter_cldr/version.rb +1 -1
  38. data/resources/collation/tailoring/km.yml +82 -0
  39. data/resources/collation/tailoring/lo.yml +4 -0
  40. data/resources/collation/tailoring/my.yml +940 -0
  41. data/resources/collation/tries/km.dump +0 -0
  42. data/resources/collation/tries/lo.dump +0 -0
  43. data/resources/collation/tries/my.dump +0 -0
  44. data/resources/locales/km/calendars.yml +373 -0
  45. data/resources/locales/km/currencies.yml +654 -0
  46. data/resources/locales/km/day_periods.yml +96 -0
  47. data/resources/locales/km/fields.yml +495 -0
  48. data/resources/locales/km/languages.yml +397 -0
  49. data/resources/locales/km/layout.yml +5 -0
  50. data/resources/locales/km/lists.yml +37 -0
  51. data/resources/locales/km/numbers.yml +402 -0
  52. data/resources/locales/km/plural_rules.yml +6 -0
  53. data/resources/locales/km/plurals.yml +12 -0
  54. data/resources/locales/km/rbnf.yml +131 -0
  55. data/resources/locales/km/territories.yml +267 -0
  56. data/resources/locales/km/timezones.yml +1471 -0
  57. data/resources/locales/km/units.yml +721 -0
  58. data/resources/locales/lo/calendars.yml +368 -0
  59. data/resources/locales/lo/currencies.yml +918 -0
  60. data/resources/locales/lo/day_periods.yml +96 -0
  61. data/resources/locales/lo/fields.yml +437 -0
  62. data/resources/locales/lo/languages.yml +529 -0
  63. data/resources/locales/lo/layout.yml +5 -0
  64. data/resources/locales/lo/lists.yml +42 -0
  65. data/resources/locales/lo/numbers.yml +476 -0
  66. data/resources/locales/lo/plural_rules.yml +7 -0
  67. data/resources/locales/lo/plurals.yml +14 -0
  68. data/resources/locales/lo/rbnf.yml +119 -0
  69. data/resources/locales/lo/territories.yml +265 -0
  70. data/resources/locales/lo/timezones.yml +1513 -0
  71. data/resources/locales/lo/units.yml +750 -0
  72. data/resources/locales/my/calendars.yml +374 -0
  73. data/resources/locales/my/currencies.yml +697 -0
  74. data/resources/locales/my/day_periods.yml +96 -0
  75. data/resources/locales/my/fields.yml +459 -0
  76. data/resources/locales/my/languages.yml +420 -0
  77. data/resources/locales/my/layout.yml +5 -0
  78. data/resources/locales/my/lists.yml +43 -0
  79. data/resources/locales/my/numbers.yml +417 -0
  80. data/resources/locales/my/plural_rules.yml +6 -0
  81. data/resources/locales/my/plurals.yml +12 -0
  82. data/resources/locales/my/rbnf.yml +145 -0
  83. data/resources/locales/my/territories.yml +265 -0
  84. data/resources/locales/my/timezones.yml +1479 -0
  85. data/resources/locales/my/units.yml +759 -0
  86. data/resources/locales/th/plurals.yml +1 -1
  87. data/resources/shared/segments/dictionaries/burmesedict.dump +0 -0
  88. data/resources/shared/segments/dictionaries/cjdict.dump +0 -0
  89. data/resources/shared/segments/dictionaries/khmerdict.dump +0 -0
  90. data/resources/shared/segments/dictionaries/laodict.dump +0 -0
  91. data/resources/shared/segments/dictionaries/thaidict.dump +0 -0
  92. data/resources/shared/segments/tests/dictionary_tests/combined.yml +1253 -0
  93. data/resources/shared/segments/tests/dictionary_tests/km.yml +204 -0
  94. data/resources/shared/segments/tests/dictionary_tests/ko.yml +171 -0
  95. data/resources/shared/segments/tests/dictionary_tests/lo.yml +236 -0
  96. data/resources/shared/segments/tests/dictionary_tests/my.yml +249 -0
  97. data/resources/shared/segments/tests/dictionary_tests/th.yml +201 -0
  98. data/resources/shared/segments/tests/dictionary_tests/zh.yml +206 -0
  99. data/resources/shared/segments/tests/line_break_test.yml +68 -68
  100. data/resources/shared/segments/tests/sentence_break_test.yml +52 -52
  101. data/resources/supported_locales.yml +3 -0
  102. data/spec/formatters/numbers/rbnf/locales/km/rbnf_test.yml +706 -0
  103. data/spec/formatters/numbers/rbnf/locales/lo/rbnf_test.yml +706 -0
  104. data/spec/formatters/numbers/rbnf/locales/my/rbnf_test.yml +706 -0
  105. data/spec/segmentation/dictionary_break_spec.rb +42 -0
  106. data/spec/segmentation/rule_set_spec.rb +3 -1
  107. data/spec/timezones/tests/km.yml +12475 -0
  108. data/spec/timezones/tests/lo.yml +12475 -0
  109. data/spec/timezones/tests/my.yml +12475 -0
  110. metadata +87 -3
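
Taken together, this release adds dictionary-based word segmentation (ported from ICU) for Burmese, Chinese/Japanese, Khmer, Korean, Lao, and Thai, plus full locale data for Khmer (km), Lao (lo), and Burmese (my). A minimal usage sketch, assuming each_word still yields each segment with its boundary offsets as it did in 5.2 (the Thai sample string is illustrative):

    require 'twitter_cldr'

    # Thai is written without spaces between words; 5.3.0 finds word
    # boundaries using the bundled thaidict.dump dictionary.
    iter = TwitterCldr::Segmentation::BreakIterator.new(:th)

    iter.each_word('ภาษาไทย') do |word, start, stop|
      puts "#{start}...#{stop} #{word}"
    end

The diffs below cover the most interesting of the changed files.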
data/lib/twitter_cldr/segmentation.rb
@@ -5,15 +5,30 @@
 
 module TwitterCldr
   module Segmentation
-    autoload :BreakIterator, 'twitter_cldr/segmentation/break_iterator'
-    autoload :CategoryTable, 'twitter_cldr/segmentation/category_table'
-    autoload :Cursor, 'twitter_cldr/segmentation/cursor'
-    autoload :Metadata, 'twitter_cldr/segmentation/metadata'
-    autoload :NullSuppressions, 'twitter_cldr/segmentation/null_suppressions'
-    autoload :RuleSet, 'twitter_cldr/segmentation/rule_set'
-    autoload :StateMachine, 'twitter_cldr/segmentation/state_machine'
-    autoload :StateTable, 'twitter_cldr/segmentation/state_table'
-    autoload :StatusTable, 'twitter_cldr/segmentation/status_table'
-    autoload :Suppressions, 'twitter_cldr/segmentation/suppressions'
+    autoload :BrahmicBreakEngine, 'twitter_cldr/segmentation/brahmic_break_engine'
+    autoload :BreakIterator, 'twitter_cldr/segmentation/break_iterator'
+    autoload :BurmeseBreakEngine, 'twitter_cldr/segmentation/burmese_break_engine'
+    autoload :CategoryTable, 'twitter_cldr/segmentation/category_table'
+    autoload :CjBreakEngine, 'twitter_cldr/segmentation/cj_break_engine'
+    autoload :Cursor, 'twitter_cldr/segmentation/cursor'
+    autoload :Dictionary, 'twitter_cldr/segmentation/dictionary'
+    autoload :DictionaryBreakEngine, 'twitter_cldr/segmentation/dictionary_break_engine'
+    autoload :KhmerBreakEngine, 'twitter_cldr/segmentation/khmer_break_engine'
+    autoload :KoreanBreakEngine, 'twitter_cldr/segmentation/korean_break_engine'
+    autoload :LaoBreakEngine, 'twitter_cldr/segmentation/lao_break_engine'
+    autoload :LineIterator, 'twitter_cldr/segmentation/line_iterator'
+    autoload :Metadata, 'twitter_cldr/segmentation/metadata'
+    autoload :NullSuppressions, 'twitter_cldr/segmentation/null_suppressions'
+    autoload :PossibleWord, 'twitter_cldr/segmentation/possible_word'
+    autoload :PossibleWordList, 'twitter_cldr/segmentation/possible_word_list'
+    autoload :RuleSet, 'twitter_cldr/segmentation/rule_set'
+    autoload :SegmentIterator, 'twitter_cldr/segmentation/segment_iterator'
+    autoload :StateMachine, 'twitter_cldr/segmentation/state_machine'
+    autoload :StateTable, 'twitter_cldr/segmentation/state_table'
+    autoload :StatusTable, 'twitter_cldr/segmentation/status_table'
+    autoload :Suppressions, 'twitter_cldr/segmentation/suppressions'
+    autoload :ThaiBreakEngine, 'twitter_cldr/segmentation/thai_break_engine'
+    autoload :UnhandledBreakEngine, 'twitter_cldr/segmentation/unhandled_break_engine'
+    autoload :WordIterator, 'twitter_cldr/segmentation/word_iterator'
   end
 end
data/lib/twitter_cldr/segmentation/brahmic_break_engine.rb (new file)
@@ -0,0 +1,200 @@
+# encoding: UTF-8
+
+# Copyright 2012 Twitter, Inc
+# http://www.apache.org/licenses/LICENSE-2.0
+
+module TwitterCldr
+  module Segmentation
+    # Base class break engine for languages derived from the Brahmic script,
+    # i.e. Lao, Thai, Khmer, and Burmese.
+    #
+    # This class is based on duplicated code found in ICU's BurmeseBreakEngine
+    # and friends, which all make use of the same break logic.
+    class BrahmicBreakEngine < DictionaryBreakEngine
+
+      # ICU keeps track of all these variables inline, but since we've done a
+      # bit of method separating (see below), it's too ugly to pass all of
+      # them around as arguments. Instead we encapsulate them all in this
+      # handy state object.
+      class EngineState
+        attr_accessor :current
+        attr_reader :words
+        attr_accessor :words_found, :word_length
+
+        def initialize(options = {})
+          @current = options.fetch(:current, 0)
+          @words = options.fetch(:words)
+          @words_found = options.fetch(:words_found, 0)
+          @word_length = options.fetch(:word_length, 0)
+        end
+      end
+
+      attr_reader :lookahead, :root_combine_threshold
+      attr_reader :prefix_combine_threshold, :min_word, :min_word_span
+      attr_reader :word_set, :mark_set, :end_word_set, :begin_word_set
+      attr_reader :dictionary, :advance_past_suffix
+
+      def initialize(options = {})
+        @lookahead = options.fetch(:lookahead)
+        @root_combine_threshold = options.fetch(:root_combine_threshold)
+        @prefix_combine_threshold = options.fetch(:prefix_combine_threshold)
+        @min_word = options.fetch(:min_word)
+        @min_word_span = options.fetch(:min_word_span)
+
+        @word_set = options.fetch(:word_set)
+        @mark_set = options.fetch(:mark_set)
+        @end_word_set = options.fetch(:end_word_set)
+        @begin_word_set = options.fetch(:begin_word_set)
+
+        @dictionary = options.fetch(:dictionary)
+        @advance_past_suffix = options.fetch(:advance_past_suffix)
+      end
+
+      private
+
+      # See: https://github.com/unicode-org/icu/blob/release-65-1/icu4j/main/classes/core/src/com/ibm/icu/text/BurmeseBreakEngine.java#L88
+      def divide_up_dictionary_range(cursor, end_pos)
+        return to_enum(__method__, cursor, end_pos) unless block_given?
+        return if (end_pos - cursor.position) < min_word_span
+
+        state = EngineState.new(
+          cursor: cursor,
+          end_pos: end_pos,
+          words: PossibleWordList.new(lookahead)
+        )
+
+        while cursor.position < end_pos
+          state.current = cursor.position
+          state.word_length = 0
+
+          # look for candidate words at the current position
+          candidates = state.words[state.words_found].candidates(
+            cursor, dictionary, end_pos
+          )
+
+          # if we found exactly one, use that
+          if candidates == 1
+            state.word_length = state.words[state.words_found].accept_marked(cursor)
+            state.words_found += 1
+          elsif candidates > 1
+            mark_best_candidate(cursor, end_pos, state)
+            state.word_length = state.words[state.words_found].accept_marked(cursor)
+            state.words_found += 1
+          end
+
+          # We come here after having either found a word or not. We look ahead to the
+          # next word. If it's not a dictionary word, we will combine it with the word we
+          # just found (if there is one), but only if the preceding word does not exceed
+          # the threshold. The cursor should now be positioned at the end of the word we
+          # found.
+          if cursor.position < end_pos && state.word_length < root_combine_threshold
+            # If it is a dictionary word, do nothing. If it isn't, then if there is
+            # no preceding word, or the non-word shares less than the minimum threshold
+            # of characters with a dictionary word, then scan to resynchronize.
+            preceeding_words = state.words[state.words_found].candidates(
+              cursor, dictionary, end_pos
+            )
+
+            if preceeding_words <= 0 && (state.word_length == 0 || state.words[state.words_found].longest_prefix < prefix_combine_threshold)
+              advance_to_plausible_word_boundary(cursor, end_pos, state)
+            else
+              # backup to where we were for next iteration
+              cursor.position = state.current + state.word_length
+            end
+          end
+
+          # never stop before a combining mark.
+          while cursor.position < end_pos && mark_set.include?(cursor.codepoint)
+            cursor.advance
+            state.word_length += 1
+          end
+
+          # Look ahead for possible suffixes if a dictionary word does not follow.
+          # We do this in code rather than using a rule so that the heuristic
+          # resynch continues to function. For example, one of the suffix characters
+          # could be a typo in the middle of a word.
+          state.word_length += advance_past_suffix.call(
+            cursor, end_pos, state
+          )
+
+          # Did we find a word on this iteration? If so, yield it as a boundary.
+          if state.word_length > 0
+            yield state.current + state.word_length
+          end
+        end
+      end
+
+      private
+
+      # In ICU, this method is part of divide_up_dictionary_range. Extracted here
+      # for readability.
+      def advance_to_plausible_word_boundary(cursor, end_pos, state)
+        remaining = end_pos - (state.current + state.word_length)
+        pc = cursor.codepoint
+        chars = 0
+
+        loop do
+          cursor.advance
+          uc = cursor.codepoint
+          chars += 1
+          remaining -= 1
+
+          break if remaining <= 0
+
+          if end_word_set.include?(pc) && begin_word_set.include?(uc)
+            # Maybe. See if it's in the dictionary.
+            candidate = state.words[state.words_found + 1].candidates(cursor, dictionary, end_pos)
+            cursor.position = state.current + state.word_length + chars
+            break if candidate > 0
+          end
+
+          pc = uc
+        end
+
+        # bump the word count if there wasn't already one
+        state.words_found += 1 if state.word_length <= 0
+
+        # update the length with the passed-over characters
+        state.word_length += chars
+      end
+
+      def mark_best_candidate(cursor, end_pos, state)
+        # if there was more than one, see which one can take us forward the most words
+        found_best = false
+
+        # if we're already at the end of the range, we're done
+        if cursor.position < end_pos
+          loop do
+            words_matched = 1
+
+            if state.words[state.words_found + 1].candidates(cursor, dictionary, end_pos) > 0
+              if words_matched < 2
+                # followed by another dictionary word; mark first word as a good candidate
+                state.words[state.words_found].mark_current
+                words_matched = 2
+              end
+
+              # if we're already at the end of the range, we're done
+              break if cursor.position >= end_pos
+
+              # see if any of the possible second words is followed by a third word
+              loop do
+                # if we find a third word, stop right away
+                if state.words[state.words_found + 2].candidates(cursor, dictionary, end_pos) > 0
+                  state.words[state.words_found].mark_current
+                  found_best = true
+                  break
+                end
+
+                break unless state.words[state.words_found + 1].back_up(cursor)
+              end
+            end
+
+            break unless state.words[state.words_found].back_up(cursor) && !found_best
+          end
+        end
+      end
+
+    end
+  end
+end
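
divide_up_dictionary_range walks the range one word at a time; when several dictionary words start at the cursor, mark_best_candidate prefers the candidate that is itself followed by a second and, ideally, a third dictionary word. A simplified, self-contained illustration of that lookahead idea (made-up dictionary and helper names, not the gem's code):

    require 'set'

    # Toy dictionary; the real engines match against tries loaded from
    # resources/shared/segments/dictionaries/*.dump.
    DICT = Set.new(%w[th the there re fore forecast cast])

    # all dictionary words that are prefixes of str starting at pos
    def candidates(str, pos, dict)
      (1..(str.length - pos)).map { |len| str[pos, len] }.select { |w| dict.include?(w) }
    end

    # longest chain of dictionary words reachable from pos, capped at `depth`
    # words of lookahead (the engine caps this with its `lookahead` setting)
    def chain_depth(str, pos, dict, depth)
      return 0 if depth.zero?
      words = candidates(str, pos, dict)
      return 0 if words.empty?
      1 + words.map { |w| chain_depth(str, pos + w.length, dict, depth - 1) }.max
    end

    # prefer the candidate that lets the most following words match
    def best_word(str, pos, dict)
      candidates(str, pos, dict).max_by { |w| chain_depth(str, pos + w.length, dict, 2) }
    end

    best_word('therefore', 0, DICT) # => "the" ("re" and then "fore" both match)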
data/lib/twitter_cldr/segmentation/break_iterator.rb
@@ -6,7 +6,6 @@
 module TwitterCldr
   module Segmentation
     class BreakIterator
-
      attr_reader :locale, :options
 
      def initialize(locale = TwitterCldr.locale, options = {})
@@ -15,43 +14,44 @@ module TwitterCldr
      end
 
      def each_sentence(str, &block)
-       rule_set = rule_set_for('sentence')
-       each_boundary(rule_set, str, &block)
+       iter = iterator_for('sentence')
+       iter.each_segment(str, &block)
      end
 
      def each_word(str, &block)
-       rule_set = rule_set_for('word')
-       each_boundary(rule_set, str, &block)
+       iter = iterator_for('word')
+       iter.each_segment(str, &block)
      end
 
      def each_grapheme_cluster(str, &block)
-       rule_set = rule_set_for('grapheme')
-       each_boundary(rule_set, str, &block)
+       iter = iterator_for('grapheme')
+       iter.each_segment(str, &block)
      end
 
      def each_line(str, &block)
-       rule_set = rule_set_for('line')
-       each_boundary(rule_set, str, &block)
+       iter = iterator_for('line')
+       iter.each_segment(str, &block)
      end
 
      private
 
-     def each_boundary(rule_set, str)
-       return to_enum(__method__, rule_set, str) unless block_given?
-
-       rule_set.each_boundary(str).each_cons(2) do |start, stop|
-         yield str[start...stop], start, stop
+     def iterator_for(boundary_type)
+       iterator_cache[boundary_type] ||= begin
+         rule_set = RuleSet.create(locale, boundary_type, options)
+
+         case boundary_type
+         when 'line'
+           LineIterator.new(rule_set)
+         when 'word'
+           WordIterator.new(rule_set)
+         else
+           SegmentIterator.new(rule_set)
+         end
        end
      end
 
-     def rule_set_for(boundary_type)
-       rule_set_cache[boundary_type] ||= RuleSet.create(
-         locale, boundary_type, options
-       )
-     end
-
-     def rule_set_cache
-       @rule_set_cache ||= {}
+     def iterator_cache
+       @iterator_cache ||= {}
      end
    end
 end
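
The public each_sentence / each_word / each_grapheme_cluster / each_line API is unchanged; what's new is that each boundary type now gets its own iterator ('word' and 'line' get dictionary-aware iterators), cached per BreakIterator instance. A sketch of the caching behavior, assuming each_segment yields the same (substring, start, stop) triple the old each_boundary did:

    iter = TwitterCldr::Segmentation::BreakIterator.new(:ja)

    iter.each_word('日本語') { |*| }    # builds and caches a WordIterator
    iter.each_line('日本語') { |*| }    # builds and caches a LineIterator
    iter.each_word('にほん') { |*| }    # reuses the cached WordIterator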
data/lib/twitter_cldr/segmentation/burmese_break_engine.rb (new file)
@@ -0,0 +1,83 @@
+# encoding: UTF-8
+
+# Copyright 2012 Twitter, Inc
+# http://www.apache.org/licenses/LICENSE-2.0
+
+require 'singleton'
+require 'forwardable'
+
+module TwitterCldr
+  module Segmentation
+
+    # See: https://github.com/unicode-org/icu/blob/release-65-1/icu4j/main/classes/core/src/com/ibm/icu/text/BurmeseBreakEngine.java
+    class BurmeseBreakEngine
+
+      include Singleton
+      extend Forwardable
+
+      def_delegators :engine, :each_boundary
+
+      def self.word_set
+        @word_set ||= begin
+          uset = TwitterCldr::Shared::UnicodeSet.new
+          uset.apply_pattern('[[:Mymr:]&[:Line_Break=SA:]]')
+          uset.to_set
+        end
+      end
+
+      private
+
+      # All Brahmic scripts (including Burmese) can make use of the same break
+      # logic, so we use composition here and defer to the Brahmic break engine.
+      def engine
+        @engine ||= BrahmicBreakEngine.new(
+          # How many words in a row are "good enough"?
+          lookahead: 3,
+
+          # Will not combine a non-word with a preceding dictionary word longer than this
+          root_combine_threshold: 3,
+
+          # Will not combine a non-word that shares at least this much prefix with a
+          # dictionary word with a preceding word
+          prefix_combine_threshold: 3,
+
+          # Minimum word size
+          min_word: 2,
+
+          # Minimum number of characters for two words (same as min_word for Burmese)
+          min_word_span: 2,
+
+          word_set: self.class.word_set,
+          mark_set: mark_set,
+          end_word_set: end_word_set,
+          begin_word_set: begin_word_set,
+          dictionary: Dictionary.burmese,
+          advance_past_suffix: -> (*) do
+            0 # not applicable to Burmese
+          end
+        )
+      end
+
+      def mark_set
+        @mark_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
+          set.apply_pattern('[[:Mymr:]&[:Line_Break=SA:]&[:M:]]')
+          set.add(0x0020)
+        end
+      end
+
+      def end_word_set
+        @end_word_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
+          set.add_list(self.class.word_set)
+        end
+      end
+
+      def begin_word_set
+        @begin_word_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
+          # basic consonants and independent vowels
+          set.add_range(0x1000..0x102A)
+        end
+      end
+
+    end
+  end
+end
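
The character classes above are built with TwitterCldr::Shared::UnicodeSet, also new in this release. Going only by the calls visible in these diffs (apply_pattern, add, add_range, add_list, to_set), usage looks roughly like the following; treating to_set as returning a plain Ruby Set of integer codepoints is an assumption based on how word_set is consumed:

    set = TwitterCldr::Shared::UnicodeSet.new
    set.apply_pattern('[[:Mymr:]&[:Line_Break=SA:]]') # Myanmar chars with line-break class SA
    set.add(0x0020)                                   # plus the ASCII space
    codepoints = set.to_set                           # assumed: Set of integer codepoints
    codepoints.include?(0x1000)                       # => true (MYANMAR LETTER KA)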
data/lib/twitter_cldr/segmentation/category_table.rb
@@ -45,12 +45,16 @@ module TwitterCldr
       private
 
       def find(codepoint)
-        values.bsearch do |entry|
+        cache[codepoint] ||= values.bsearch do |entry|
           next -1 if codepoint < entry[0]
           next 1 if codepoint > entry[1]
           0
         end
       end
+
+      def cache
+        @cache ||= {}
+      end
     end
   end
 end
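
This is the only change to CategoryTable: find is now memoized, since segmentation looks up the same codepoints repeatedly and the binary search over [first, last, value] ranges is pure. The same pattern in isolation (made-up table):

    # ranges sorted by codepoint, as the gem's category tables are
    VALUES = [
      [0x0000, 0x007F, :latin],
      [0x0E00, 0x0E7F, :thai]
    ]

    CACHE = {}

    def find(codepoint)
      CACHE[codepoint] ||= VALUES.bsearch do |entry|
        next -1 if codepoint < entry[0] # target lies before this range
        next 1 if codepoint > entry[1]  # target lies after this range
        0                               # codepoint falls inside this range
      end
    end

    find(0x0E40) # => [0x0E00, 0x0E7F, :thai]; subsequent calls hit CACHE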
data/lib/twitter_cldr/segmentation/cj_break_engine.rb (new file)
@@ -0,0 +1,163 @@
+# encoding: UTF-8
+
+# Copyright 2012 Twitter, Inc
+# http://www.apache.org/licenses/LICENSE-2.0
+
+require 'singleton'
+
+module TwitterCldr
+  module Segmentation
+    class CjBreakEngine < DictionaryBreakEngine
+
+      include Singleton
+
+      # magic number pulled from ICU's source code, presumably slightly longer
+      # than the longest Chinese/Japanese/Korean word
+      MAX_WORD_SIZE = 20
+
+      # magic number pulled from ICU's source code
+      MAX_SNLP = 255
+
+      # the equivalent of Java's Integer.MAX_VALUE
+      LARGE_NUMBER = 0xFFFFFFFF
+
+      MAX_KATAKANA_LENGTH = 8
+      MAX_KATAKANA_GROUP_LENGTH = 20
+      KATAKANA_COSTS = [8192, 984, 408, 240, 204, 252, 300, 372, 480].freeze
+      MAX_KATAKANA_COST = 8192
+
+      def self.word_set
+        @word_set ||= begin
+          uset = TwitterCldr::Shared::UnicodeSet.new
+          uset.apply_pattern('[:Han:]')
+          uset.apply_pattern('[[:Katakana:]\uff9e\uff9f]')
+          uset.apply_pattern('[:Hiragana:]')
+          uset.add(0xFF70) # HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
+          uset.add(0x30FC) # KATAKANA-HIRAGANA PROLONGED SOUND MARK
+          uset.to_set
+        end
+      end
+
+      private
+
+      def word_set
+        self.class.word_set
+      end
+
+      def divide_up_dictionary_range(cursor, end_pos, &block)
+        return to_enum(__method__, cursor, end_pos) unless block_given?
+
+        input_length = end_pos - cursor.position
+        best_snlp = Array.new(input_length + 1) { LARGE_NUMBER }
+        prev = Array.new(input_length + 1) { -1 }
+
+        best_snlp[0] = 0
+        start_pos = cursor.position
+        is_prev_katakana = false
+
+        until cursor.position >= end_pos
+          idx = cursor.position - start_pos
+
+          if best_snlp[idx] == LARGE_NUMBER
+            cursor.advance
+            next
+          end
+
+          max_search_length = if cursor.position + MAX_WORD_SIZE < end_pos
+            MAX_WORD_SIZE
+          else
+            end_pos - cursor.position
+          end
+
+          count, values, lengths, _ = dictionary.matches(
+            cursor, max_search_length, max_search_length
+          )
+
+          if (count == 0 || lengths[0] != 1) && !hangul_word_set.include?(cursor.codepoint)
+            values[count] = MAX_SNLP
+            lengths[count] = 1
+            count += 1
+          end
+
+          count.times do |j|
+            new_snlp = best_snlp[idx] + values[j]
+
+            if new_snlp < best_snlp[lengths[j] + idx]
+              best_snlp[lengths[j] + idx] = new_snlp
+              prev[lengths[j] + idx] = idx
+            end
+          end
+
+          # In Japanese, single-character Katakana words are pretty rare.
+          # Accordingly, we apply the following heuristic: any continuous
+          # run of Katakana characters is considered a candidate word with
+          # a default cost specified in the katakanaCost table according
+          # to its length.
+          is_katakana = is_katakana?(cursor.codepoint)
+
+          if !is_prev_katakana && is_katakana
+            j = cursor.position + 1
+            cursor.advance
+
+            while j < end_pos && (j - idx) < MAX_KATAKANA_GROUP_LENGTH && is_katakana?(cursor.codepoint)
+              cursor.advance
+              j += 1
+            end
+
+            if (j - idx) < MAX_KATAKANA_GROUP_LENGTH
+              new_snlp = best_snlp[idx] + get_katakana_cost(j - idx)
+
+              if new_snlp < best_snlp[j]
+                best_snlp[j] = new_snlp
+                prev[j] = idx
+              end
+            end
+          end
+
+          is_prev_katakana = is_katakana
+
+          cursor.advance
+        end
+
+        t_boundary = []
+
+        if best_snlp[input_length] == LARGE_NUMBER
+          t_boundary << end_pos
+        else
+          idx = end_pos - start_pos
+
+          while idx > 0
+            t_boundary << idx + start_pos
+            idx = prev[idx]
+          end
+        end
+
+        t_boundary.reverse_each(&block)
+      end
+
+      private
+
+      def hangul_word_set
+        @@hangul_word_set ||= KoreanBreakEngine.word_set
+      end
+
+      def is_katakana?(codepoint)
+        (codepoint >= 0x30A1 && codepoint <= 0x30FE && codepoint != 0x30FB) ||
+          (codepoint >= 0xFF66 && codepoint <= 0xFF9F)
+      end
+
+      def get_katakana_cost(word_length)
+        if word_length > MAX_KATAKANA_LENGTH
+          MAX_KATAKANA_COST
+        else
+          KATAKANA_COSTS[word_length]
+        end
+      end
+
+      def dictionary
+        @dictionary ||= Dictionary.cj
+      end
+
+    end
+  end
+end
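
divide_up_dictionary_range above is a shortest-path (Viterbi-style) dynamic program: best_snlp[i] holds the cheapest cost of segmenting the first i characters, each dictionary match relaxes an edge, and prev is walked backwards to recover the boundaries. A self-contained sketch of the same DP with made-up word costs (the gem reads real costs from cjdict.dump):

    DICT = { 'ab' => 2, 'abc' => 6, 'bc' => 4, 'a' => 6, 'c' => 3 }
    INF = Float::INFINITY

    def segment(str)
      n = str.length
      best = Array.new(n + 1, INF) # best[i]: cheapest cost for first i chars
      prev = Array.new(n + 1, -1)  # prev[i]: where the word ending at i begins
      best[0] = 0

      n.times do |i|
        next if best[i] == INF # position i is unreachable

        DICT.each do |word, cost|
          j = i + word.length
          next unless j <= n && str[i, word.length] == word

          if best[i] + cost < best[j] # relax the edge i -> j
            best[j] = best[i] + cost
            prev[j] = i
          end
        end
      end

      # walk prev backwards to recover the chosen boundaries
      cuts = []
      i = n
      while i > 0
        cuts << i
        i = prev[i]
      end
      cuts.reverse
    end

    segment('abc') # => [2, 3]: "ab" + "c" (cost 5) beats "abc" (cost 6)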