twitter_cldr 5.2.0 → 5.3.0
- checksums.yaml +4 -4
- data/Gemfile +0 -4
- data/Rakefile +19 -8
- data/lib/twitter_cldr/normalization.rb +18 -5
- data/lib/twitter_cldr/resources.rb +3 -1
- data/lib/twitter_cldr/resources/import_resolver.rb +11 -3
- data/lib/twitter_cldr/resources/loader.rb +22 -1
- data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -9
- data/lib/twitter_cldr/resources/postal_codes_importer.rb +19 -23
- data/lib/twitter_cldr/resources/segment_dictionaries_importer.rb +75 -0
- data/lib/twitter_cldr/resources/segment_tests_importer.rb +130 -13
- data/lib/twitter_cldr/segmentation.rb +25 -10
- data/lib/twitter_cldr/segmentation/brahmic_break_engine.rb +200 -0
- data/lib/twitter_cldr/segmentation/break_iterator.rb +22 -22
- data/lib/twitter_cldr/segmentation/burmese_break_engine.rb +83 -0
- data/lib/twitter_cldr/segmentation/category_table.rb +5 -1
- data/lib/twitter_cldr/segmentation/cj_break_engine.rb +163 -0
- data/lib/twitter_cldr/segmentation/cursor.rb +1 -1
- data/lib/twitter_cldr/segmentation/dictionary.rb +84 -0
- data/lib/twitter_cldr/segmentation/dictionary_break_engine.rb +34 -0
- data/lib/twitter_cldr/segmentation/khmer_break_engine.rb +83 -0
- data/lib/twitter_cldr/segmentation/korean_break_engine.rb +30 -0
- data/lib/twitter_cldr/segmentation/lao_break_engine.rb +85 -0
- data/lib/twitter_cldr/segmentation/line_iterator.rb +23 -0
- data/lib/twitter_cldr/segmentation/possible_word.rb +74 -0
- data/lib/twitter_cldr/segmentation/possible_word_list.rb +23 -0
- data/lib/twitter_cldr/segmentation/rule_set.rb +3 -12
- data/lib/twitter_cldr/segmentation/segment_iterator.rb +40 -0
- data/lib/twitter_cldr/segmentation/state_machine.rb +2 -8
- data/lib/twitter_cldr/segmentation/thai_break_engine.rb +141 -0
- data/lib/twitter_cldr/segmentation/unhandled_break_engine.rb +21 -0
- data/lib/twitter_cldr/segmentation/word_iterator.rb +170 -0
- data/lib/twitter_cldr/shared.rb +1 -0
- data/lib/twitter_cldr/shared/caser.rb +3 -3
- data/lib/twitter_cldr/shared/unicode_set.rb +77 -0
- data/lib/twitter_cldr/utils/range_set.rb +10 -1
- data/lib/twitter_cldr/version.rb +1 -1
- data/resources/collation/tailoring/km.yml +82 -0
- data/resources/collation/tailoring/lo.yml +4 -0
- data/resources/collation/tailoring/my.yml +940 -0
- data/resources/collation/tries/km.dump +0 -0
- data/resources/collation/tries/lo.dump +0 -0
- data/resources/collation/tries/my.dump +0 -0
- data/resources/locales/km/calendars.yml +373 -0
- data/resources/locales/km/currencies.yml +654 -0
- data/resources/locales/km/day_periods.yml +96 -0
- data/resources/locales/km/fields.yml +495 -0
- data/resources/locales/km/languages.yml +397 -0
- data/resources/locales/km/layout.yml +5 -0
- data/resources/locales/km/lists.yml +37 -0
- data/resources/locales/km/numbers.yml +402 -0
- data/resources/locales/km/plural_rules.yml +6 -0
- data/resources/locales/km/plurals.yml +12 -0
- data/resources/locales/km/rbnf.yml +131 -0
- data/resources/locales/km/territories.yml +267 -0
- data/resources/locales/km/timezones.yml +1471 -0
- data/resources/locales/km/units.yml +721 -0
- data/resources/locales/lo/calendars.yml +368 -0
- data/resources/locales/lo/currencies.yml +918 -0
- data/resources/locales/lo/day_periods.yml +96 -0
- data/resources/locales/lo/fields.yml +437 -0
- data/resources/locales/lo/languages.yml +529 -0
- data/resources/locales/lo/layout.yml +5 -0
- data/resources/locales/lo/lists.yml +42 -0
- data/resources/locales/lo/numbers.yml +476 -0
- data/resources/locales/lo/plural_rules.yml +7 -0
- data/resources/locales/lo/plurals.yml +14 -0
- data/resources/locales/lo/rbnf.yml +119 -0
- data/resources/locales/lo/territories.yml +265 -0
- data/resources/locales/lo/timezones.yml +1513 -0
- data/resources/locales/lo/units.yml +750 -0
- data/resources/locales/my/calendars.yml +374 -0
- data/resources/locales/my/currencies.yml +697 -0
- data/resources/locales/my/day_periods.yml +96 -0
- data/resources/locales/my/fields.yml +459 -0
- data/resources/locales/my/languages.yml +420 -0
- data/resources/locales/my/layout.yml +5 -0
- data/resources/locales/my/lists.yml +43 -0
- data/resources/locales/my/numbers.yml +417 -0
- data/resources/locales/my/plural_rules.yml +6 -0
- data/resources/locales/my/plurals.yml +12 -0
- data/resources/locales/my/rbnf.yml +145 -0
- data/resources/locales/my/territories.yml +265 -0
- data/resources/locales/my/timezones.yml +1479 -0
- data/resources/locales/my/units.yml +759 -0
- data/resources/locales/th/plurals.yml +1 -1
- data/resources/shared/segments/dictionaries/burmesedict.dump +0 -0
- data/resources/shared/segments/dictionaries/cjdict.dump +0 -0
- data/resources/shared/segments/dictionaries/khmerdict.dump +0 -0
- data/resources/shared/segments/dictionaries/laodict.dump +0 -0
- data/resources/shared/segments/dictionaries/thaidict.dump +0 -0
- data/resources/shared/segments/tests/dictionary_tests/combined.yml +1253 -0
- data/resources/shared/segments/tests/dictionary_tests/km.yml +204 -0
- data/resources/shared/segments/tests/dictionary_tests/ko.yml +171 -0
- data/resources/shared/segments/tests/dictionary_tests/lo.yml +236 -0
- data/resources/shared/segments/tests/dictionary_tests/my.yml +249 -0
- data/resources/shared/segments/tests/dictionary_tests/th.yml +201 -0
- data/resources/shared/segments/tests/dictionary_tests/zh.yml +206 -0
- data/resources/shared/segments/tests/line_break_test.yml +68 -68
- data/resources/shared/segments/tests/sentence_break_test.yml +52 -52
- data/resources/supported_locales.yml +3 -0
- data/spec/formatters/numbers/rbnf/locales/km/rbnf_test.yml +706 -0
- data/spec/formatters/numbers/rbnf/locales/lo/rbnf_test.yml +706 -0
- data/spec/formatters/numbers/rbnf/locales/my/rbnf_test.yml +706 -0
- data/spec/segmentation/dictionary_break_spec.rb +42 -0
- data/spec/segmentation/rule_set_spec.rb +3 -1
- data/spec/timezones/tests/km.yml +12475 -0
- data/spec/timezones/tests/lo.yml +12475 -0
- data/spec/timezones/tests/my.yml +12475 -0
- metadata +87 -3
--- a/data/lib/twitter_cldr/segmentation.rb
+++ b/data/lib/twitter_cldr/segmentation.rb
@@ -5,15 +5,30 @@
 
 module TwitterCldr
   module Segmentation
-    autoload :BreakIterator, 'twitter_cldr/segmentation/break_iterator'
-    autoload :CategoryTable, 'twitter_cldr/segmentation/category_table'
-    autoload :Cursor, 'twitter_cldr/segmentation/cursor'
-    autoload :Metadata, 'twitter_cldr/segmentation/metadata'
-    autoload :NullSuppressions, 'twitter_cldr/segmentation/null_suppressions'
-    autoload :RuleSet, 'twitter_cldr/segmentation/rule_set'
-    autoload :StateMachine, 'twitter_cldr/segmentation/state_machine'
-    autoload :StateTable, 'twitter_cldr/segmentation/state_table'
-    autoload :StatusTable, 'twitter_cldr/segmentation/status_table'
-    autoload :Suppressions, 'twitter_cldr/segmentation/suppressions'
+    autoload :BrahmicBreakEngine, 'twitter_cldr/segmentation/brahmic_break_engine'
+    autoload :BreakIterator, 'twitter_cldr/segmentation/break_iterator'
+    autoload :BurmeseBreakEngine, 'twitter_cldr/segmentation/burmese_break_engine'
+    autoload :CategoryTable, 'twitter_cldr/segmentation/category_table'
+    autoload :CjBreakEngine, 'twitter_cldr/segmentation/cj_break_engine'
+    autoload :Cursor, 'twitter_cldr/segmentation/cursor'
+    autoload :Dictionary, 'twitter_cldr/segmentation/dictionary'
+    autoload :DictionaryBreakEngine, 'twitter_cldr/segmentation/dictionary_break_engine'
+    autoload :KhmerBreakEngine, 'twitter_cldr/segmentation/khmer_break_engine'
+    autoload :KoreanBreakEngine, 'twitter_cldr/segmentation/korean_break_engine'
+    autoload :LaoBreakEngine, 'twitter_cldr/segmentation/lao_break_engine'
+    autoload :LineIterator, 'twitter_cldr/segmentation/line_iterator'
+    autoload :Metadata, 'twitter_cldr/segmentation/metadata'
+    autoload :NullSuppressions, 'twitter_cldr/segmentation/null_suppressions'
+    autoload :PossibleWord, 'twitter_cldr/segmentation/possible_word'
+    autoload :PossibleWordList, 'twitter_cldr/segmentation/possible_word_list'
+    autoload :RuleSet, 'twitter_cldr/segmentation/rule_set'
+    autoload :SegmentIterator, 'twitter_cldr/segmentation/segment_iterator'
+    autoload :StateMachine, 'twitter_cldr/segmentation/state_machine'
+    autoload :StateTable, 'twitter_cldr/segmentation/state_table'
+    autoload :StatusTable, 'twitter_cldr/segmentation/status_table'
+    autoload :Suppressions, 'twitter_cldr/segmentation/suppressions'
+    autoload :ThaiBreakEngine, 'twitter_cldr/segmentation/thai_break_engine'
+    autoload :UnhandledBreakEngine, 'twitter_cldr/segmentation/unhandled_break_engine'
+    autoload :WordIterator, 'twitter_cldr/segmentation/word_iterator'
   end
 end
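
Worth noting for reviewers: autoload keeps this growing list cheap, because Ruby only records a constant-to-file mapping and defers the require until the constant is first referenced, so the fifteen new segmentation files above cost nothing at load time unless a locale actually needs them. A minimal standalone sketch of the mechanism (the Demo/Foo names and temp file are invented for illustration):

require 'tmpdir'

Dir.mktmpdir do |dir|
  path = File.join(dir, 'foo.rb')
  File.write(path, "module Demo\n  class Foo; end\nend\n")

  Demo = Module.new
  Demo.autoload :Foo, path  # records the mapping; foo.rb is not loaded yet

  p Demo.autoload?(:Foo)    # => path (require still pending)
  Demo::Foo                 # first reference triggers the require
  p Demo.autoload?(:Foo)    # => nil (resolved)
end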
--- /dev/null
+++ b/data/lib/twitter_cldr/segmentation/brahmic_break_engine.rb
@@ -0,0 +1,200 @@
+# encoding: UTF-8
+
+# Copyright 2012 Twitter, Inc
+# http://www.apache.org/licenses/LICENSE-2.0
+
+module TwitterCldr
+  module Segmentation
+    # Base class break engine for languages derived from the Brahmic script,
+    # i.e. Lao, Thai, Khmer, and Burmese.
+    #
+    # This class is based on duplicated code found in ICU's BurmeseBreakEngine
+    # and friends, which all make use of the same break logic.
+    class BrahmicBreakEngine < DictionaryBreakEngine
+
+      # ICU keeps track of all these variables inline, but since we've done a
+      # bit of method separating (see below), it's too ugly to pass all of
+      # them around as arguments. Instead we encapsulate them all in this
+      # handy state object.
+      class EngineState
+        attr_accessor :current
+        attr_reader :words
+        attr_accessor :words_found, :word_length
+
+        def initialize(options = {})
+          @current = options.fetch(:current, 0)
+          @words = options.fetch(:words)
+          @words_found = options.fetch(:words_found, 0)
+          @word_length = options.fetch(:word_length, 0)
+        end
+      end
+
+      attr_reader :lookahead, :root_combine_threshold
+      attr_reader :prefix_combine_threshold, :min_word, :min_word_span
+      attr_reader :word_set, :mark_set, :end_word_set, :begin_word_set
+      attr_reader :dictionary, :advance_past_suffix
+
+      def initialize(options = {})
+        @lookahead = options.fetch(:lookahead)
+        @root_combine_threshold = options.fetch(:root_combine_threshold)
+        @prefix_combine_threshold = options.fetch(:prefix_combine_threshold)
+        @min_word = options.fetch(:min_word)
+        @min_word_span = options.fetch(:min_word_span)
+
+        @word_set = options.fetch(:word_set)
+        @mark_set = options.fetch(:mark_set)
+        @end_word_set = options.fetch(:end_word_set)
+        @begin_word_set = options.fetch(:begin_word_set)
+
+        @dictionary = options.fetch(:dictionary)
+        @advance_past_suffix = options.fetch(:advance_past_suffix)
+      end
+
+      private
+
+      # See: https://github.com/unicode-org/icu/blob/release-65-1/icu4j/main/classes/core/src/com/ibm/icu/text/BurmeseBreakEngine.java#L88
+      def divide_up_dictionary_range(cursor, end_pos)
+        return to_enum(__method__, cursor, end_pos) unless block_given?
+        return if (end_pos - cursor.position) < min_word_span
+
+        state = EngineState.new(
+          cursor: cursor,
+          end_pos: end_pos,
+          words: PossibleWordList.new(lookahead)
+        )
+
+        while cursor.position < end_pos
+          state.current = cursor.position
+          state.word_length = 0
+
+          # look for candidate words at the current position
+          candidates = state.words[state.words_found].candidates(
+            cursor, dictionary, end_pos
+          )
+
+          # if we found exactly one, use that
+          if candidates == 1
+            state.word_length = state.words[state.words_found].accept_marked(cursor)
+            state.words_found += 1
+          elsif candidates > 1
+            mark_best_candidate(cursor, end_pos, state)
+            state.word_length = state.words[state.words_found].accept_marked(cursor)
+            state.words_found += 1
+          end
+
+          # We come here after having either found a word or not. We look ahead to the
+          # next word. If it's not a dictionary word, we will combine it with the word we
+          # just found (if there is one), but only if the preceding word does not exceed
+          # the threshold. The cursor should now be positioned at the end of the word we
+          # found.
+          if cursor.position < end_pos && state.word_length < root_combine_threshold
+            # If it is a dictionary word, do nothing. If it isn't, then if there is
+            # no preceding word, or the non-word shares less than the minimum threshold
+            # of characters with a dictionary word, then scan to resynchronize.
+            preceding_words = state.words[state.words_found].candidates(
+              cursor, dictionary, end_pos
+            )
+
+            if preceding_words <= 0 && (state.word_length == 0 || state.words[state.words_found].longest_prefix < prefix_combine_threshold)
+              advance_to_plausible_word_boundary(cursor, end_pos, state)
+            else
+              # backup to where we were for next iteration
+              cursor.position = state.current + state.word_length
+            end
+          end
+
+          # never stop before a combining mark.
+          while cursor.position < end_pos && mark_set.include?(cursor.codepoint)
+            cursor.advance
+            state.word_length += 1
+          end
+
+          # Look ahead for possible suffixes if a dictionary word does not follow.
+          # We do this in code rather than using a rule so that the heuristic
+          # resynch continues to function. For example, one of the suffix characters
+          # could be a typo in the middle of a word.
+          state.word_length += advance_past_suffix.call(
+            cursor, end_pos, state
+          )
+
+          # Did we find a word on this iteration? If so, yield it as a boundary.
+          if state.word_length > 0
+            yield state.current + state.word_length
+          end
+        end
+      end
+
+      private
+
+      # In ICU, this method is part of divide_up_dictionary_range. Extracted here
+      # for readability.
+      def advance_to_plausible_word_boundary(cursor, end_pos, state)
+        remaining = end_pos - (state.current + state.word_length)
+        pc = cursor.codepoint
+        chars = 0
+
+        loop do
+          cursor.advance
+          uc = cursor.codepoint
+          chars += 1
+          remaining -= 1
+
+          break if remaining <= 0
+
+          if end_word_set.include?(pc) && begin_word_set.include?(uc)
+            # Maybe. See if it's in the dictionary.
+            candidate = state.words[state.words_found + 1].candidates(cursor, dictionary, end_pos)
+            cursor.position = state.current + state.word_length + chars
+            break if candidate > 0
+          end
+
+          pc = uc
+        end
+
+        # bump the word count if there wasn't already one
+        state.words_found += 1 if state.word_length <= 0
+
+        # update the length with the passed-over characters
+        state.word_length += chars
+      end
+
+      def mark_best_candidate(cursor, end_pos, state)
+        # if there was more than one, see which one can take us forward the most words
+        found_best = false
+
+        # if we're already at the end of the range, we're done
+        if cursor.position < end_pos
+          loop do
+            words_matched = 1
+
+            if state.words[state.words_found + 1].candidates(cursor, dictionary, end_pos) > 0
+              if words_matched < 2
+                # followed by another dictionary word; mark first word as a good candidate
+                state.words[state.words_found].mark_current
+                words_matched = 2
+              end
+
+              # if we're already at the end of the range, we're done
+              break if cursor.position >= end_pos
+
+              # see if any of the possible second words is followed by a third word
+              loop do
+                # if we find a third word, stop right away
+                if state.words[state.words_found + 2].candidates(cursor, dictionary, end_pos) > 0
+                  state.words[state.words_found].mark_current
+                  found_best = true
+                  break
+                end
+
+                break unless state.words[state.words_found + 1].back_up(cursor)
+              end
+            end
+
+            break unless state.words[state.words_found].back_up(cursor) && !found_best
+          end
+        end
+      end
+
+    end
+  end
+end
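
The "return to_enum(__method__, cursor, end_pos) unless block_given?" guard at the top of divide_up_dictionary_range is a standard Ruby idiom: called with a block, the method yields boundaries directly; called without one, it returns a lazy Enumerator over the same invocation. A tiny self-contained illustration (names invented):

def each_doubled(limit)
  return to_enum(__method__, limit) unless block_given?
  (1..limit).each { |i| yield i * 2 }
end

each_doubled(3) { |n| p n }  # prints 2, 4, 6
p each_doubled(3).to_a       # => [2, 4, 6]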
--- a/data/lib/twitter_cldr/segmentation/break_iterator.rb
+++ b/data/lib/twitter_cldr/segmentation/break_iterator.rb
@@ -6,7 +6,6 @@
 module TwitterCldr
   module Segmentation
     class BreakIterator
-
       attr_reader :locale, :options
 
       def initialize(locale = TwitterCldr.locale, options = {})
@@ -15,43 +14,44 @@ module TwitterCldr
       end
 
       def each_sentence(str, &block)
-
-
+        iter = iterator_for('sentence')
+        iter.each_segment(str, &block)
       end
 
       def each_word(str, &block)
-
-
+        iter = iterator_for('word')
+        iter.each_segment(str, &block)
      end
 
       def each_grapheme_cluster(str, &block)
-
-
+        iter = iterator_for('grapheme')
+        iter.each_segment(str, &block)
       end
 
       def each_line(str, &block)
-
-
+        iter = iterator_for('line')
+        iter.each_segment(str, &block)
       end
 
       private
 
-      def
-
-
-
-
+      def iterator_for(boundary_type)
+        iterator_cache[boundary_type] ||= begin
+          rule_set = RuleSet.create(locale, boundary_type, options)
+
+          case boundary_type
+          when 'line'
+            LineIterator.new(rule_set)
+          when 'word'
+            WordIterator.new(rule_set)
+          else
+            SegmentIterator.new(rule_set)
+          end
         end
       end
 
-      def
-
-        locale, boundary_type, options
-      )
-      end
-
-      def rule_set_cache
-        @rule_set_cache ||= {}
+      def iterator_cache
+        @iterator_cache ||= {}
       end
     end
   end
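
For context, this is roughly how the reworked BreakIterator is driven. A sketch based only on the methods visible in this hunk; the Thai sample string is arbitrary, and each_word is assumed to yield each segmented substring:

require 'twitter_cldr'

# Thai word boundaries now go through the dictionary-backed WordIterator
# rather than the rule-based path alone.
iterator = TwitterCldr::Segmentation::BreakIterator.new(:th)

iterator.each_word('ภาษาไทย') do |word|
  puts word
end

# A second call with the same boundary type reuses the cached
# WordIterator (see iterator_cache above).
iterator.each_word('ภาษาไทย') { |word| puts word }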
--- /dev/null
+++ b/data/lib/twitter_cldr/segmentation/burmese_break_engine.rb
@@ -0,0 +1,83 @@
+# encoding: UTF-8
+
+# Copyright 2012 Twitter, Inc
+# http://www.apache.org/licenses/LICENSE-2.0
+
+require 'singleton'
+require 'forwardable'
+
+module TwitterCldr
+  module Segmentation
+
+    # See: https://github.com/unicode-org/icu/blob/release-65-1/icu4j/main/classes/core/src/com/ibm/icu/text/BurmeseBreakEngine.java
+    class BurmeseBreakEngine
+
+      include Singleton
+      extend Forwardable
+
+      def_delegators :engine, :each_boundary
+
+      def self.word_set
+        @word_set ||= begin
+          uset = TwitterCldr::Shared::UnicodeSet.new
+          uset.apply_pattern('[[:Mymr:]&[:Line_Break=SA:]]')
+          uset.to_set
+        end
+      end
+
+      private
+
+      # All Brahmic scripts (including Burmese) can make use of the same break
+      # logic, so we use composition here and defer to the Brahmic break engine.
+      def engine
+        @engine ||= BrahmicBreakEngine.new(
+          # How many words in a row are "good enough"?
+          lookahead: 3,
+
+          # Will not combine a non-word with a preceding dictionary word longer than this
+          root_combine_threshold: 3,
+
+          # Will not combine a non-word that shares at least this much prefix with a
+          # dictionary word with a preceding word
+          prefix_combine_threshold: 3,
+
+          # Minimum word size
+          min_word: 2,
+
+          # Minimum number of characters for two words (same as min_word for Burmese)
+          min_word_span: 2,
+
+          word_set: self.class.word_set,
+          mark_set: mark_set,
+          end_word_set: end_word_set,
+          begin_word_set: begin_word_set,
+          dictionary: Dictionary.burmese,
+          advance_past_suffix: -> (*) do
+            0 # not applicable to Burmese
+          end
+        )
+      end
+
+      def mark_set
+        @mark_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
+          set.apply_pattern('[[:Mymr:]&[:Line_Break=SA:]&[:M:]]')
+          set.add(0x0020)
+        end
+      end
+
+      def end_word_set
+        @end_word_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
+          set.add_list(self.class.word_set)
+        end
+      end
+
+      def begin_word_set
+        @begin_word_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
+          # basic consonants and independent vowels
+          set.add_range(0x1000..0x102A)
+        end
+      end
+
+    end
+  end
+end
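
The character classes above are built with TwitterCldr::Shared::UnicodeSet, new in this release per the file list. A small sketch using only the calls that appear in this diff; the membership results are expectations based on the Unicode properties involved, not verified output:

require 'twitter_cldr'

# Build the Burmese word set the way BurmeseBreakEngine.word_set does:
# Myanmar-script characters that also carry Line_Break=SA ("complex
# context", i.e. scripts written without spaces between words).
uset = TwitterCldr::Shared::UnicodeSet.new
uset.apply_pattern('[[:Mymr:]&[:Line_Break=SA:]]')
word_set = uset.to_set

word_set.include?(0x1000)  # MYANMAR LETTER KA, expected true
word_set.include?(0x0041)  # LATIN CAPITAL LETTER A, expected false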
--- a/data/lib/twitter_cldr/segmentation/category_table.rb
+++ b/data/lib/twitter_cldr/segmentation/category_table.rb
@@ -45,12 +45,16 @@ module TwitterCldr
       private
 
       def find(codepoint)
-        values.bsearch do |entry|
+        cache[codepoint] ||= values.bsearch do |entry|
           next -1 if codepoint < entry[0]
           next 1 if codepoint > entry[1]
           0
         end
       end
+
+      def cache
+        @cache ||= {}
+      end
     end
   end
 end
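
The lookup above leans on Array#bsearch in find-any mode: the block returns a negative number when the target range lies to the left of the probed entry, a positive number when it lies to the right, and 0 on a hit, which lets a sorted table of disjoint codepoint ranges be probed in O(log n); the new per-codepoint cache then amortizes repeated probes to O(1). A standalone sketch with an invented range table:

# each entry is [first_codepoint, last_codepoint, category]
VALUES = [
  [0x0000, 0x0040, :other],
  [0x0041, 0x005A, :upper],
  [0x0061, 0x007A, :lower]
].freeze

def find(codepoint)
  VALUES.bsearch do |entry|
    next -1 if codepoint < entry[0] # target range lies to the left
    next 1 if codepoint > entry[1]  # target range lies to the right
    0                               # codepoint falls inside this range
  end
end

p find(0x42)  # => [65, 90, :upper]
p find(0x60)  # => nil (no covering range)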
--- /dev/null
+++ b/data/lib/twitter_cldr/segmentation/cj_break_engine.rb
@@ -0,0 +1,163 @@
+# encoding: UTF-8
+
+# Copyright 2012 Twitter, Inc
+# http://www.apache.org/licenses/LICENSE-2.0
+
+require 'singleton'
+
+module TwitterCldr
+  module Segmentation
+    class CjBreakEngine < DictionaryBreakEngine
+
+      include Singleton
+
+      # magic number pulled from ICU's source code, presumably slightly longer
+      # than the longest Chinese/Japanese/Korean word
+      MAX_WORD_SIZE = 20
+
+      # magic number pulled from ICU's source code
+      MAX_SNLP = 255
+
+      # the equivalent of Java's Integer.MAX_VALUE
+      LARGE_NUMBER = 0xFFFFFFFF
+
+      MAX_KATAKANA_LENGTH = 8
+      MAX_KATAKANA_GROUP_LENGTH = 20
+      KATAKANA_COSTS = [8192, 984, 408, 240, 204, 252, 300, 372, 480].freeze
+      MAX_KATAKANA_COST = 8192
+
+      def self.word_set
+        @word_set ||= begin
+          uset = TwitterCldr::Shared::UnicodeSet.new
+          uset.apply_pattern('[:Han:]')
+          uset.apply_pattern('[[:Katakana:]\uff9e\uff9f]')
+          uset.apply_pattern('[:Hiragana:]')
+          uset.add(0xFF70) # HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
+          uset.add(0x30FC) # KATAKANA-HIRAGANA PROLONGED SOUND MARK
+          uset.to_set
+        end
+      end
+
+      private
+
+      def word_set
+        self.class.word_set
+      end
+
+      def divide_up_dictionary_range(cursor, end_pos, &block)
+        return to_enum(__method__, cursor, end_pos) unless block_given?
+
+        input_length = end_pos - cursor.position
+        best_snlp = Array.new(input_length + 1) { LARGE_NUMBER }
+        prev = Array.new(input_length + 1) { -1 }
+
+        best_snlp[0] = 0
+        start_pos = cursor.position
+        is_prev_katakana = false
+
+        until cursor.position >= end_pos
+          idx = cursor.position - start_pos
+
+          if best_snlp[idx] == LARGE_NUMBER
+            cursor.advance
+            next
+          end
+
+          max_search_length = if cursor.position + MAX_WORD_SIZE < end_pos
+            MAX_WORD_SIZE
+          else
+            end_pos - cursor.position
+          end
+
+          count, values, lengths, _ = dictionary.matches(
+            cursor, max_search_length, max_search_length
+          )
+
+          if (count == 0 || lengths[0] != 1) && !hangul_word_set.include?(cursor.codepoint)
+            values[count] = MAX_SNLP
+            lengths[count] = 1
+            count += 1
+          end
+
+          count.times do |j|
+            new_snlp = best_snlp[idx] + values[j]
+
+            if new_snlp < best_snlp[lengths[j] + idx]
+              best_snlp[lengths[j] + idx] = new_snlp
+              prev[lengths[j] + idx] = idx
+            end
+          end
+
+          # In Japanese, single-character Katakana words are pretty rare.
+          # Accordingly, we apply the following heuristic: any continuous
+          # run of Katakana characters is considered a candidate word with
+          # a default cost specified in the katakanaCost table according
+          # to its length.
+          is_katakana = is_katakana?(cursor.codepoint)
+
+          if !is_prev_katakana && is_katakana
+            j = cursor.position + 1
+            cursor.advance
+
+            while j < end_pos && (j - idx) < MAX_KATAKANA_GROUP_LENGTH && is_katakana?(cursor.codepoint)
+              cursor.advance
+              j += 1
+            end
+
+            if (j - idx) < MAX_KATAKANA_GROUP_LENGTH
+              new_snlp = best_snlp[idx] + get_katakana_cost(j - idx)
+
+              if new_snlp < best_snlp[j]
+                best_snlp[j] = new_snlp
+                prev[j] = idx
+              end
+            end
+          end
+
+          is_prev_katakana = is_katakana
+
+          cursor.advance
+        end
+
+        t_boundary = []
+
+        if best_snlp[input_length] == LARGE_NUMBER
+          t_boundary << end_pos
+        else
+          idx = end_pos - start_pos
+
+          while idx > 0
+            t_boundary << idx + start_pos
+            idx = prev[idx]
+          end
+        end
+
+        t_boundary.reverse_each(&block)
+      end
+
+      private
+
+      def hangul_word_set
+        @@hangul_word_set ||= KoreanBreakEngine.word_set
+      end
+
+      def is_katakana?(codepoint)
+        (codepoint >= 0x30A1 && codepoint <= 0x30FE && codepoint != 0x30FB) ||
+          (codepoint >= 0xFF66 && codepoint <= 0xFF9F)
+      end
+
+      def get_katakana_cost(word_length)
+        if word_length > MAX_KATAKANA_LENGTH
+          MAX_KATAKANA_COST
+        else
+          KATAKANA_COSTS[word_length]
+        end
+      end
+
+      def dictionary
+        @dictionary ||= Dictionary.cj
+      end
+
+    end
+  end
+end
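
The core of divide_up_dictionary_range above is a shortest-path dynamic program: best_snlp[i] is the cheapest cost of segmenting the first i characters, prev holds back-pointers, and the boundaries fall out by walking prev back from the end. A compact sketch of the same idea over plain strings (the toy dictionary and costs are invented; the real engine reads both from cjdict.dump):

def best_segmentation(text, dict)
  infinity = Float::INFINITY
  best = Array.new(text.length + 1, infinity)
  prev = Array.new(text.length + 1, -1)
  best[0] = 0

  text.length.times do |i|
    next if best[i] == infinity # position unreachable so far

    dict.each do |word, cost|
      next unless text[i, word.length] == word

      j = i + word.length
      if best[i] + cost < best[j]
        best[j] = best[i] + cost # cheaper way to segment the first j chars
        prev[j] = i
      end
    end
  end

  # walk the back-pointers to recover boundary offsets
  bounds = []
  idx = text.length
  while idx > 0
    bounds << idx
    idx = prev[idx]
  end

  bounds.reverse
end

p best_segmentation('foobar', 'foo' => 10, 'foob' => 30, 'bar' => 12, 'ar' => 25)
# => [3, 6], i.e. "foo" + "bar" (cost 22) beats "foob" + "ar" (cost 55)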