twitter_cldr 5.2.0 → 5.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (110) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +0 -4
  3. data/Rakefile +19 -8
  4. data/lib/twitter_cldr/normalization.rb +18 -5
  5. data/lib/twitter_cldr/resources.rb +3 -1
  6. data/lib/twitter_cldr/resources/import_resolver.rb +11 -3
  7. data/lib/twitter_cldr/resources/loader.rb +22 -1
  8. data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -9
  9. data/lib/twitter_cldr/resources/postal_codes_importer.rb +19 -23
  10. data/lib/twitter_cldr/resources/segment_dictionaries_importer.rb +75 -0
  11. data/lib/twitter_cldr/resources/segment_tests_importer.rb +130 -13
  12. data/lib/twitter_cldr/segmentation.rb +25 -10
  13. data/lib/twitter_cldr/segmentation/brahmic_break_engine.rb +200 -0
  14. data/lib/twitter_cldr/segmentation/break_iterator.rb +22 -22
  15. data/lib/twitter_cldr/segmentation/burmese_break_engine.rb +83 -0
  16. data/lib/twitter_cldr/segmentation/category_table.rb +5 -1
  17. data/lib/twitter_cldr/segmentation/cj_break_engine.rb +163 -0
  18. data/lib/twitter_cldr/segmentation/cursor.rb +1 -1
  19. data/lib/twitter_cldr/segmentation/dictionary.rb +84 -0
  20. data/lib/twitter_cldr/segmentation/dictionary_break_engine.rb +34 -0
  21. data/lib/twitter_cldr/segmentation/khmer_break_engine.rb +83 -0
  22. data/lib/twitter_cldr/segmentation/korean_break_engine.rb +30 -0
  23. data/lib/twitter_cldr/segmentation/lao_break_engine.rb +85 -0
  24. data/lib/twitter_cldr/segmentation/line_iterator.rb +23 -0
  25. data/lib/twitter_cldr/segmentation/possible_word.rb +74 -0
  26. data/lib/twitter_cldr/segmentation/possible_word_list.rb +23 -0
  27. data/lib/twitter_cldr/segmentation/rule_set.rb +3 -12
  28. data/lib/twitter_cldr/segmentation/segment_iterator.rb +40 -0
  29. data/lib/twitter_cldr/segmentation/state_machine.rb +2 -8
  30. data/lib/twitter_cldr/segmentation/thai_break_engine.rb +141 -0
  31. data/lib/twitter_cldr/segmentation/unhandled_break_engine.rb +21 -0
  32. data/lib/twitter_cldr/segmentation/word_iterator.rb +170 -0
  33. data/lib/twitter_cldr/shared.rb +1 -0
  34. data/lib/twitter_cldr/shared/caser.rb +3 -3
  35. data/lib/twitter_cldr/shared/unicode_set.rb +77 -0
  36. data/lib/twitter_cldr/utils/range_set.rb +10 -1
  37. data/lib/twitter_cldr/version.rb +1 -1
  38. data/resources/collation/tailoring/km.yml +82 -0
  39. data/resources/collation/tailoring/lo.yml +4 -0
  40. data/resources/collation/tailoring/my.yml +940 -0
  41. data/resources/collation/tries/km.dump +0 -0
  42. data/resources/collation/tries/lo.dump +0 -0
  43. data/resources/collation/tries/my.dump +0 -0
  44. data/resources/locales/km/calendars.yml +373 -0
  45. data/resources/locales/km/currencies.yml +654 -0
  46. data/resources/locales/km/day_periods.yml +96 -0
  47. data/resources/locales/km/fields.yml +495 -0
  48. data/resources/locales/km/languages.yml +397 -0
  49. data/resources/locales/km/layout.yml +5 -0
  50. data/resources/locales/km/lists.yml +37 -0
  51. data/resources/locales/km/numbers.yml +402 -0
  52. data/resources/locales/km/plural_rules.yml +6 -0
  53. data/resources/locales/km/plurals.yml +12 -0
  54. data/resources/locales/km/rbnf.yml +131 -0
  55. data/resources/locales/km/territories.yml +267 -0
  56. data/resources/locales/km/timezones.yml +1471 -0
  57. data/resources/locales/km/units.yml +721 -0
  58. data/resources/locales/lo/calendars.yml +368 -0
  59. data/resources/locales/lo/currencies.yml +918 -0
  60. data/resources/locales/lo/day_periods.yml +96 -0
  61. data/resources/locales/lo/fields.yml +437 -0
  62. data/resources/locales/lo/languages.yml +529 -0
  63. data/resources/locales/lo/layout.yml +5 -0
  64. data/resources/locales/lo/lists.yml +42 -0
  65. data/resources/locales/lo/numbers.yml +476 -0
  66. data/resources/locales/lo/plural_rules.yml +7 -0
  67. data/resources/locales/lo/plurals.yml +14 -0
  68. data/resources/locales/lo/rbnf.yml +119 -0
  69. data/resources/locales/lo/territories.yml +265 -0
  70. data/resources/locales/lo/timezones.yml +1513 -0
  71. data/resources/locales/lo/units.yml +750 -0
  72. data/resources/locales/my/calendars.yml +374 -0
  73. data/resources/locales/my/currencies.yml +697 -0
  74. data/resources/locales/my/day_periods.yml +96 -0
  75. data/resources/locales/my/fields.yml +459 -0
  76. data/resources/locales/my/languages.yml +420 -0
  77. data/resources/locales/my/layout.yml +5 -0
  78. data/resources/locales/my/lists.yml +43 -0
  79. data/resources/locales/my/numbers.yml +417 -0
  80. data/resources/locales/my/plural_rules.yml +6 -0
  81. data/resources/locales/my/plurals.yml +12 -0
  82. data/resources/locales/my/rbnf.yml +145 -0
  83. data/resources/locales/my/territories.yml +265 -0
  84. data/resources/locales/my/timezones.yml +1479 -0
  85. data/resources/locales/my/units.yml +759 -0
  86. data/resources/locales/th/plurals.yml +1 -1
  87. data/resources/shared/segments/dictionaries/burmesedict.dump +0 -0
  88. data/resources/shared/segments/dictionaries/cjdict.dump +0 -0
  89. data/resources/shared/segments/dictionaries/khmerdict.dump +0 -0
  90. data/resources/shared/segments/dictionaries/laodict.dump +0 -0
  91. data/resources/shared/segments/dictionaries/thaidict.dump +0 -0
  92. data/resources/shared/segments/tests/dictionary_tests/combined.yml +1253 -0
  93. data/resources/shared/segments/tests/dictionary_tests/km.yml +204 -0
  94. data/resources/shared/segments/tests/dictionary_tests/ko.yml +171 -0
  95. data/resources/shared/segments/tests/dictionary_tests/lo.yml +236 -0
  96. data/resources/shared/segments/tests/dictionary_tests/my.yml +249 -0
  97. data/resources/shared/segments/tests/dictionary_tests/th.yml +201 -0
  98. data/resources/shared/segments/tests/dictionary_tests/zh.yml +206 -0
  99. data/resources/shared/segments/tests/line_break_test.yml +68 -68
  100. data/resources/shared/segments/tests/sentence_break_test.yml +52 -52
  101. data/resources/supported_locales.yml +3 -0
  102. data/spec/formatters/numbers/rbnf/locales/km/rbnf_test.yml +706 -0
  103. data/spec/formatters/numbers/rbnf/locales/lo/rbnf_test.yml +706 -0
  104. data/spec/formatters/numbers/rbnf/locales/my/rbnf_test.yml +706 -0
  105. data/spec/segmentation/dictionary_break_spec.rb +42 -0
  106. data/spec/segmentation/rule_set_spec.rb +3 -1
  107. data/spec/timezones/tests/km.yml +12475 -0
  108. data/spec/timezones/tests/lo.yml +12475 -0
  109. data/spec/timezones/tests/my.yml +12475 -0
  110. metadata +87 -3
@@ -26,19 +26,10 @@ module TwitterCldr
26
26
  )
27
27
  end
28
28
 
29
- def each_boundary(str)
30
- return to_enum(__method__, str) unless block_given?
29
+ def each_boundary(cursor, stop = cursor.length)
30
+ return to_enum(__method__, cursor, stop) unless block_given?
31
31
 
32
- cursor = Cursor.new(str)
33
-
34
- # Let the state machine find the first boundary for the line
35
- # boundary type. This helps pass nearly all the Unicode
36
- # segmentation tests, so it must be the right thing to do.
37
- # Normally the first boundary is the implicit start of text
38
- # boundary, but potentially not for the line rules?
39
- yield 0 unless state_machine.boundary_type == 'line'
40
-
41
- until cursor.eos?
32
+ until cursor.position >= stop || cursor.eos?
42
33
  state_machine.handle_next(cursor)
43
34
  yield cursor.position if suppressions.should_break?(cursor)
44
35
  end
@@ -0,0 +1,40 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ module TwitterCldr
7
+ module Segmentation
8
+ class SegmentIterator
9
+ attr_reader :rule_set
10
+
11
+ def initialize(rule_set)
12
+ @rule_set = rule_set
13
+ end
14
+
15
+ def each_segment(str)
16
+ return to_enum(__method__, str) unless block_given?
17
+
18
+ each_boundary(str).each_cons(2) do |start, stop|
19
+ yield str[start...stop], start, stop
20
+ end
21
+ end
22
+
23
+ def each_boundary(str, &block)
24
+ return to_enum(__method__, str) unless block_given?
25
+
26
+ # implicit start of text boundary
27
+ yield 0
28
+
29
+ cursor = create_cursor(str)
30
+ rule_set.each_boundary(cursor, &block)
31
+ end
32
+
33
+ private
34
+
35
+ def create_cursor(str)
36
+ Cursor.new(str)
37
+ end
38
+ end
39
+ end
40
+ end
@@ -70,7 +70,7 @@ module TwitterCldr
70
70
  def handle_next(cursor)
71
71
  result = initial_position = cursor.position
72
72
  state = START_STATE
73
- row = row_index_for(state)
73
+ row = state * (metadata.category_count + 4)
74
74
  category = 3
75
75
  mode = :run
76
76
 
@@ -97,7 +97,7 @@ module TwitterCldr
97
97
  end
98
98
 
99
99
  state = ftable[row + NEXT_STATES + category]
100
- row = row_index_for(state)
100
+ row = state * (metadata.category_count + 4)
101
101
 
102
102
  if ftable[row + ACCEPTING] == -1
103
103
  # match found
@@ -114,12 +114,6 @@ module TwitterCldr
114
114
 
115
115
  result
116
116
  end
117
-
118
- private
119
-
120
- def row_index_for(state)
121
- state * (metadata.category_count + 4)
122
- end
123
117
  end
124
118
  end
125
119
  end
@@ -0,0 +1,141 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'singleton'
7
+ require 'forwardable'
8
+
9
+ module TwitterCldr
10
+ module Segmentation
11
+
12
+ # See: https://github.com/unicode-org/icu/blob/release-65-1/icu4j/main/classes/core/src/com/ibm/icu/text/ThaiBreakEngine.java
13
+ class ThaiBreakEngine
14
+
15
+ include Singleton
16
+ extend Forwardable
17
+
18
+ def_delegators :engine, :each_boundary
19
+
20
+ def self.word_set
21
+ @word_set ||= begin
22
+ uset = TwitterCldr::Shared::UnicodeSet.new
23
+ uset.apply_pattern('[[:Thai:]&[:Line_Break=SA:]]')
24
+ uset.to_set
25
+ end
26
+ end
27
+
28
+ # elision character
29
+ THAI_PAIYANNOI = 0x0E2F
30
+
31
+ # repeat character
32
+ THAI_MAIYAMOK = 0x0E46
33
+
34
+ def each_boundary(*args, &block)
35
+ engine.each_boundary(*args, &block)
36
+ end
37
+
38
+ private
39
+
40
+ def engine
41
+ @engine ||= BrahmicBreakEngine.new(
42
+ # How many words in a row are "good enough"?
43
+ lookahead: 3,
44
+
45
+ # Will not combine a non-word with a preceding dictionary word longer than this
46
+ root_combine_threshold: 3,
47
+
48
+ # Will not combine a non-word that shares at least this much prefix with a
49
+ # dictionary word with a preceding word
50
+ prefix_combine_threshold: 3,
51
+
52
+ # Minimum word size
53
+ min_word: 2,
54
+
55
+ # Minimum number of characters for two words (min_word * 2)
56
+ min_word_span: 4,
57
+
58
+ word_set: self.class.word_set,
59
+ mark_set: mark_set,
60
+ end_word_set: end_word_set,
61
+ begin_word_set: begin_word_set,
62
+ dictionary: Dictionary.thai,
63
+ advance_past_suffix: -> (*args) do
64
+ advance_past_suffix(*args)
65
+ end
66
+ )
67
+ end
68
+
69
+ def advance_past_suffix(cursor, end_pos, state)
70
+ suffix_length = 0
71
+
72
+ if cursor.position < end_pos && state.word_length > 0
73
+ uc = cursor.codepoint
74
+
75
+ candidates = state.words[state.words_found].candidates(
76
+ cursor, engine.dictionary, end_pos
77
+ )
78
+
79
+ if candidates <= 0 && suffix_set.include?(uc)
80
+ if uc == THAI_PAIYANNOI
81
+ unless suffix_set.include?(cursor.previous)
82
+ # skip over previous end and PAIYANNOI
83
+ cursor.advance(2)
84
+ suffix_length += 1
85
+ uc = cursor.codepoint
86
+ else
87
+ # restore prior position
88
+ cursor.advance
89
+ end
90
+ end
91
+
92
+ if uc == THAI_MAIYAMOK
93
+ if cursor.previous != THAI_MAIYAMOK
94
+ # skip over previous end and MAIYAMOK
95
+ cursor.advance(2)
96
+ suffix_length += 1
97
+ else
98
+ # restore prior position
99
+ cursor.advance
100
+ end
101
+ end
102
+ else
103
+ cursor.position = state.current + state.word_length
104
+ end
105
+ end
106
+
107
+ suffix_length
108
+ end
109
+
110
+ def mark_set
111
+ @mark_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
112
+ set.apply_pattern('[[:Thai:]&[:Line_Break=SA:]&[:M:]]')
113
+ set.add(0x0020)
114
+ end
115
+ end
116
+
117
+ def end_word_set
118
+ @end_word_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
119
+ set.add_list(self.class.word_set)
120
+ set.subtract(0x0E31) # MAI HAN-AKAT
121
+ set.subtract_range(0x0E40..0x0E44) # SARA E through SARA AI MAIMALAI
122
+ end
123
+ end
124
+
125
+ def begin_word_set
126
+ @begin_word_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
127
+ set.add_range(0x0E01..0x0E2E) # KO KAI through HO NOKHUK
128
+ set.add_range(0x0E40..0x0E44) # SARA E through SARA AI MAIMALAI
129
+ end
130
+ end
131
+
132
+ def suffix_set
133
+ @suffix_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
134
+ set.add(THAI_PAIYANNOI)
135
+ set.add(THAI_MAIYAMOK)
136
+ end
137
+ end
138
+
139
+ end
140
+ end
141
+ end
@@ -0,0 +1,21 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'singleton'
7
+
8
+ module TwitterCldr
9
+ module Segmentation
10
+ class UnhandledBreakEngine
11
+
12
+ include Singleton
13
+
14
+ def each_boundary(cursor, &block)
15
+ return to_enum(__method__, cursor) unless block_given?
16
+ cursor.advance
17
+ end
18
+
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,170 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'set'
7
+
8
+ module TwitterCldr
9
+ module Segmentation
10
+ class DenormalizedStringError < StandardError; end
11
+
12
+ class WordIterator < SegmentIterator
13
+ DICTIONARY_BREAK_ENGINES = [
14
+ CjBreakEngine,
15
+ KoreanBreakEngine,
16
+ BurmeseBreakEngine,
17
+ KhmerBreakEngine,
18
+ LaoBreakEngine,
19
+ ThaiBreakEngine
20
+ ]
21
+
22
+ def each_boundary(str, &block)
23
+ return to_enum(__method__, str) unless block_given?
24
+
25
+ # Rather than put a bunch of duplicate logic in
26
+ # each_boundary_helper to make sure we don't yield the same
27
+ # boundary twice, we wrap it in this additional de-duping
28
+ # enumerator and call it a day.
29
+ last_boundary = nil
30
+
31
+ each_boundary_helper(str) do |boundary|
32
+ yield boundary if boundary != last_boundary
33
+ last_boundary = boundary
34
+ end
35
+ end
36
+
37
+ private
38
+
39
+ def each_boundary_helper(str, &block)
40
+ # Set up two independent cursors so the algorithm can iterate
41
+ # over those portions of the input string that require a
42
+ # dictionary-based break iterator independently from those that
43
+ # only need the normal, rule-based break iterator. Cursors
44
+ # hold references to the input text and a list of all the
45
+ # corresponding Unicode codepoints, meaning they are fairly
46
+ # expensive to create. The duplication below should only
47
+ # produce a shallow copy however. The text and codepoint list
48
+ # are not duplicated, but the cursor's integer position can
49
+ # be varied independently.
50
+ dict_cursor = create_cursor(str)
51
+ rule_cursor = dict_cursor.dup
52
+
53
+ # implicit start of text boundary
54
+ yield 0
55
+
56
+ until dict_cursor.eos? || rule_cursor.eos?
57
+ # We use a regex to identify the beginnings of potential runs
58
+ # of dictionary characters. This regex was benchmarked and
59
+ # found to be pretty fast, but could become a bottleneck if
60
+ # other parts of the algorithm are improved in the future.
61
+ m = dictionary_re.match(dict_cursor.text, dict_cursor.position)
62
+ break unless m
63
+
64
+ dict_cursor.position = m.begin(0)
65
+ dict_break_engine = dictionary_break_engine_for(dict_cursor.codepoint)
66
+
67
+ # It's possible to encounter a dictionary character that can't
68
+ # be handled by any of the dictionary-based break engines
69
+ # because it's too short to make up an actual word. The
70
+ # break engine will simply yield no breaks in such a case, which
71
+ # we test for below by peeking for the first boundary value and
72
+ # rescuing a StopIteration error. Since the run of dictionary
73
+ # characters may be arbitrarily long, peeking should be more
74
+ # performant than attempting to calculate all the boundary
75
+ # positions for the run at once.
76
+ #
77
+ # It should be noted that, despite our best efforts here in
78
+ # WordIterator, certain dictionary-based break engines (eg.
79
+ # CjBreakEngine) cannot yield word boundaries without first
80
+ # examining the entire run of dictionary characters. In practice
81
+ # this shouldn't be too big an issue, since Chinese text often
82
+ # contains punctuation that should limit the average run length.
83
+ dict_enum = dict_break_engine.each_boundary(dict_cursor)
84
+
85
+ dict_boundary = begin
86
+ dict_enum.peek
87
+ rescue StopIteration
88
+ nil
89
+ end
90
+
91
+ # If a dictionary boundary was found, attempt to use the rule-based
92
+ # break iterator to find breaks in the text immediately before it.
93
+ # Otherwise, since none of the dictionary-based break engines could
94
+ # find any boundaries in the current run, we advance the dictionary
95
+ # cursor in an attempt to find the next dictionary boundary. Doing
96
+ # so effectively causes the algorithm to fall back to the rule-based
97
+ # break engine.
98
+ if dict_boundary
99
+ # Only use the rule-based break engine if there are characters to
100
+ # process.
101
+ if rule_cursor.position < m.begin(0)
102
+ rule_set.each_boundary(rule_cursor, m.begin(0), &block)
103
+ end
104
+
105
+ # Yield all the dictionary breaks from the enum. We can't use .each
106
+ # here because that will restart the iteration. Ruby's loop
107
+ # construct automatically rescues StopIteration.
108
+ loop do
109
+ yield dict_enum.next
110
+ end
111
+
112
+ # We've reached the end of a dictionary character run, so yield
113
+ # the end of text boundary.
114
+ yield dict_cursor.position
115
+
116
+ # These should be the same after a successful dictionary run, i.e.
117
+ # they should both be positioned at the end of the current rule-based
118
+ # and dictionary-based portions of the run, ready for the next one.
119
+ rule_cursor.position = dict_cursor.position
120
+ else
121
+ dict_cursor.advance
122
+ end
123
+ end
124
+
125
+ # Find boundaries in the straggler, non-dictionary run at the end of
126
+ # the input text.
127
+ unless rule_cursor.eos?
128
+ rule_set.each_boundary(rule_cursor, &block)
129
+ end
130
+
131
+ # implicit end of text boundary
132
+ yield rule_cursor.length
133
+ end
134
+
135
+ # all dictionary characters, i.e. characters that must be handled
136
+ # by one of the dictionary-based break engines
137
+ def dictionary_set
138
+ @@dictionary_set ||= Set.new.tap do |set|
139
+ DICTIONARY_BREAK_ENGINES.each do |break_engine|
140
+ set.merge(break_engine.word_set)
141
+ end
142
+ end
143
+ end
144
+
145
+ def dictionary_break_engine_for(codepoint)
146
+ codepoint_to_engine_cache[codepoint] ||= begin
147
+ engine = DICTIONARY_BREAK_ENGINES.find do |break_engine|
148
+ break_engine.word_set.include?(codepoint)
149
+ end
150
+
151
+ (engine || UnhandledBreakEngine).instance
152
+ end
153
+ end
154
+
155
+ def dictionary_re
156
+ @@dictionary_re ||= begin
157
+ ranges = TwitterCldr::Utils::RangeSet.from_array(dictionary_set).ranges.map do |r|
158
+ "\\u{#{r.first.to_s(16)}}-\\u{#{r.last.to_s(16)}}"
159
+ end
160
+
161
+ /[#{ranges.join}]/
162
+ end
163
+ end
164
+
165
+ def codepoint_to_engine_cache
166
+ @@codepoint_to_engine_cache ||= {}
167
+ end
168
+ end
169
+ end
170
+ end
@@ -32,6 +32,7 @@ module TwitterCldr
32
32
  autoload :TerritoriesContainment, 'twitter_cldr/shared/territories_containment'
33
33
  autoload :Territory, 'twitter_cldr/shared/territory'
34
34
  autoload :UnicodeRegex, 'twitter_cldr/shared/unicode_regex'
35
+ autoload :UnicodeSet, 'twitter_cldr/shared/unicode_set'
35
36
  autoload :Unit, 'twitter_cldr/shared/unit'
36
37
  end
37
38
  end
@@ -25,7 +25,7 @@ module TwitterCldr
25
25
  # and the following word boundary to Lowercase_Mapping(C).
26
26
  def titlecase(string)
27
27
  string.dup.tap do |result|
28
- boundary_rule_set.each_boundary(result).each_cons(2) do |boundary_pair|
28
+ word_iterator.each_word(result) do |_, *boundary_pair|
29
29
  if cased_pos = first_cased(string, *boundary_pair)
30
30
  result[cased_pos] = titlecasing_hash[result[cased_pos]]
31
31
 
@@ -47,8 +47,8 @@ module TwitterCldr
47
47
  end
48
48
  end
49
49
 
50
- def boundary_rule_set
51
- @boundary_rule_set ||= Segmentation::RuleSet.create(:en, 'word')
50
+ def word_iterator
51
+ @word_iterator ||= Segmentation::BreakIterator.new(:en)
52
52
  end
53
53
 
54
54
  def cased?(char)