twitter_cldr 1.6.0 → 1.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +1 -0
- data/History.txt +5 -0
- data/js/lib/twitter_cldr_js.rb +2 -0
- data/lib/twitter_cldr/collation/collator.rb +8 -3
- data/lib/twitter_cldr/collation/sort_key_builder.rb +118 -34
- data/lib/twitter_cldr/collation/trie_builder.rb +5 -1
- data/lib/twitter_cldr/resources/import/tailoring.rb +14 -5
- data/lib/twitter_cldr/version.rb +1 -1
- data/resources/collation/tailoring/af.yml +1 -0
- data/resources/collation/tailoring/ar.yml +1 -0
- data/resources/collation/tailoring/ca.yml +1 -0
- data/resources/collation/tailoring/cs.yml +1 -0
- data/resources/collation/tailoring/da.yml +2 -0
- data/resources/collation/tailoring/de.yml +1 -0
- data/resources/collation/tailoring/el.yml +1 -0
- data/resources/collation/tailoring/en.yml +1 -0
- data/resources/collation/tailoring/es.yml +1 -0
- data/resources/collation/tailoring/eu.yml +1 -0
- data/resources/collation/tailoring/fa.yml +1 -0
- data/resources/collation/tailoring/fi.yml +1 -0
- data/resources/collation/tailoring/fil.yml +1 -0
- data/resources/collation/tailoring/fr.yml +1 -0
- data/resources/collation/tailoring/he.yml +1 -0
- data/resources/collation/tailoring/hi.yml +1 -0
- data/resources/collation/tailoring/hu.yml +1 -0
- data/resources/collation/tailoring/id.yml +1 -0
- data/resources/collation/tailoring/it.yml +1 -0
- data/resources/collation/tailoring/ja.yml +1 -0
- data/resources/collation/tailoring/ko.yml +1 -0
- data/resources/collation/tailoring/ms.yml +1 -0
- data/resources/collation/tailoring/nb.yml +1 -0
- data/resources/collation/tailoring/nl.yml +1 -0
- data/resources/collation/tailoring/pl.yml +1 -0
- data/resources/collation/tailoring/pt.yml +1 -0
- data/resources/collation/tailoring/ru.yml +1 -0
- data/resources/collation/tailoring/sv.yml +1 -0
- data/resources/collation/tailoring/th.yml +1 -0
- data/resources/collation/tailoring/tr.yml +1 -0
- data/resources/collation/tailoring/uk.yml +1 -0
- data/resources/collation/tailoring/ur.yml +1 -0
- data/resources/collation/tailoring/zh-Hant.yml +1 -0
- data/resources/collation/tailoring/zh.yml +1 -0
- data/spec/collation/collator_spec.rb +118 -16
- data/spec/collation/sort_key_builder_spec.rb +79 -25
- data/spec/collation/tailoring_spec.rb +0 -76
- data/spec/collation/tailoring_tests/da.txt +181 -181
- data/spec/collation/trie_builder_spec.rb +26 -12
- metadata +3 -3
data/Gemfile
CHANGED
data/History.txt
CHANGED
data/js/lib/twitter_cldr_js.rb
CHANGED
@@ -10,6 +10,7 @@ require 'uglifier'
|
|
10
10
|
require 'jasmine-headless-webkit'
|
11
11
|
require 'coffee-script'
|
12
12
|
require 'json'
|
13
|
+
require 'ruby_parser'
|
13
14
|
|
14
15
|
require 'compiler'
|
15
16
|
require 'renderers/bundle'
|
@@ -58,6 +59,7 @@ module TwitterCldr
|
|
58
59
|
|
59
60
|
def self.make(options = {})
|
60
61
|
# clean dir, then build js
|
62
|
+
FileUtils.mkdir_p(build_dir)
|
61
63
|
FileUtils.rm_rf(Dir.glob(File.join(build_dir, "**")))
|
62
64
|
build(options)
|
63
65
|
build(options.merge({ :minify => true }))
|
@@ -16,8 +16,9 @@ module TwitterCldr
|
|
16
16
|
attr_accessor :locale
|
17
17
|
|
18
18
|
def initialize(locale = nil)
|
19
|
-
@locale
|
20
|
-
@
|
19
|
+
@locale = TwitterCldr.convert_locale(locale) if locale
|
20
|
+
@options = tailoring_options
|
21
|
+
@trie = load_trie
|
21
22
|
end
|
22
23
|
|
23
24
|
def sort(strings)
|
@@ -34,7 +35,7 @@ module TwitterCldr
|
|
34
35
|
end
|
35
36
|
|
36
37
|
def get_sort_key(string_or_code_points)
|
37
|
-
TwitterCldr::Collation::SortKeyBuilder.build(get_collation_elements(string_or_code_points))
|
38
|
+
TwitterCldr::Collation::SortKeyBuilder.build(get_collation_elements(string_or_code_points), @options[:case_first])
|
38
39
|
end
|
39
40
|
|
40
41
|
def get_collation_elements(string_or_code_points)
|
@@ -47,6 +48,10 @@ module TwitterCldr
|
|
47
48
|
|
48
49
|
private
|
49
50
|
|
51
|
+
def tailoring_options
|
52
|
+
@locale ? TwitterCldr::Collation::TrieBuilder.tailoring_data(@locale)[:collator_options] : {}
|
53
|
+
end
|
54
|
+
|
50
55
|
def load_trie
|
51
56
|
@locale ? self.class.tailored_fce_trie(@locale) : self.class.default_fce_trie
|
52
57
|
end
|
@@ -8,7 +8,8 @@ module TwitterCldr
|
|
8
8
|
|
9
9
|
# SortKeyBuilder builds a collation sort key from an array of collation elements.
|
10
10
|
#
|
11
|
-
# Weights compression algorithms for every level are described in
|
11
|
+
# Weights compression algorithms for every level are described in
|
12
|
+
# http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm
|
12
13
|
#
|
13
14
|
class SortKeyBuilder
|
14
15
|
|
@@ -16,35 +17,36 @@ module TwitterCldr
|
|
16
17
|
|
17
18
|
LEVEL_SEPARATOR = 1 # separate levels in a sort key '01' bytes
|
18
19
|
|
19
|
-
|
20
|
+
VALID_CASE_FIRST_OPTIONS = [nil, :lower, :upper]
|
20
21
|
|
21
|
-
|
22
|
-
PRIMARY_BYTE_MAX = 0xFF
|
23
|
-
|
24
|
-
MIN_NON_LATIN_PRIMARY = 0x5B
|
25
|
-
MAX_REGULAR_PRIMARY = 0x7A
|
26
|
-
|
27
|
-
attr_reader :collation_elements
|
22
|
+
attr_reader :collation_elements, :case_first
|
28
23
|
|
29
24
|
# Returns a sort key as an array of bytes.
|
30
25
|
#
|
31
26
|
# Arguments:
|
32
27
|
#
|
33
28
|
# collation_elements - an array of collation elements, represented as arrays of integer weights.
|
29
|
+
# case_first - case-first sorting order setting.
|
34
30
|
#
|
35
31
|
# An instance of the class is created only to prevent passing of @collation_elements and @bytes_array from one
|
36
32
|
# method into another while forming the sort key.
|
37
33
|
#
|
38
|
-
def self.build(collation_elements)
|
39
|
-
new(collation_elements).bytes_array
|
34
|
+
def self.build(collation_elements, case_first = nil)
|
35
|
+
new(collation_elements, case_first).bytes_array
|
40
36
|
end
|
41
37
|
|
42
38
|
# Arguments:
|
43
39
|
#
|
44
40
|
# collation_elements - an array of collation elements, represented as arrays of integer weights.
|
41
|
+
# case_first - optional case-first sorting order setting: :upper, :lower, nil (discard case bits).
|
45
42
|
#
|
46
|
-
def initialize(collation_elements)
|
43
|
+
def initialize(collation_elements, case_first = nil)
|
44
|
+
raise ArgumentError, "invalid case-first options '#{case_first.inspect}'" unless VALID_CASE_FIRST_OPTIONS.include?(case_first)
|
45
|
+
|
47
46
|
@collation_elements = collation_elements
|
47
|
+
@case_first = case_first
|
48
|
+
|
49
|
+
init_tertiary_constants
|
48
50
|
end
|
49
51
|
|
50
52
|
def bytes_array
|
@@ -115,7 +117,14 @@ module TwitterCldr
|
|
115
117
|
end
|
116
118
|
|
117
119
|
# append compressed trailing common bytes
|
118
|
-
|
120
|
+
if @common_count > 0
|
121
|
+
if @tertiary_common == TERTIARY_BOTTOM_NORMAL
|
122
|
+
append_common_bytes(@tertiary_bottom, @tertiary_bottom_count, false)
|
123
|
+
else
|
124
|
+
append_common_bytes(@tertiary_top, @tertiary_top_count, true)
|
125
|
+
@bytes_array[-1] -= 1 # make @bytes_array[-1] = boundary - @common_count (for compatibility with ICU)
|
126
|
+
end
|
127
|
+
end
|
119
128
|
end
|
120
129
|
|
121
130
|
def append_secondary_byte(secondary)
|
@@ -127,11 +136,16 @@ module TwitterCldr
|
|
127
136
|
end
|
128
137
|
|
129
138
|
def append_tertiary_byte(tertiary)
|
130
|
-
if tertiary ==
|
139
|
+
if tertiary == @tertiary_common
|
131
140
|
@common_count += 1
|
132
141
|
else
|
133
|
-
|
134
|
-
|
142
|
+
if @tertiary_common == TERTIARY_COMMON_NORMAL && @tertiary_common < tertiary
|
143
|
+
tertiary += @tertiary_addition
|
144
|
+
elsif @tertiary_common == TERTIARY_COMMON_UPPER_FIRST && tertiary <= @tertiary_common
|
145
|
+
tertiary -= @tertiary_addition
|
146
|
+
end
|
147
|
+
|
148
|
+
append_with_common_bytes(tertiary, @tertiary_common_space)
|
135
149
|
end
|
136
150
|
end
|
137
151
|
|
@@ -160,7 +174,13 @@ module TwitterCldr
|
|
160
174
|
end
|
161
175
|
|
162
176
|
def tertiary_weight(collation_element)
|
163
|
-
level_weight(collation_element, TERTIARY_LEVEL)
|
177
|
+
weight = level_weight(collation_element, TERTIARY_LEVEL)
|
178
|
+
|
179
|
+
if continuation?(weight)
|
180
|
+
remove_continuation_bits(weight)
|
181
|
+
else
|
182
|
+
(weight & @tertiary_mask) ^ @case_switch
|
183
|
+
end
|
164
184
|
end
|
165
185
|
|
166
186
|
def level_weight(collation_element, level)
|
@@ -178,6 +198,60 @@ module TwitterCldr
|
|
178
198
|
bytes
|
179
199
|
end
|
180
200
|
|
201
|
+
def continuation?(weight)
|
202
|
+
weight & CASE_BITS_MASK == CASE_BITS_MASK
|
203
|
+
end
|
204
|
+
|
205
|
+
def remove_continuation_bits(weight)
|
206
|
+
weight & REMOVE_CASE_MASK
|
207
|
+
end
|
208
|
+
|
209
|
+
def init_tertiary_constants
|
210
|
+
@case_switch = @case_first == :upper ? CASE_SWITCH : NO_CASE_SWITCH
|
211
|
+
|
212
|
+
if @case_first
|
213
|
+
@tertiary_mask = KEEP_CASE_MASK
|
214
|
+
@tertiary_addition = TERTIARY_ADDITION_CASE_FIRST
|
215
|
+
|
216
|
+
if @case_first == :upper
|
217
|
+
@tertiary_common = TERTIARY_COMMON_UPPER_FIRST
|
218
|
+
@tertiary_top = TERTIARY_TOP_UPPER_FIRST
|
219
|
+
@tertiary_bottom = TERTIARY_BOTTOM_UPPER_FIRST
|
220
|
+
else # @case_first == :lower
|
221
|
+
@tertiary_common = TERTIARY_COMMON_NORMAL
|
222
|
+
@tertiary_top = TERTIARY_TOP_LOWER_FIRST
|
223
|
+
@tertiary_bottom = TERTIARY_BOTTOM_LOWER_FIRST
|
224
|
+
end
|
225
|
+
else
|
226
|
+
@tertiary_mask = REMOVE_CASE_MASK
|
227
|
+
@tertiary_addition = TERTIARY_ADDITION_NORMAL
|
228
|
+
|
229
|
+
@tertiary_common = TERTIARY_COMMON_NORMAL
|
230
|
+
@tertiary_top = TERTIARY_TOP_NORMAL
|
231
|
+
@tertiary_bottom = TERTIARY_BOTTOM_NORMAL
|
232
|
+
end
|
233
|
+
|
234
|
+
total_tertiary_count = @tertiary_top - @tertiary_bottom - 1
|
235
|
+
@tertiary_top_count = (TERTIARY_PROPORTION * total_tertiary_count).to_i
|
236
|
+
@tertiary_bottom_count = total_tertiary_count - @tertiary_top_count
|
237
|
+
|
238
|
+
@tertiary_common_space = {
|
239
|
+
:common => @tertiary_common,
|
240
|
+
:bottom => @tertiary_bottom,
|
241
|
+
:bottom_count => @tertiary_bottom_count,
|
242
|
+
:top => @tertiary_top,
|
243
|
+
:top_count => @tertiary_top_count
|
244
|
+
}
|
245
|
+
end
|
246
|
+
|
247
|
+
# Primary level compression constants
|
248
|
+
|
249
|
+
PRIMARY_BYTE_MIN = 0x3
|
250
|
+
PRIMARY_BYTE_MAX = 0xFF
|
251
|
+
|
252
|
+
MIN_NON_LATIN_PRIMARY = 0x5B
|
253
|
+
MAX_REGULAR_PRIMARY = 0x7A
|
254
|
+
|
181
255
|
# Secondary level compression constants
|
182
256
|
|
183
257
|
SECONDARY_BOTTOM = 0x05
|
@@ -198,23 +272,33 @@ module TwitterCldr
|
|
198
272
|
|
199
273
|
# Tertiary level compression constants
|
200
274
|
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
275
|
+
REMOVE_CASE_MASK = 0x3F
|
276
|
+
KEEP_CASE_MASK = 0xFF
|
277
|
+
|
278
|
+
CASE_BITS_MASK = 0xC0
|
279
|
+
|
280
|
+
CASE_SWITCH = 0xC0
|
281
|
+
NO_CASE_SWITCH = 0
|
282
|
+
|
283
|
+
TERTIARY_ADDITION_NORMAL = 0x80
|
284
|
+
TERTIARY_ADDITION_CASE_FIRST = 0x40
|
285
|
+
|
286
|
+
TERTIARY_PROPORTION = 0.667
|
287
|
+
|
288
|
+
# Normal (case-first disabled)
|
289
|
+
TERTIARY_BOTTOM_NORMAL = 0x05
|
290
|
+
TERTIARY_TOP_NORMAL = 0x85
|
291
|
+
TERTIARY_COMMON_NORMAL = TERTIARY_BOTTOM_NORMAL
|
292
|
+
|
293
|
+
# Lower first
|
294
|
+
TERTIARY_BOTTOM_LOWER_FIRST = TERTIARY_BOTTOM_NORMAL
|
295
|
+
TERTIARY_TOP_LOWER_FIRST = 0x45
|
296
|
+
TERTIARY_COMMON_LOWER_FIRST = TERTIARY_BOTTOM_LOWER_FIRST
|
297
|
+
|
298
|
+
# Upper first
|
299
|
+
TERTIARY_BOTTOM_UPPER_FIRST = 0x86
|
300
|
+
TERTIARY_TOP_UPPER_FIRST = 0xC5
|
301
|
+
TERTIARY_COMMON_UPPER_FIRST = TERTIARY_TOP_UPPER_FIRST
|
218
302
|
|
219
303
|
end
|
220
304
|
|
@@ -20,7 +20,7 @@ module TwitterCldr
|
|
20
20
|
end
|
21
21
|
|
22
22
|
def load_tailored_trie(locale, fallback)
|
23
|
-
build_tailored_trie(
|
23
|
+
build_tailored_trie(tailoring_data(locale), fallback)
|
24
24
|
end
|
25
25
|
|
26
26
|
def parse_trie(table, trie = TwitterCldr::Collation::Trie.new)
|
@@ -31,6 +31,10 @@ module TwitterCldr
|
|
31
31
|
trie
|
32
32
|
end
|
33
33
|
|
34
|
+
def tailoring_data(locale)
|
35
|
+
TwitterCldr.get_resource(:collation, :tailoring, locale)
|
36
|
+
end
|
37
|
+
|
34
38
|
private
|
35
39
|
|
36
40
|
def load_resource(resource)
|
@@ -33,7 +33,7 @@ module TwitterCldr
|
|
33
33
|
:pt => :root
|
34
34
|
}
|
35
35
|
|
36
|
-
EMPTY_TAILORING_DATA = { 'tailored_table' => '', 'suppressed_contractions' => '' }
|
36
|
+
EMPTY_TAILORING_DATA = { 'collator_options' => {}, 'tailored_table' => '', 'suppressed_contractions' => '' }
|
37
37
|
|
38
38
|
class ImportError < RuntimeError; end
|
39
39
|
|
@@ -98,6 +98,7 @@ module TwitterCldr
|
|
98
98
|
standard_tailoring = collations.at_xpath('collation[@type="standard"]')
|
99
99
|
|
100
100
|
{
|
101
|
+
'collator_options' => parse_collator_options(standard_tailoring),
|
101
102
|
'tailored_table' => parse_tailorings(standard_tailoring, locale),
|
102
103
|
'suppressed_contractions' => parse_suppressed_contractions(standard_tailoring)
|
103
104
|
}
|
@@ -147,11 +148,19 @@ module TwitterCldr
|
|
147
148
|
end
|
148
149
|
|
149
150
|
def parse_suppressed_contractions(data)
|
150
|
-
|
151
|
+
node = data && data.at_xpath('suppress_contractions')
|
152
|
+
node ? Java::ComIbmIcuText::UnicodeSet.to_array(Java::ComIbmIcuText::UnicodeSet.new(node.text)).to_a.join : ''
|
153
|
+
end
|
154
|
+
|
155
|
+
def parse_collator_options(data)
|
156
|
+
options = {}
|
157
|
+
|
158
|
+
if data
|
159
|
+
case_first_setting = data.at_xpath('settings[@caseFirst]')
|
160
|
+
options['case_first'] = case_first_setting.attr('caseFirst').to_sym if case_first_setting
|
161
|
+
end
|
151
162
|
|
152
|
-
|
153
|
-
Java::ComIbmIcuText::UnicodeSet.to_array(Java::ComIbmIcuText::UnicodeSet.new(contractions.text)).to_a
|
154
|
-
end.flatten.join
|
163
|
+
options
|
155
164
|
end
|
156
165
|
|
157
166
|
def validate_tailoring_rule(rule)
|
data/lib/twitter_cldr/version.rb
CHANGED