twitter_cldr 1.6.0 → 1.6.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +1 -0
- data/History.txt +5 -0
- data/js/lib/twitter_cldr_js.rb +2 -0
- data/lib/twitter_cldr/collation/collator.rb +8 -3
- data/lib/twitter_cldr/collation/sort_key_builder.rb +118 -34
- data/lib/twitter_cldr/collation/trie_builder.rb +5 -1
- data/lib/twitter_cldr/resources/import/tailoring.rb +14 -5
- data/lib/twitter_cldr/version.rb +1 -1
- data/resources/collation/tailoring/af.yml +1 -0
- data/resources/collation/tailoring/ar.yml +1 -0
- data/resources/collation/tailoring/ca.yml +1 -0
- data/resources/collation/tailoring/cs.yml +1 -0
- data/resources/collation/tailoring/da.yml +2 -0
- data/resources/collation/tailoring/de.yml +1 -0
- data/resources/collation/tailoring/el.yml +1 -0
- data/resources/collation/tailoring/en.yml +1 -0
- data/resources/collation/tailoring/es.yml +1 -0
- data/resources/collation/tailoring/eu.yml +1 -0
- data/resources/collation/tailoring/fa.yml +1 -0
- data/resources/collation/tailoring/fi.yml +1 -0
- data/resources/collation/tailoring/fil.yml +1 -0
- data/resources/collation/tailoring/fr.yml +1 -0
- data/resources/collation/tailoring/he.yml +1 -0
- data/resources/collation/tailoring/hi.yml +1 -0
- data/resources/collation/tailoring/hu.yml +1 -0
- data/resources/collation/tailoring/id.yml +1 -0
- data/resources/collation/tailoring/it.yml +1 -0
- data/resources/collation/tailoring/ja.yml +1 -0
- data/resources/collation/tailoring/ko.yml +1 -0
- data/resources/collation/tailoring/ms.yml +1 -0
- data/resources/collation/tailoring/nb.yml +1 -0
- data/resources/collation/tailoring/nl.yml +1 -0
- data/resources/collation/tailoring/pl.yml +1 -0
- data/resources/collation/tailoring/pt.yml +1 -0
- data/resources/collation/tailoring/ru.yml +1 -0
- data/resources/collation/tailoring/sv.yml +1 -0
- data/resources/collation/tailoring/th.yml +1 -0
- data/resources/collation/tailoring/tr.yml +1 -0
- data/resources/collation/tailoring/uk.yml +1 -0
- data/resources/collation/tailoring/ur.yml +1 -0
- data/resources/collation/tailoring/zh-Hant.yml +1 -0
- data/resources/collation/tailoring/zh.yml +1 -0
- data/spec/collation/collator_spec.rb +118 -16
- data/spec/collation/sort_key_builder_spec.rb +79 -25
- data/spec/collation/tailoring_spec.rb +0 -76
- data/spec/collation/tailoring_tests/da.txt +181 -181
- data/spec/collation/trie_builder_spec.rb +26 -12
- metadata +3 -3
data/Gemfile
CHANGED
data/History.txt
CHANGED
data/js/lib/twitter_cldr_js.rb
CHANGED
@@ -10,6 +10,7 @@ require 'uglifier'
|
|
10
10
|
require 'jasmine-headless-webkit'
|
11
11
|
require 'coffee-script'
|
12
12
|
require 'json'
|
13
|
+
require 'ruby_parser'
|
13
14
|
|
14
15
|
require 'compiler'
|
15
16
|
require 'renderers/bundle'
|
@@ -58,6 +59,7 @@ module TwitterCldr
|
|
58
59
|
|
59
60
|
def self.make(options = {})
|
60
61
|
# clean dir, then build js
|
62
|
+
FileUtils.mkdir_p(build_dir)
|
61
63
|
FileUtils.rm_rf(Dir.glob(File.join(build_dir, "**")))
|
62
64
|
build(options)
|
63
65
|
build(options.merge({ :minify => true }))
|
data/lib/twitter_cldr/collation/collator.rb
CHANGED
@@ -16,8 +16,9 @@ module TwitterCldr
|
|
16
16
|
attr_accessor :locale
|
17
17
|
|
18
18
|
def initialize(locale = nil)
|
19
|
-
@locale
|
20
|
-
@
|
19
|
+
@locale = TwitterCldr.convert_locale(locale) if locale
|
20
|
+
@options = tailoring_options
|
21
|
+
@trie = load_trie
|
21
22
|
end
|
22
23
|
|
23
24
|
def sort(strings)
|
@@ -34,7 +35,7 @@ module TwitterCldr
|
|
34
35
|
end
|
35
36
|
|
36
37
|
def get_sort_key(string_or_code_points)
|
37
|
-
TwitterCldr::Collation::SortKeyBuilder.build(get_collation_elements(string_or_code_points))
|
38
|
+
TwitterCldr::Collation::SortKeyBuilder.build(get_collation_elements(string_or_code_points), @options[:case_first])
|
38
39
|
end
|
39
40
|
|
40
41
|
def get_collation_elements(string_or_code_points)
|
@@ -47,6 +48,10 @@ module TwitterCldr
|
|
47
48
|
|
48
49
|
private
|
49
50
|
|
51
|
+
def tailoring_options
|
52
|
+
@locale ? TwitterCldr::Collation::TrieBuilder.tailoring_data(@locale)[:collator_options] : {}
|
53
|
+
end
|
54
|
+
|
50
55
|
def load_trie
|
51
56
|
@locale ? self.class.tailored_fce_trie(@locale) : self.class.default_fce_trie
|
52
57
|
end
|
data/lib/twitter_cldr/collation/sort_key_builder.rb
CHANGED
@@ -8,7 +8,8 @@ module TwitterCldr
|
|
8
8
|
|
9
9
|
# SortKeyBuilder builds a collation sort key from an array of collation elements.
|
10
10
|
#
|
11
|
-
# Weights compression algorithms for every level are described in
|
11
|
+
# Weights compression algorithms for every level are described in
|
12
|
+
# http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm
|
12
13
|
#
|
13
14
|
class SortKeyBuilder
|
14
15
|
|
@@ -16,35 +17,36 @@ module TwitterCldr
|
|
16
17
|
|
17
18
|
LEVEL_SEPARATOR = 1 # separate levels in a sort key '01' bytes
|
18
19
|
|
19
|
-
|
20
|
+
VALID_CASE_FIRST_OPTIONS = [nil, :lower, :upper]
|
20
21
|
|
21
|
-
|
22
|
-
PRIMARY_BYTE_MAX = 0xFF
|
23
|
-
|
24
|
-
MIN_NON_LATIN_PRIMARY = 0x5B
|
25
|
-
MAX_REGULAR_PRIMARY = 0x7A
|
26
|
-
|
27
|
-
attr_reader :collation_elements
|
22
|
+
attr_reader :collation_elements, :case_first
|
28
23
|
|
29
24
|
# Returns a sort key as an array of bytes.
|
30
25
|
#
|
31
26
|
# Arguments:
|
32
27
|
#
|
33
28
|
# collation_elements - an array of collation elements, represented as arrays of integer weights.
|
29
|
+
# case_first - case-first sorting order setting.
|
34
30
|
#
|
35
31
|
# An instance of the class is created only to prevent passing of @collation_elements and @bytes_array from one
|
36
32
|
# method into another while forming the sort key.
|
37
33
|
#
|
38
|
-
def self.build(collation_elements)
|
39
|
-
new(collation_elements).bytes_array
|
34
|
+
def self.build(collation_elements, case_first = nil)
|
35
|
+
new(collation_elements, case_first).bytes_array
|
40
36
|
end
|
41
37
|
|
42
38
|
# Arguments:
|
43
39
|
#
|
44
40
|
# collation_elements - an array of collation elements, represented as arrays of integer weights.
|
41
|
+
# case_first - optional case-first sorting order setting: :upper, :lower, nil (discard case bits).
|
45
42
|
#
|
46
|
-
def initialize(collation_elements)
|
43
|
+
def initialize(collation_elements, case_first = nil)
|
44
|
+
raise ArgumentError, "invalid case-first options '#{case_first.inspect}'" unless VALID_CASE_FIRST_OPTIONS.include?(case_first)
|
45
|
+
|
47
46
|
@collation_elements = collation_elements
|
47
|
+
@case_first = case_first
|
48
|
+
|
49
|
+
init_tertiary_constants
|
48
50
|
end
|
49
51
|
|
50
52
|
def bytes_array
|
@@ -115,7 +117,14 @@ module TwitterCldr
|
|
115
117
|
end
|
116
118
|
|
117
119
|
# append compressed trailing common bytes
|
118
|
-
|
120
|
+
if @common_count > 0
|
121
|
+
if @tertiary_common == TERTIARY_BOTTOM_NORMAL
|
122
|
+
append_common_bytes(@tertiary_bottom, @tertiary_bottom_count, false)
|
123
|
+
else
|
124
|
+
append_common_bytes(@tertiary_top, @tertiary_top_count, true)
|
125
|
+
@bytes_array[-1] -= 1 # make @bytes_array[-1] = boundary - @common_count (for compatibility with ICU)
|
126
|
+
end
|
127
|
+
end
|
119
128
|
end
|
120
129
|
|
121
130
|
def append_secondary_byte(secondary)
|
@@ -127,11 +136,16 @@ module TwitterCldr
|
|
127
136
|
end
|
128
137
|
|
129
138
|
def append_tertiary_byte(tertiary)
|
130
|
-
if tertiary ==
|
139
|
+
if tertiary == @tertiary_common
|
131
140
|
@common_count += 1
|
132
141
|
else
|
133
|
-
|
134
|
-
|
142
|
+
if @tertiary_common == TERTIARY_COMMON_NORMAL && @tertiary_common < tertiary
|
143
|
+
tertiary += @tertiary_addition
|
144
|
+
elsif @tertiary_common == TERTIARY_COMMON_UPPER_FIRST && tertiary <= @tertiary_common
|
145
|
+
tertiary -= @tertiary_addition
|
146
|
+
end
|
147
|
+
|
148
|
+
append_with_common_bytes(tertiary, @tertiary_common_space)
|
135
149
|
end
|
136
150
|
end
|
137
151
|
|
@@ -160,7 +174,13 @@ module TwitterCldr
|
|
160
174
|
end
|
161
175
|
|
162
176
|
def tertiary_weight(collation_element)
|
163
|
-
level_weight(collation_element, TERTIARY_LEVEL)
|
177
|
+
weight = level_weight(collation_element, TERTIARY_LEVEL)
|
178
|
+
|
179
|
+
if continuation?(weight)
|
180
|
+
remove_continuation_bits(weight)
|
181
|
+
else
|
182
|
+
(weight & @tertiary_mask) ^ @case_switch
|
183
|
+
end
|
164
184
|
end
|
165
185
|
|
166
186
|
def level_weight(collation_element, level)
|
@@ -178,6 +198,60 @@ module TwitterCldr
|
|
178
198
|
bytes
|
179
199
|
end
|
180
200
|
|
201
|
+
def continuation?(weight)
|
202
|
+
weight & CASE_BITS_MASK == CASE_BITS_MASK
|
203
|
+
end
|
204
|
+
|
205
|
+
def remove_continuation_bits(weight)
|
206
|
+
weight & REMOVE_CASE_MASK
|
207
|
+
end
|
208
|
+
|
209
|
+
def init_tertiary_constants
|
210
|
+
@case_switch = @case_first == :upper ? CASE_SWITCH : NO_CASE_SWITCH
|
211
|
+
|
212
|
+
if @case_first
|
213
|
+
@tertiary_mask = KEEP_CASE_MASK
|
214
|
+
@tertiary_addition = TERTIARY_ADDITION_CASE_FIRST
|
215
|
+
|
216
|
+
if @case_first == :upper
|
217
|
+
@tertiary_common = TERTIARY_COMMON_UPPER_FIRST
|
218
|
+
@tertiary_top = TERTIARY_TOP_UPPER_FIRST
|
219
|
+
@tertiary_bottom = TERTIARY_BOTTOM_UPPER_FIRST
|
220
|
+
else # @case_first == :lower
|
221
|
+
@tertiary_common = TERTIARY_COMMON_NORMAL
|
222
|
+
@tertiary_top = TERTIARY_TOP_LOWER_FIRST
|
223
|
+
@tertiary_bottom = TERTIARY_BOTTOM_LOWER_FIRST
|
224
|
+
end
|
225
|
+
else
|
226
|
+
@tertiary_mask = REMOVE_CASE_MASK
|
227
|
+
@tertiary_addition = TERTIARY_ADDITION_NORMAL
|
228
|
+
|
229
|
+
@tertiary_common = TERTIARY_COMMON_NORMAL
|
230
|
+
@tertiary_top = TERTIARY_TOP_NORMAL
|
231
|
+
@tertiary_bottom = TERTIARY_BOTTOM_NORMAL
|
232
|
+
end
|
233
|
+
|
234
|
+
total_tertiary_count = @tertiary_top - @tertiary_bottom - 1
|
235
|
+
@tertiary_top_count = (TERTIARY_PROPORTION * total_tertiary_count).to_i
|
236
|
+
@tertiary_bottom_count = total_tertiary_count - @tertiary_top_count
|
237
|
+
|
238
|
+
@tertiary_common_space = {
|
239
|
+
:common => @tertiary_common,
|
240
|
+
:bottom => @tertiary_bottom,
|
241
|
+
:bottom_count => @tertiary_bottom_count,
|
242
|
+
:top => @tertiary_top,
|
243
|
+
:top_count => @tertiary_top_count
|
244
|
+
}
|
245
|
+
end
|
246
|
+
|
247
|
+
# Primary level compression constants
|
248
|
+
|
249
|
+
PRIMARY_BYTE_MIN = 0x3
|
250
|
+
PRIMARY_BYTE_MAX = 0xFF
|
251
|
+
|
252
|
+
MIN_NON_LATIN_PRIMARY = 0x5B
|
253
|
+
MAX_REGULAR_PRIMARY = 0x7A
|
254
|
+
|
181
255
|
# Secondary level compression constants
|
182
256
|
|
183
257
|
SECONDARY_BOTTOM = 0x05
|
@@ -198,23 +272,33 @@ module TwitterCldr
|
|
198
272
|
|
199
273
|
# Tertiary level compression constants
|
200
274
|
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
275
|
+
REMOVE_CASE_MASK = 0x3F
|
276
|
+
KEEP_CASE_MASK = 0xFF
|
277
|
+
|
278
|
+
CASE_BITS_MASK = 0xC0
|
279
|
+
|
280
|
+
CASE_SWITCH = 0xC0
|
281
|
+
NO_CASE_SWITCH = 0
|
282
|
+
|
283
|
+
TERTIARY_ADDITION_NORMAL = 0x80
|
284
|
+
TERTIARY_ADDITION_CASE_FIRST = 0x40
|
285
|
+
|
286
|
+
TERTIARY_PROPORTION = 0.667
|
287
|
+
|
288
|
+
# Normal (case-first disabled)
|
289
|
+
TERTIARY_BOTTOM_NORMAL = 0x05
|
290
|
+
TERTIARY_TOP_NORMAL = 0x85
|
291
|
+
TERTIARY_COMMON_NORMAL = TERTIARY_BOTTOM_NORMAL
|
292
|
+
|
293
|
+
# Lower first
|
294
|
+
TERTIARY_BOTTOM_LOWER_FIRST = TERTIARY_BOTTOM_NORMAL
|
295
|
+
TERTIARY_TOP_LOWER_FIRST = 0x45
|
296
|
+
TERTIARY_COMMON_LOWER_FIRST = TERTIARY_BOTTOM_LOWER_FIRST
|
297
|
+
|
298
|
+
# Upper first
|
299
|
+
TERTIARY_BOTTOM_UPPER_FIRST = 0x86
|
300
|
+
TERTIARY_TOP_UPPER_FIRST = 0xC5
|
301
|
+
TERTIARY_COMMON_UPPER_FIRST = TERTIARY_TOP_UPPER_FIRST
|
218
302
|
|
219
303
|
end
|
220
304
|
|
data/lib/twitter_cldr/collation/trie_builder.rb
CHANGED
@@ -20,7 +20,7 @@ module TwitterCldr
|
|
20
20
|
end
|
21
21
|
|
22
22
|
def load_tailored_trie(locale, fallback)
|
23
|
-
build_tailored_trie(
|
23
|
+
build_tailored_trie(tailoring_data(locale), fallback)
|
24
24
|
end
|
25
25
|
|
26
26
|
def parse_trie(table, trie = TwitterCldr::Collation::Trie.new)
|
@@ -31,6 +31,10 @@ module TwitterCldr
|
|
31
31
|
trie
|
32
32
|
end
|
33
33
|
|
34
|
+
def tailoring_data(locale)
|
35
|
+
TwitterCldr.get_resource(:collation, :tailoring, locale)
|
36
|
+
end
|
37
|
+
|
34
38
|
private
|
35
39
|
|
36
40
|
def load_resource(resource)
|
data/lib/twitter_cldr/resources/import/tailoring.rb
CHANGED
@@ -33,7 +33,7 @@ module TwitterCldr
|
|
33
33
|
:pt => :root
|
34
34
|
}
|
35
35
|
|
36
|
-
EMPTY_TAILORING_DATA = { 'tailored_table' => '', 'suppressed_contractions' => '' }
|
36
|
+
EMPTY_TAILORING_DATA = { 'collator_options' => {}, 'tailored_table' => '', 'suppressed_contractions' => '' }
|
37
37
|
|
38
38
|
class ImportError < RuntimeError; end
|
39
39
|
|
@@ -98,6 +98,7 @@ module TwitterCldr
|
|
98
98
|
standard_tailoring = collations.at_xpath('collation[@type="standard"]')
|
99
99
|
|
100
100
|
{
|
101
|
+
'collator_options' => parse_collator_options(standard_tailoring),
|
101
102
|
'tailored_table' => parse_tailorings(standard_tailoring, locale),
|
102
103
|
'suppressed_contractions' => parse_suppressed_contractions(standard_tailoring)
|
103
104
|
}
|
@@ -147,11 +148,19 @@ module TwitterCldr
|
|
147
148
|
end
|
148
149
|
|
149
150
|
def parse_suppressed_contractions(data)
|
150
|
-
|
151
|
+
node = data && data.at_xpath('suppress_contractions')
|
152
|
+
node ? Java::ComIbmIcuText::UnicodeSet.to_array(Java::ComIbmIcuText::UnicodeSet.new(node.text)).to_a.join : ''
|
153
|
+
end
|
154
|
+
|
155
|
+
def parse_collator_options(data)
|
156
|
+
options = {}
|
157
|
+
|
158
|
+
if data
|
159
|
+
case_first_setting = data.at_xpath('settings[@caseFirst]')
|
160
|
+
options['case_first'] = case_first_setting.attr('caseFirst').to_sym if case_first_setting
|
161
|
+
end
|
151
162
|
|
152
|
-
|
153
|
-
Java::ComIbmIcuText::UnicodeSet.to_array(Java::ComIbmIcuText::UnicodeSet.new(contractions.text)).to_a
|
154
|
-
end.flatten.join
|
163
|
+
options
|
155
164
|
end
|
156
165
|
|
157
166
|
def validate_tailoring_rule(rule)
|
data/lib/twitter_cldr/version.rb
CHANGED