twitter_cldr 1.4.1 → 1.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/NOTICE +36 -2
- data/README.md +2 -2
- data/lib/twitter_cldr/collation/collator.rb +143 -0
- data/lib/twitter_cldr/collation/implicit_collation_elements.rb +188 -0
- data/lib/twitter_cldr/collation/sort_key.rb +199 -0
- data/lib/twitter_cldr/collation/trie.rb +73 -0
- data/lib/twitter_cldr/collation/trie_builder.rb +56 -0
- data/lib/twitter_cldr/collation.rb +14 -0
- data/lib/twitter_cldr/core_ext/localized_object.rb +3 -2
- data/lib/twitter_cldr/core_ext/string.rb +1 -1
- data/lib/twitter_cldr/formatters/calendars/datetime_formatter.rb +89 -72
- data/lib/twitter_cldr/normalization/base.rb +22 -0
- data/lib/twitter_cldr/normalization/hangul.rb +68 -0
- data/lib/twitter_cldr/{normalizers → normalization}/nfc.rb +2 -2
- data/lib/twitter_cldr/{normalizers → normalization}/nfd.rb +1 -1
- data/lib/twitter_cldr/{normalizers → normalization}/nfkc.rb +5 -17
- data/lib/twitter_cldr/{normalizers → normalization}/nfkd.rb +3 -18
- data/lib/twitter_cldr/normalization.rb +15 -0
- data/lib/twitter_cldr/shared/code_point.rb +5 -3
- data/lib/twitter_cldr/tokenizers/base.rb +15 -1
- data/lib/twitter_cldr/tokenizers/calendars/datetime_tokenizer.rb +6 -1
- data/lib/twitter_cldr/utils/code_points.rb +1 -1
- data/lib/twitter_cldr/version.rb +2 -2
- data/lib/twitter_cldr.rb +9 -8
- data/resources/collation/FractionalUCA_SHORT.txt +41593 -0
- data/resources/locales/af/calendars.yml +164 -0
- data/resources/locales/af/languages.yml +173 -0
- data/resources/locales/af/numbers.yml +42 -0
- data/resources/locales/af/plurals.yml +2 -0
- data/resources/locales/af/units.yml +88 -0
- data/resources/locales/ar/calendars.yml +9 -0
- data/resources/locales/ar/numbers.yml +15 -2
- data/resources/locales/ca/calendars.yml +228 -0
- data/resources/locales/ca/languages.yml +510 -0
- data/resources/locales/ca/numbers.yml +43 -0
- data/resources/locales/ca/plurals.yml +2 -0
- data/resources/locales/ca/units.yml +93 -0
- data/resources/locales/cs/calendars.yml +229 -0
- data/resources/locales/cs/languages.yml +471 -0
- data/resources/locales/cs/numbers.yml +44 -0
- data/resources/locales/cs/plurals.yml +2 -0
- data/resources/locales/cs/units.yml +114 -0
- data/resources/locales/da/calendars.yml +10 -0
- data/resources/locales/da/numbers.yml +13 -0
- data/resources/locales/de/calendars.yml +9 -0
- data/resources/locales/de/numbers.yml +13 -0
- data/resources/locales/el/calendars.yml +227 -0
- data/resources/locales/el/languages.yml +519 -0
- data/resources/locales/el/numbers.yml +42 -0
- data/resources/locales/el/plurals.yml +2 -0
- data/resources/locales/el/units.yml +107 -0
- data/resources/locales/en/calendars.yml +10 -0
- data/resources/locales/en/numbers.yml +13 -0
- data/resources/locales/es/calendars.yml +9 -0
- data/resources/locales/es/numbers.yml +13 -0
- data/resources/locales/eu/calendars.yml +173 -0
- data/resources/locales/eu/languages.yml +161 -0
- data/resources/locales/eu/numbers.yml +43 -0
- data/resources/locales/eu/plurals.yml +2 -0
- data/resources/locales/eu/units.yml +91 -0
- data/resources/locales/fa/calendars.yml +10 -0
- data/resources/locales/fa/numbers.yml +13 -0
- data/resources/locales/fi/calendars.yml +10 -0
- data/resources/locales/fi/numbers.yml +14 -1
- data/resources/locales/fil/calendars.yml +8 -0
- data/resources/locales/fil/numbers.yml +13 -0
- data/resources/locales/fr/calendars.yml +9 -0
- data/resources/locales/fr/numbers.yml +14 -1
- data/resources/locales/he/calendars.yml +9 -0
- data/resources/locales/he/numbers.yml +13 -0
- data/resources/locales/hi/calendars.yml +8 -0
- data/resources/locales/hi/numbers.yml +13 -0
- data/resources/locales/hu/calendars.yml +10 -0
- data/resources/locales/hu/numbers.yml +15 -2
- data/resources/locales/id/calendars.yml +8 -0
- data/resources/locales/id/numbers.yml +16 -3
- data/resources/locales/it/calendars.yml +9 -0
- data/resources/locales/it/numbers.yml +13 -0
- data/resources/locales/ja/calendars.yml +9 -0
- data/resources/locales/ja/numbers.yml +13 -0
- data/resources/locales/ko/calendars.yml +9 -0
- data/resources/locales/ko/numbers.yml +13 -0
- data/resources/locales/ms/calendars.yml +8 -0
- data/resources/locales/ms/numbers.yml +16 -3
- data/resources/locales/nb/calendars.yml +234 -0
- data/resources/locales/{no → nb}/languages.yml +25 -4
- data/resources/locales/nb/numbers.yml +43 -0
- data/resources/locales/nb/plurals.yml +2 -0
- data/resources/locales/nb/units.yml +87 -0
- data/resources/locales/nl/calendars.yml +10 -0
- data/resources/locales/nl/numbers.yml +13 -0
- data/resources/locales/pl/calendars.yml +9 -0
- data/resources/locales/pl/numbers.yml +14 -1
- data/resources/locales/pt/calendars.yml +9 -0
- data/resources/locales/pt/numbers.yml +13 -0
- data/resources/locales/ru/calendars.yml +10 -0
- data/resources/locales/ru/numbers.yml +14 -1
- data/resources/locales/sv/calendars.yml +10 -0
- data/resources/locales/sv/numbers.yml +14 -1
- data/resources/locales/th/calendars.yml +67 -57
- data/resources/locales/th/numbers.yml +13 -0
- data/resources/locales/tr/calendars.yml +9 -0
- data/resources/locales/tr/numbers.yml +13 -0
- data/resources/locales/uk/calendars.yml +199 -0
- data/resources/locales/uk/languages.yml +519 -0
- data/resources/locales/uk/numbers.yml +45 -0
- data/resources/locales/uk/plurals.yml +2 -0
- data/resources/locales/uk/units.yml +135 -0
- data/resources/locales/ur/calendars.yml +9 -0
- data/resources/locales/ur/numbers.yml +13 -0
- data/resources/locales/zh/calendars.yml +8 -0
- data/resources/locales/zh/numbers.yml +13 -0
- data/resources/locales/zh-Hant/calendars.yml +8 -0
- data/resources/locales/zh-Hant/numbers.yml +16 -3
- data/resources/locales/zh-Hant/plurals.yml +2 -0
- data/resources/unicode_data/hangul_blocks.yml +21 -0
- data/spec/collation/CollationTest_CLDR_NON_IGNORABLE_Short.txt +714 -0
- data/spec/collation/collation_spec.rb +93 -0
- data/spec/collation/collator_spec.rb +117 -0
- data/spec/collation/implicit_collation_elements_spec.rb +24 -0
- data/spec/collation/sort_key_spec.rb +56 -0
- data/spec/collation/trie_builder_spec.rb +114 -0
- data/spec/collation/trie_spec.rb +97 -0
- data/spec/core_ext/calendars/datetime_spec.rb +5 -0
- data/spec/core_ext/calendars_spec.rb +34 -0
- data/spec/core_ext/numbers_spec.rb +39 -0
- data/spec/core_ext/string_spec.rb +4 -4
- data/spec/formatters/calendars/datetime_formatter_spec.rb +92 -2
- data/spec/{normalizers → normalization}/NormalizationTestShort.txt +0 -0
- data/spec/{normalizers → normalization}/base_spec.rb +1 -1
- data/spec/normalization/hangul_spec.rb +42 -0
- data/spec/{normalizers → normalization}/normalization_spec.rb +15 -16
- data/spec/readme_spec.rb +2 -2
- data/spec/shared/code_point_spec.rb +42 -30
- data/spec/shared/resources_spec.rb +30 -6
- data/spec/tokenizers/base_spec.rb +17 -0
- data/spec/twitter_cldr_spec.rb +1 -1
- metadata +71 -83
- data/lib/twitter_cldr/normalizers/base.rb +0 -34
- data/lib/twitter_cldr/normalizers.rb +0 -14
- data/resources/locales/no/calendars.yml +0 -127
- data/resources/locales/no/numbers.yml +0 -29
- data/resources/locales/no/plurals.yml +0 -1
- data/resources/unicode_data/blocks_hangul.yml +0 -46
- data/spec/normalizers/NormalizationTest.txt +0 -18431
data/NOTICE
CHANGED
@@ -2,7 +2,7 @@ twitter-cldr-rb is a Ruby implementation of the Common Locale Data Repository
|
|
2
2
|
Copyright (C) 2012 Twitter, Inc.
|
3
3
|
|
4
4
|
|
5
|
-
Portions of this gem were borrowed from Sven Fuchs' ruby-cldr gem.
|
5
|
+
Portions of this gem were borrowed from Sven Fuchs' ruby-cldr gem. Here is
|
6
6
|
the license that accompanied Mr. Fuchs' code:
|
7
7
|
|
8
8
|
Copyright (c) 2009 Sven Fuchs
|
@@ -117,4 +117,38 @@ the software.
|
|
117
117
|
6. THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR
|
118
118
|
IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
|
119
119
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
120
|
-
PURPOSE.
|
120
|
+
PURPOSE.
|
121
|
+
|
122
|
+
|
123
|
+
TwitterCldr::Collation::ImplicitCollationElements module was ported from
|
124
|
+
the ICU4J library (http://site.icu-project.org). Here is its license:
|
125
|
+
|
126
|
+
ICU License - ICU 1.8.1 and later
|
127
|
+
|
128
|
+
COPYRIGHT AND PERMISSION NOTICE
|
129
|
+
|
130
|
+
Copyright (c) 1995-2012 International Business Machines Corporation and others
|
131
|
+
|
132
|
+
All rights reserved.
|
133
|
+
|
134
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
135
|
+
this software and associated documentation files (the "Software"), to deal in
|
136
|
+
the Software without restriction, including without limitation the rights to use,
|
137
|
+
copy, modify, merge, publish, distribute, and/or sell copies of the Software,
|
138
|
+
and to permit persons to whom the Software is furnished to do so, provided that
|
139
|
+
the above copyright notice(s) and this permission notice appear in all copies
|
140
|
+
of the Software and that both the above copyright notice(s) and this permission
|
141
|
+
notice appear in supporting documentation.
|
142
|
+
|
143
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
|
144
|
+
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
145
|
+
PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL
|
146
|
+
THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM,
|
147
|
+
OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
|
148
|
+
RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
|
149
|
+
NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
|
150
|
+
OR PERFORMANCE OF THIS SOFTWARE.
|
151
|
+
|
152
|
+
Except as contained in this notice, the name of a copyright holder shall not be
|
153
|
+
used in advertising or otherwise to promote the sale, use or other dealings in
|
154
|
+
this Software without prior written authorization of the copyright holder.
|
data/README.md
CHANGED
@@ -300,7 +300,7 @@ TwitterCldr::Utils::CodePoints.to_string(["00BF"]) # "¿"
|
|
300
300
|
Normalize/decompose a Unicode string (NFD, NFKD, NFC, and NFKC implementations available). Note that the normalized string will almost always look the same as the original string because most character display systems automatically combine decomposed characters.
|
301
301
|
|
302
302
|
```ruby
|
303
|
-
TwitterCldr::
|
303
|
+
TwitterCldr::Normalization::NFD.normalize("français") # "français"
|
304
304
|
```
|
305
305
|
|
306
306
|
Normalization is easier to see in hex:
|
@@ -310,7 +310,7 @@ Normalization is easier to see in hex:
|
|
310
310
|
TwitterCldr::Utils::CodePoints.from_string("español")
|
311
311
|
|
312
312
|
# ["0065", "0073", "0070", "0061", "006E", "0303", "006F", "006C"]
|
313
|
-
TwitterCldr::Utils::CodePoints.from_string(TwitterCldr::
|
313
|
+
TwitterCldr::Utils::CodePoints.from_string(TwitterCldr::Normalization::NFD.normalize("español"))
|
314
314
|
```
|
315
315
|
|
316
316
|
Notice in the example above that the letter "ñ" was transformed from `00F1` to `006E 0303`, which represent the "n" and the "˜" respectively.
|
@@ -0,0 +1,143 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
module TwitterCldr
|
7
|
+
module Collation
|
8
|
+
|
9
|
+
# Collator uses the fractional collation elements table from CLDR to generate sort keys for Unicode strings as well as
|
10
|
+
# compare and sort such strings by generated sort keys.
|
11
|
+
#
|
12
|
+
class Collator

  FRACTIONAL_UCA_SHORT_RESOURCE = 'collation/FractionalUCA_SHORT.txt'

  # Returns a new array holding `strings` in collation order.
  #
  # A comparison key is computed once per string (not once per comparison) and the strings are then ordered
  # with `compare_keys`.
  #
  def sort(strings)
    strings.map{ |s| [s, comparison_key(s)] }.sort{ |a, b| compare_keys(a[1], b[1]) }.map(&:first)
  end

  # Compares two strings by their sort keys. Returns 0 for equal strings without building any keys, otherwise
  # the result of `compare_keys` for the two comparison keys.
  #
  def compare(string_a, string_b)
    string_a == string_b ? 0 : compare_keys(comparison_key(string_a), comparison_key(string_b))
  end

  # Builds a sort key for a string or for an array of code points.
  #
  # Arguments:
  #
  #   string_or_code_points - a String, or an array of hexadecimal code point strings (anything that is not a
  #                           String is passed to the normalizer as a code points array).
  #
  # Returns whatever TwitterCldr::Collation::SortKey.build returns for the collation elements of the input.
  #
  def sort_key(string_or_code_points)
    sort_key_for_code_points(get_code_points(string_or_code_points))
  end

  # Collation elements trie used by this instance. Defaults to the trie shared by all collators.
  #
  def trie
    @trie ||= self.class.trie
  end

  # Trie loaded from FRACTIONAL_UCA_SHORT_RESOURCE, memoized on the class so that it is built only once.
  #
  def self.trie
    @trie ||= TwitterCldr::Collation::TrieBuilder.load_trie(FRACTIONAL_UCA_SHORT_RESOURCE)
  end

  private

  # Builds a hash with the original code points of the string (used as a tie-breaker) and its sort key.
  #
  def comparison_key(string)
    code_points = TwitterCldr::Utils::CodePoints.from_string(string)
    { :code_points => code_points, :sort_key => sort_key(code_points) }
  end

  # Compares two comparison keys: by sort keys first and, only if those are equal, by the integer values of the
  # original code points. `<=>` binds tighter than `||`, so the right-hand comparison is evaluated lazily.
  #
  def compare_keys(a, b)
    (a[:sort_key] <=> b[:sort_key]).nonzero? || get_integer_code_points(a[:code_points]) <=> get_integer_code_points(b[:code_points])
  end

  def sort_key_for_code_points(integer_code_points)
    TwitterCldr::Collation::SortKey.build(get_collation_elements(integer_code_points))
  end

  # Converts an array of hexadecimal code point strings into an array of integers.
  #
  def get_integer_code_points(code_points)
    code_points.map { |code_point| code_point.to_i(16) }
  end

  # Collects collation elements for the whole sequence.
  #
  # NOTE (side-effect): the input array is consumed - it is empty when the method returns.
  #
  def get_collation_elements(integer_code_points)
    result = []
    result.concat(code_point_collation_elements(integer_code_points)) until integer_code_points.empty?
    result
  end

  # Returns NFD-normalized integer code points for a string or for an array of hexadecimal code points.
  #
  def get_code_points(str_or_code_points)
    code_points = str_or_code_points.is_a?(String) ? TwitterCldr::Utils::CodePoints.from_string(str_or_code_points) : str_or_code_points

    # Normalization makes the collation process significantly slower (like seven times slower on the UCA
    # non-ignorable test from CollationTest_NON_IGNORABLE.txt). ICU uses some optimizations to apply normalization
    # only in special, rare cases. We need to investigate possible solutions and do normalization cleverly too.
    code_points = TwitterCldr::Normalization::NFD.normalize_code_points(code_points)

    get_integer_code_points(code_points)
  end

  # Returns the first sequence of fractional collation elements for an array of integer code points. Returned value
  # is an array of well formed (including weights for all significant levels) integer arrays.
  #
  # NOTE (side-effect): all used code points are removed from the input array.
  #
  def code_point_collation_elements(integer_code_points)
    explicit_collation_elements(integer_code_points) || implicit_collation_elements(integer_code_points)
  end

  # Tries to build explicit collation elements array for the longest code points prefix in the given sequence. When
  # possible, combines this prefix with (not necessarily subsequent) non-starters that follow it in the sequence.
  # That's necessary because canonical ordering (that is performed during normalization) can break contractions
  # that existed in the original, de-normalized string.
  #
  # Returns nil if no prefix of the sequence has explicit collation elements.
  #
  # NOTE (side-effect): all used code points are removed from the input array.
  #
  # For more information see section '4.2 Produce Array' of the main algorithm at http://www.unicode.org/reports/tr10/#Main_Algorithm
  #
  def explicit_collation_elements(integer_code_points)
    # find the longest prefix that actually has collation elements in the trie
    collation_elements, suffixes, prefix_size = longest_explicit_prefix(integer_code_points)

    return unless collation_elements

    # remove prefix from the code points sequence
    integer_code_points.shift(prefix_size)

    non_starter_pos = 0

    used_combining_classes = {}

    while non_starter_pos < integer_code_points.size && !suffixes.empty?
      # create a trie from a hash of suffixes available for the chosen prefix
      subtrie = TwitterCldr::Collation::Trie.new(suffixes)

      # get next code point (possibly non-starter)
      non_starter_code_point = integer_code_points[non_starter_pos]

      # NOTE(review): to_s(16) produces unpadded lowercase hex (e.g. "301"), while code points elsewhere look like
      # "0301" - confirm that combining_class_for accepts this form.
      combining_class = TwitterCldr::Normalization::Base.combining_class_for(non_starter_code_point.to_s(16))

      # code point is a starter or combining class has been already used (non-starter is 'blocked' from the prefix)
      break if combining_class == 0 || used_combining_classes[combining_class]

      used_combining_classes[combining_class] = true

      # Try to find collation elements for [prefix + non-starter] code points sequence. As the subtrie contains
      # suffixes (without prefix) we pass only non-starter itself.
      new_collation_elements, new_suffixes = subtrie.find_prefix([non_starter_code_point]).first(2)

      if new_collation_elements
        # non-starter with a collation elements sequence corresponding to [prefix + non-starter] accepted
        collation_elements = new_collation_elements
        suffixes = new_suffixes

        # Remove non-starter from its position in the sequence. Then we can move further from the same position.
        integer_code_points.delete_at(non_starter_pos)
      else
        # move to the next code point
        non_starter_pos += 1
      end
    end

    collation_elements
  end

  # Finds the longest prefix of the sequence that has collation elements in the trie.
  #
  # Trie#find_prefix stops at the deepest matching node, which may be just an interior node of a longer
  # contraction and carry no value. In that case we back off one code point at a time until a node with a value
  # is found, so that a shorter explicit entry is not lost in favour of implicit weights.
  #
  # Returns the same three elements array as Trie#find_prefix: collation elements (nil if nothing matched),
  # suffixes hash and prefix size.
  #
  def longest_explicit_prefix(integer_code_points)
    collation_elements, suffixes, prefix_size = trie.find_prefix(integer_code_points)

    until collation_elements || prefix_size <= 1
      collation_elements, suffixes, prefix_size = trie.find_prefix(integer_code_points.first(prefix_size - 1))
    end

    [collation_elements, suffixes, prefix_size]
  end

  # Builds implicit collation elements for the first code point of the sequence.
  #
  # NOTE (side-effect): the first code point is removed from the input array.
  #
  def implicit_collation_elements(integer_code_points)
    TwitterCldr::Collation::ImplicitCollationElements.for_code_point(integer_code_points.shift)
  end

end
|
141
|
+
|
142
|
+
end
|
143
|
+
end
|
@@ -0,0 +1,188 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
module TwitterCldr
|
7
|
+
module Collation
|
8
|
+
|
9
|
+
# ImplicitCollationElements generates implicit collation elements for code points (including some CJK characters)
|
10
|
+
# that are not explicitly mentioned in the collation elements table.
|
11
|
+
#
|
12
|
+
# This module was ported from the ICU4J library (ImplicitCEGenerator class). See NOTICE file for license information.
|
13
|
+
#
|
14
|
+
module ImplicitCollationElements

  # weight used on both the secondary and the tertiary level of every implicit collation element
  DEFAULT_SECONDARY_AND_TERTIARY = 5

  # primary value
  MIN_PRIMARY = 0xE0
  MAX_PRIMARY = 0xe4

  # final byte
  MIN_TRAIL = 0x04
  MAX_TRAIL = 0xFE

  # gap for tailoring of 3-byte forms
  GAP_3 = 1

  # number of 3-byte primaries that can be used
  PRIMARIES_3_COUNT = 1

  # 2 * [Unicode range] + 2
  MAX_INPUT = 0x220001

  # medials can use full range
  MEDIAL_COUNT = MAX_TRAIL - MIN_TRAIL + 1

  # number of values we can use in trailing bytes
  # leave room for empty values between AND above, e.g., if gap = 2
  # range 3..7 => +3 -4 -5 -6 -7: so 1 value
  # range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
  # range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
  FINAL_3_MULTIPLIER = GAP_3 + 1
  FINAL_3_COUNT      = MEDIAL_COUNT / FINAL_3_MULTIPLIER

  # find out how many values fit in each form
  THREE_BYTE_COUNT = MEDIAL_COUNT * FINAL_3_COUNT

  # now determine where the 3/4 boundary is
  # we use 3 bytes below the boundary, and 4 above
  PRIMARIES_AVAILABLE = MAX_PRIMARY - MIN_PRIMARY + 1
  PRIMARIES_4_COUNT   = PRIMARIES_AVAILABLE - PRIMARIES_3_COUNT
  MIN_4_PRIMARY       = MIN_PRIMARY + PRIMARIES_3_COUNT
  MIN_4_BOUNDARY      = PRIMARIES_3_COUNT * THREE_BYTE_COUNT

  TOTAL_NEEDED            = MAX_INPUT - MIN_4_BOUNDARY
  NEEDED_PER_PRIMARY_BYTE = (TOTAL_NEEDED - 1) / PRIMARIES_4_COUNT + 1
  NEEDED_PER_FINAL_BYTE   = (NEEDED_PER_PRIMARY_BYTE - 1) / (MEDIAL_COUNT * MEDIAL_COUNT) + 1

  GAP_4 = (MAX_TRAIL - MIN_TRAIL - 1) / NEEDED_PER_FINAL_BYTE

  FINAL_4_MULTIPLIER = GAP_4 + 1
  FINAL_4_COUNT      = NEEDED_PER_FINAL_BYTE

  # CJK constants

  NON_CJK_OFFSET = 0x110000

  CJK_COMPAT_USED_BASE  = 0xFA0E
  CJK_COMPAT_USED_LIMIT = 0xFA2F + 1

  CJK_BASE  = 0x4E00     # 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
  CJK_LIMIT = 0x9FCC + 1 # 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;

  CJK_A_BASE  = 0x3400     # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
  CJK_A_LIMIT = 0x4DB5 + 1 # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;

  CJK_B_BASE  = 0x20000     # 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
  CJK_B_LIMIT = 0x2A6D6 + 1 # 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;

  CJK_C_BASE  = 0x2A700     # 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
  CJK_C_LIMIT = 0x2B734 + 1 # 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;

  CJK_D_BASE  = 0x2B740     # 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
  CJK_D_LIMIT = 0x2B81D + 1 # 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;

  class << self

    # Returns implicit collation elements for an integer code point: an array with a single
    # [primary, secondary, tertiary] element, where the secondary and the tertiary weights are both
    # DEFAULT_SECONDARY_AND_TERTIARY.
    #
    def for_code_point(code_point)
      primary = primary_weight(swapCJK(code_point) + 1)

      [[primary, DEFAULT_SECONDARY_AND_TERTIARY, DEFAULT_SECONDARY_AND_TERTIARY]]
    end

    private

    # Generates the primary weight of the implicit CE for a given (already reordered) code point. Values below
    # MIN_4_BOUNDARY get a 3-byte weight, all the others get a 4-byte one.
    #
    def primary_weight(code_point)
      above_boundary = code_point - MIN_4_BOUNDARY

      return four_byte_weight(above_boundary) unless above_boundary < 0

      three_byte_weight(code_point)
    end

    # Packs a value into a [lead, medial, final] 3-byte weight.
    #
    def three_byte_weight(value)
      rest, final  = value.divmod(FINAL_3_COUNT)
      lead, medial = rest.divmod(MEDIAL_COUNT)

      # spread out, leaving gap at start
      final = MIN_TRAIL + final * FINAL_3_MULTIPLIER

      # lead and medial bytes are offset by their minimum values
      ((lead + MIN_PRIMARY) << 16) + ((medial + MIN_TRAIL) << 8) + final
    end

    # Packs a value (counted from the 3/4 boundary) into a [lead, medial, medial, final] 4-byte weight.
    #
    def four_byte_weight(value)
      rest, final       = value.divmod(FINAL_4_COUNT)
      rest, low_medial  = rest.divmod(MEDIAL_COUNT)
      lead, high_medial = rest.divmod(MEDIAL_COUNT)

      # spread out, leaving gap at start
      final = MIN_TRAIL + final * FINAL_4_MULTIPLIER

      # lead and medial bytes are offset by their minimum values
      ((lead + MIN_4_PRIMARY) << 24) + ((high_medial + MIN_TRAIL) << 16) + ((low_medial + MIN_TRAIL) << 8) + final
    end

    # Method used to:
    #   a) collapse two different Han ranges from UCA into one (in the right order)
    #   b) bump any non-CJK characters by NON_CJK_OFFSET.
    #
    # The relevant blocks are:
    #   A: 4E00..9FFF; CJK Unified Ideographs
    #      F900..FAFF; CJK Compatibility Ideographs
    #   B: 3400..4DBF; CJK Unified Ideographs Extension A
    #      20000..XX;  CJK Unified Ideographs Extension B (and others later on)
    #
    # As long as
    #   no new B characters are allocated between 4E00 and FAFF, and
    #   no new A characters are outside of this range,
    # (very high probability) this simple code will work.
    #
    # The reordered blocks are:
    #   Block1 is CJK
    #   Block2 is CJK_COMPAT_USED
    #   Block3 is CJK_A
    # (all contiguous)
    #
    # Any other CJK gets its normal code point.
    #
    # When we reorder Block1, we make sure that it is at the very start, so that it will use a 3-byte form.
    #
    def swapCJK(code_point)
      cjk_size        = CJK_LIMIT - CJK_BASE
      compat_cjk_size = CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE

      case code_point
      when CJK_BASE...CJK_LIMIT
        code_point - CJK_BASE
      when CJK_COMPAT_USED_BASE...CJK_COMPAT_USED_LIMIT
        code_point - CJK_COMPAT_USED_BASE + cjk_size
      when CJK_A_BASE...CJK_A_LIMIT
        code_point - CJK_A_BASE + cjk_size + compat_cjk_size
      when CJK_B_BASE...CJK_B_LIMIT, CJK_C_BASE...CJK_C_LIMIT, CJK_D_BASE...CJK_D_LIMIT
        code_point # non-BMP-CJK
      else
        code_point + NON_CJK_OFFSET # non-CJK
      end
    end

  end

end
|
186
|
+
|
187
|
+
end
|
188
|
+
end
|
@@ -0,0 +1,199 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
module TwitterCldr
|
7
|
+
module Collation
|
8
|
+
|
9
|
+
# SortKey builds a collation sort key from an array of collation elements.
|
10
|
+
#
|
11
|
+
class SortKey

  PRIMARY_LEVEL, SECONDARY_LEVEL, TERTIARY_LEVEL = 0, 1, 2

  LEVEL_SEPARATOR = 1 # levels in a sort key are separated with '01' bytes

  TERTIARY_LEVEL_MASK = 0x3F # mask for removing case bits from tertiary weight ('CC' bits in 'CC00 0000')

  # Secondary level compression constants

  SECONDARY_BOTTOM       = 0x05
  SECONDARY_TOP          = 0x86
  SECONDARY_PROPORTION   = 0.5
  SECONDARY_COMMON       = SECONDARY_BOTTOM
  SECONDARY_TOTAL_COUNT  = SECONDARY_TOP - SECONDARY_BOTTOM - 1
  SECONDARY_TOP_COUNT    = (SECONDARY_PROPORTION * SECONDARY_TOTAL_COUNT).to_i
  SECONDARY_BOTTOM_COUNT = SECONDARY_TOTAL_COUNT - SECONDARY_TOP_COUNT

  SECONDARY_COMMON_SPACE = {
    :common       => SECONDARY_COMMON,
    :bottom       => SECONDARY_BOTTOM,
    :bottom_count => SECONDARY_BOTTOM_COUNT,
    :top          => SECONDARY_TOP,
    :top_count    => SECONDARY_TOP_COUNT
  }

  # Tertiary level compression constants

  TERTIARY_TOP_ADDITION = 0x80

  TERTIARY_BOTTOM       = 0x05
  TERTIARY_TOP          = 0x85
  TERTIARY_PROPORTION   = 0.667
  TERTIARY_COMMON       = TERTIARY_BOTTOM
  TERTIARY_TOTAL_COUNT  = TERTIARY_TOP - TERTIARY_BOTTOM - 1
  TERTIARY_TOP_COUNT    = (TERTIARY_PROPORTION * TERTIARY_TOTAL_COUNT).to_i
  TERTIARY_BOTTOM_COUNT = TERTIARY_TOTAL_COUNT - TERTIARY_TOP_COUNT

  TERTIARY_COMMON_SPACE = {
    :common       => TERTIARY_COMMON,
    :bottom       => TERTIARY_BOTTOM,
    :bottom_count => TERTIARY_BOTTOM_COUNT,
    :top          => TERTIARY_TOP,
    :top_count    => TERTIARY_TOP_COUNT
  }

  attr_reader :collation_elements

  # Builds a sort key (an array of bytes) for the given collation elements.
  #
  # Arguments:
  #
  #   collation_elements - an array of collation elements, represented as arrays of integer weights.
  #
  # A throw-away instance is used only as a holder of the intermediate state, so that it doesn't have to be
  # passed from one helper method to another.
  #
  def self.build(collation_elements)
    new(collation_elements).bytes_array
  end

  # Arguments:
  #
  #   collation_elements - an array of collation elements, represented as arrays of integer weights.
  #
  def initialize(collation_elements)
    @collation_elements = collation_elements
  end

  # Sort key bytes; built on the first call and memoized.
  #
  def bytes_array
    @bytes_array ||= build_bytes_array
  end

  private

  # Primary bytes, then a separator followed by compressed secondary bytes, then a separator followed by
  # compressed tertiary bytes.
  #
  def build_bytes_array
    @bytes_array = []

    append_primary_bytes
    append_secondary_bytes
    append_tertiary_bytes

    @bytes_array
  end

  # Primary weights are written as they are, byte by byte.
  #
  def append_primary_bytes
    @collation_elements.each do |element|
      @bytes_array.concat(fixnum_to_bytes_array(level_weight(element, PRIMARY_LEVEL)))
    end
  end

  def append_secondary_bytes
    append_compressed_level(SECONDARY_COMMON_SPACE, 0) { |element| level_weight(element, SECONDARY_LEVEL) }
  end

  # Bytes above TERTIARY_COMMON are shifted by TERTIARY_TOP_ADDITION to create a gap above TERTIARY_COMMON.
  #
  def append_tertiary_bytes
    append_compressed_level(TERTIARY_COMMON_SPACE, TERTIARY_TOP_ADDITION) { |element| tertiary_weight(element) }
  end

  # Appends a level separator followed by the bytes of one level, taking the weight of every collation element
  # from the given block. Runs of the common byte are not written one by one: they are counted and written in a
  # compressed form as soon as a different byte (or the end of the level) is met.
  #
  def append_compressed_level(space, top_addition)
    @bytes_array << LEVEL_SEPARATOR
    @common_count = 0

    @collation_elements.each do |element|
      fixnum_to_bytes_array(yield(element)).each do |byte|
        if byte == space[:common]
          @common_count += 1
          next
        end

        above_common = byte > space[:common]
        byte += top_addition if above_common

        if @common_count > 0
          # a pending run is encoded from the side of the common space that is closer to the next byte
          if above_common
            append_common_bytes(space[:top], space[:top_count], true)
          else
            append_common_bytes(space[:bottom], space[:bottom_count], false)
          end
        end

        @bytes_array << byte
      end
    end

    # append compressed trailing common bytes
    append_common_bytes(space[:bottom], space[:bottom_count], false) if @common_count > 0
  end

  # Writes the pending run of @common_count common bytes and resets the counter. Every full chunk of
  # `count_limit` bytes is written as a single byte `count_limit` away from the boundary, and the rest of the run
  # (from 1 to `count_limit` bytes) as a single byte (rest - 1) away from it. Bytes are counted down from the
  # boundary when `top` is true and up otherwise.
  #
  def append_common_bytes(boundary, count_limit, top)
    direction = top ? -1 : 1

    full_chunks, rest = (@common_count - 1).divmod(count_limit)

    full_chunks.times { @bytes_array << boundary + direction * count_limit }
    @bytes_array << boundary + direction * rest

    @common_count = 0
  end

  # Tertiary weight without case bits.
  #
  def tertiary_weight(collation_element)
    level_weight(collation_element, TERTIARY_LEVEL) & TERTIARY_LEVEL_MASK
  end

  # Weight of the collation element on the given level (0 if the element doesn't have one).
  #
  def level_weight(collation_element, level)
    collation_element[level] || 0
  end

  # Splits a non-negative integer into an array of its bytes, most significant byte first. Returns an empty
  # array for 0.
  #
  def fixnum_to_bytes_array(number)
    little_endian = []

    while number > 0
      little_endian << (number & 0xFF)
      number >>= 8
    end

    little_endian.reverse
  end

end
|
197
|
+
|
198
|
+
end
|
199
|
+
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
module TwitterCldr
|
7
|
+
module Collation
|
8
|
+
|
9
|
+
# This class represents a trie - a tree data structure, also known as a prefix tree.
|
10
|
+
#
|
11
|
+
# Every node corresponds to a single character of the key. To find the value by key one goes down the trie
|
12
|
+
# starting from the root and descending one character at a time. If at some level current node doesn't have a
|
13
|
+
# child corresponding to the next character of the key, then the trie doesn't contain a value with the given key.
|
14
|
+
# Otherwise, the final node, corresponding to the last character of the key, should contain the value. If it's
|
15
|
+
# nil, then the trie doesn't contain a value with the given key (or the value itself is nil).
|
16
|
+
#
|
17
|
+
class Trie

  # Creates a trie. Every node is a two elements array: [value, hash of children by key element]. When
  # `trie_hash` is given it becomes the children hash of the root (it is used as is, not copied). Usually, such a
  # hash is taken from another trie and represents one of its sub-tries.
  #
  def initialize(trie_hash = {})
    @root = [nil, trie_hash]
  end

  # Stores `value` under `key` (a sequence of key elements), creating all missing nodes on the way.
  # Returns the value.
  #
  def add(key, value)
    node = @root

    key.each do |key_element|
      node = (node[1][key_element] ||= [nil, {}])
    end

    node[0] = value
  end

  # Returns the value stored under `key`, or nil if the trie has no node for the whole key.
  #
  def get(key)
    node = @root

    key.each do |key_element|
      node = node[1][key_element]
      return unless node
    end

    node[0]
  end

  # Walks down the trie along the `key` for as long as the next key element has a matching child node, i.e.,
  # finds the longest prefix of the `key` that corresponds to a node in the trie.
  #
  # Returns a three elements array:
  #
  #   1. value in the last node that was visited (nil if that node holds no value)
  #   2. sub-trie of this node (as a hash)
  #   3. size of the `key` prefix that matches this node
  #
  def find_prefix(key)
    node    = @root
    matched = 0

    key.each do |key_element|
      child = node[1][key_element]
      break unless child

      node = child
      matched += 1
    end

    node + [matched]
  end

end
|
71
|
+
|
72
|
+
end
|
73
|
+
end
|