twitter_cldr 1.4.1 → 1.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (145) hide show
  1. data/NOTICE +36 -2
  2. data/README.md +2 -2
  3. data/lib/twitter_cldr/collation/collator.rb +143 -0
  4. data/lib/twitter_cldr/collation/implicit_collation_elements.rb +188 -0
  5. data/lib/twitter_cldr/collation/sort_key.rb +199 -0
  6. data/lib/twitter_cldr/collation/trie.rb +73 -0
  7. data/lib/twitter_cldr/collation/trie_builder.rb +56 -0
  8. data/lib/twitter_cldr/collation.rb +14 -0
  9. data/lib/twitter_cldr/core_ext/localized_object.rb +3 -2
  10. data/lib/twitter_cldr/core_ext/string.rb +1 -1
  11. data/lib/twitter_cldr/formatters/calendars/datetime_formatter.rb +89 -72
  12. data/lib/twitter_cldr/normalization/base.rb +22 -0
  13. data/lib/twitter_cldr/normalization/hangul.rb +68 -0
  14. data/lib/twitter_cldr/{normalizers → normalization}/nfc.rb +2 -2
  15. data/lib/twitter_cldr/{normalizers → normalization}/nfd.rb +1 -1
  16. data/lib/twitter_cldr/{normalizers → normalization}/nfkc.rb +5 -17
  17. data/lib/twitter_cldr/{normalizers → normalization}/nfkd.rb +3 -18
  18. data/lib/twitter_cldr/normalization.rb +15 -0
  19. data/lib/twitter_cldr/shared/code_point.rb +5 -3
  20. data/lib/twitter_cldr/tokenizers/base.rb +15 -1
  21. data/lib/twitter_cldr/tokenizers/calendars/datetime_tokenizer.rb +6 -1
  22. data/lib/twitter_cldr/utils/code_points.rb +1 -1
  23. data/lib/twitter_cldr/version.rb +2 -2
  24. data/lib/twitter_cldr.rb +9 -8
  25. data/resources/collation/FractionalUCA_SHORT.txt +41593 -0
  26. data/resources/locales/af/calendars.yml +164 -0
  27. data/resources/locales/af/languages.yml +173 -0
  28. data/resources/locales/af/numbers.yml +42 -0
  29. data/resources/locales/af/plurals.yml +2 -0
  30. data/resources/locales/af/units.yml +88 -0
  31. data/resources/locales/ar/calendars.yml +9 -0
  32. data/resources/locales/ar/numbers.yml +15 -2
  33. data/resources/locales/ca/calendars.yml +228 -0
  34. data/resources/locales/ca/languages.yml +510 -0
  35. data/resources/locales/ca/numbers.yml +43 -0
  36. data/resources/locales/ca/plurals.yml +2 -0
  37. data/resources/locales/ca/units.yml +93 -0
  38. data/resources/locales/cs/calendars.yml +229 -0
  39. data/resources/locales/cs/languages.yml +471 -0
  40. data/resources/locales/cs/numbers.yml +44 -0
  41. data/resources/locales/cs/plurals.yml +2 -0
  42. data/resources/locales/cs/units.yml +114 -0
  43. data/resources/locales/da/calendars.yml +10 -0
  44. data/resources/locales/da/numbers.yml +13 -0
  45. data/resources/locales/de/calendars.yml +9 -0
  46. data/resources/locales/de/numbers.yml +13 -0
  47. data/resources/locales/el/calendars.yml +227 -0
  48. data/resources/locales/el/languages.yml +519 -0
  49. data/resources/locales/el/numbers.yml +42 -0
  50. data/resources/locales/el/plurals.yml +2 -0
  51. data/resources/locales/el/units.yml +107 -0
  52. data/resources/locales/en/calendars.yml +10 -0
  53. data/resources/locales/en/numbers.yml +13 -0
  54. data/resources/locales/es/calendars.yml +9 -0
  55. data/resources/locales/es/numbers.yml +13 -0
  56. data/resources/locales/eu/calendars.yml +173 -0
  57. data/resources/locales/eu/languages.yml +161 -0
  58. data/resources/locales/eu/numbers.yml +43 -0
  59. data/resources/locales/eu/plurals.yml +2 -0
  60. data/resources/locales/eu/units.yml +91 -0
  61. data/resources/locales/fa/calendars.yml +10 -0
  62. data/resources/locales/fa/numbers.yml +13 -0
  63. data/resources/locales/fi/calendars.yml +10 -0
  64. data/resources/locales/fi/numbers.yml +14 -1
  65. data/resources/locales/fil/calendars.yml +8 -0
  66. data/resources/locales/fil/numbers.yml +13 -0
  67. data/resources/locales/fr/calendars.yml +9 -0
  68. data/resources/locales/fr/numbers.yml +14 -1
  69. data/resources/locales/he/calendars.yml +9 -0
  70. data/resources/locales/he/numbers.yml +13 -0
  71. data/resources/locales/hi/calendars.yml +8 -0
  72. data/resources/locales/hi/numbers.yml +13 -0
  73. data/resources/locales/hu/calendars.yml +10 -0
  74. data/resources/locales/hu/numbers.yml +15 -2
  75. data/resources/locales/id/calendars.yml +8 -0
  76. data/resources/locales/id/numbers.yml +16 -3
  77. data/resources/locales/it/calendars.yml +9 -0
  78. data/resources/locales/it/numbers.yml +13 -0
  79. data/resources/locales/ja/calendars.yml +9 -0
  80. data/resources/locales/ja/numbers.yml +13 -0
  81. data/resources/locales/ko/calendars.yml +9 -0
  82. data/resources/locales/ko/numbers.yml +13 -0
  83. data/resources/locales/ms/calendars.yml +8 -0
  84. data/resources/locales/ms/numbers.yml +16 -3
  85. data/resources/locales/nb/calendars.yml +234 -0
  86. data/resources/locales/{no → nb}/languages.yml +25 -4
  87. data/resources/locales/nb/numbers.yml +43 -0
  88. data/resources/locales/nb/plurals.yml +2 -0
  89. data/resources/locales/nb/units.yml +87 -0
  90. data/resources/locales/nl/calendars.yml +10 -0
  91. data/resources/locales/nl/numbers.yml +13 -0
  92. data/resources/locales/pl/calendars.yml +9 -0
  93. data/resources/locales/pl/numbers.yml +14 -1
  94. data/resources/locales/pt/calendars.yml +9 -0
  95. data/resources/locales/pt/numbers.yml +13 -0
  96. data/resources/locales/ru/calendars.yml +10 -0
  97. data/resources/locales/ru/numbers.yml +14 -1
  98. data/resources/locales/sv/calendars.yml +10 -0
  99. data/resources/locales/sv/numbers.yml +14 -1
  100. data/resources/locales/th/calendars.yml +67 -57
  101. data/resources/locales/th/numbers.yml +13 -0
  102. data/resources/locales/tr/calendars.yml +9 -0
  103. data/resources/locales/tr/numbers.yml +13 -0
  104. data/resources/locales/uk/calendars.yml +199 -0
  105. data/resources/locales/uk/languages.yml +519 -0
  106. data/resources/locales/uk/numbers.yml +45 -0
  107. data/resources/locales/uk/plurals.yml +2 -0
  108. data/resources/locales/uk/units.yml +135 -0
  109. data/resources/locales/ur/calendars.yml +9 -0
  110. data/resources/locales/ur/numbers.yml +13 -0
  111. data/resources/locales/zh/calendars.yml +8 -0
  112. data/resources/locales/zh/numbers.yml +13 -0
  113. data/resources/locales/zh-Hant/calendars.yml +8 -0
  114. data/resources/locales/zh-Hant/numbers.yml +16 -3
  115. data/resources/locales/zh-Hant/plurals.yml +2 -0
  116. data/resources/unicode_data/hangul_blocks.yml +21 -0
  117. data/spec/collation/CollationTest_CLDR_NON_IGNORABLE_Short.txt +714 -0
  118. data/spec/collation/collation_spec.rb +93 -0
  119. data/spec/collation/collator_spec.rb +117 -0
  120. data/spec/collation/implicit_collation_elements_spec.rb +24 -0
  121. data/spec/collation/sort_key_spec.rb +56 -0
  122. data/spec/collation/trie_builder_spec.rb +114 -0
  123. data/spec/collation/trie_spec.rb +97 -0
  124. data/spec/core_ext/calendars/datetime_spec.rb +5 -0
  125. data/spec/core_ext/calendars_spec.rb +34 -0
  126. data/spec/core_ext/numbers_spec.rb +39 -0
  127. data/spec/core_ext/string_spec.rb +4 -4
  128. data/spec/formatters/calendars/datetime_formatter_spec.rb +92 -2
  129. data/spec/{normalizers → normalization}/NormalizationTestShort.txt +0 -0
  130. data/spec/{normalizers → normalization}/base_spec.rb +1 -1
  131. data/spec/normalization/hangul_spec.rb +42 -0
  132. data/spec/{normalizers → normalization}/normalization_spec.rb +15 -16
  133. data/spec/readme_spec.rb +2 -2
  134. data/spec/shared/code_point_spec.rb +42 -30
  135. data/spec/shared/resources_spec.rb +30 -6
  136. data/spec/tokenizers/base_spec.rb +17 -0
  137. data/spec/twitter_cldr_spec.rb +1 -1
  138. metadata +71 -83
  139. data/lib/twitter_cldr/normalizers/base.rb +0 -34
  140. data/lib/twitter_cldr/normalizers.rb +0 -14
  141. data/resources/locales/no/calendars.yml +0 -127
  142. data/resources/locales/no/numbers.yml +0 -29
  143. data/resources/locales/no/plurals.yml +0 -1
  144. data/resources/unicode_data/blocks_hangul.yml +0 -46
  145. data/spec/normalizers/NormalizationTest.txt +0 -18431
data/NOTICE CHANGED
@@ -2,7 +2,7 @@ twitter-cldr-rb is a Ruby implementation of the Common Locale Data Repository
2
2
  Copyright (C) 2012 Twitter, Inc.
3
3
 
4
4
 
5
- Portions of this gem were borrowed from Sven Fuchs' ruby-cldr gem. Here is
5
+ Portions of this gem were borrowed from Sven Fuchs' ruby-cldr gem. Here is
6
6
  the license that accompanied Mr. Fuchs' code:
7
7
 
8
8
  Copyright (c) 2009 Sven Fuchs
@@ -117,4 +117,38 @@ the software.
117
117
  6. THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR
118
118
  IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
119
119
  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
120
- PURPOSE.
120
+ PURPOSE.
121
+
122
+
123
+ TwitterCldr::Collation::ImplicitCollationElements module was ported from
124
+ the ICU4J library (http://site.icu-project.org). Here is its license:
125
+
126
+ ICU License - ICU 1.8.1 and later
127
+
128
+ COPYRIGHT AND PERMISSION NOTICE
129
+
130
+ Copyright (c) 1995-2012 International Business Machines Corporation and others
131
+
132
+ All rights reserved.
133
+
134
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
135
+ this software and associated documentation files (the "Software"), to deal in
136
+ the Software without restriction, including without limitation the rights to use,
137
+ copy, modify, merge, publish, distribute, and/or sell copies of the Software,
138
+ and to permit persons to whom the Software is furnished to do so, provided that
139
+ the above copyright notice(s) and this permission notice appear in all copies
140
+ of the Software and that both the above copyright notice(s) and this permission
141
+ notice appear in supporting documentation.
142
+
143
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
144
+ INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
145
+ PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL
146
+ THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM,
147
+ OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
148
+ RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
149
+ NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
150
+ OR PERFORMANCE OF THIS SOFTWARE.
151
+
152
+ Except as contained in this notice, the name of a copyright holder shall not be
153
+ used in advertising or otherwise to promote the sale, use or other dealings in
154
+ this Software without prior written authorization of the copyright holder.
data/README.md CHANGED
@@ -300,7 +300,7 @@ TwitterCldr::Utils::CodePoints.to_string(["00BF"]) # "¿"
300
300
  Normalize/decompose a Unicode string (NFD, NFKD, NFC, and NFKC implementations available). Note that the normalized string will almost always look the same as the original string because most character display systems automatically combine decomposed characters.
301
301
 
302
302
  ```ruby
303
- TwitterCldr::Normalizers::NFD.normalize("français") # "français"
303
+ TwitterCldr::Normalization::NFD.normalize("français") # "français"
304
304
  ```
305
305
 
306
306
  Normalization is easier to see in hex:
@@ -310,7 +310,7 @@ Normalization is easier to see in hex:
310
310
  TwitterCldr::Utils::CodePoints.from_string("español")
311
311
 
312
312
  # ["0065", "0073", "0070", "0061", "006E", "0303", "006F", "006C"]
313
- TwitterCldr::Utils::CodePoints.from_string(TwitterCldr::Normalizers::NFD.normalize("español"))
313
+ TwitterCldr::Utils::CodePoints.from_string(TwitterCldr::Normalization::NFD.normalize("español"))
314
314
  ```
315
315
 
316
316
  Notice in the example above that the letter "ñ" was transformed from `00F1` to `006E 0303`, which represent the "n" and the "˜" respectively.
@@ -0,0 +1,143 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ module TwitterCldr
7
+ module Collation
8
+
9
+ # Collator uses fractional collation elements table from CLDR to generate sort keys for Unicode strings as well as
10
+ # compare and sort such strings by generated sort keys.
11
+ #
12
+ class Collator
13
+
14
+ FRACTIONAL_UCA_SHORT_RESOURCE = 'collation/FractionalUCA_SHORT.txt'
15
+
16
+ def sort(strings)
17
+ strings.map{ |s| [s, comparison_key(s)] }.sort{ |a, b| compare_keys(a[1], b[1]) }.map(&:first)
18
+ end
19
+
20
+ def compare(string_a, string_b)
21
+ string_a == string_b ? 0 : compare_keys(comparison_key(string_a), comparison_key(string_b))
22
+ end
23
+
24
+ def sort_key(string_or_code_points)
25
+ sort_key_for_code_points(get_code_points(string_or_code_points))
26
+ end
27
+
28
+ def trie
29
+ @trie ||= self.class.trie
30
+ end
31
+
32
+ def self.trie
33
+ @trie ||= TwitterCldr::Collation::TrieBuilder.load_trie(FRACTIONAL_UCA_SHORT_RESOURCE)
34
+ end
35
+
36
+ private
37
+
38
+ def comparison_key(string)
39
+ code_points = TwitterCldr::Utils::CodePoints.from_string(string)
40
+ { :code_points => code_points, :sort_key => sort_key(code_points) }
41
+ end
42
+
43
+ def compare_keys(a, b)
44
+ (a[:sort_key] <=> b[:sort_key]).nonzero? || get_integer_code_points(a[:code_points]) <=> get_integer_code_points(b[:code_points])
45
+ end
46
+
47
+ def sort_key_for_code_points(integer_code_points)
48
+ TwitterCldr::Collation::SortKey.build(get_collation_elements(integer_code_points))
49
+ end
50
+
51
+ def get_integer_code_points(code_points)
52
+ code_points.map { |code_point| code_point.to_i(16) }
53
+ end
54
+
55
+ def get_collation_elements(integer_code_points)
56
+ result = []
57
+ result.concat(code_point_collation_elements(integer_code_points)) until integer_code_points.empty?
58
+ result
59
+ end
60
+
61
+ def get_code_points(str_or_code_points)
62
+ code_points = str_or_code_points.is_a?(String) ? TwitterCldr::Utils::CodePoints.from_string(str_or_code_points) : str_or_code_points
63
+
64
+ # Normalization makes the collation process significantly slower (like seven times slower on the UCA
65
+ # non-ignorable test from CollationTest_NON_IGNORABLE.txt). ICU uses some optimizations to apply normalization
66
+ # only in special, rare cases. We need to investigate possible solutions and do normalization cleverly too.
67
+ code_points = TwitterCldr::Normalization::NFD.normalize_code_points(code_points)
68
+
69
+ get_integer_code_points(code_points)
70
+ end
71
+
72
+ # Returns the first sequence of fractional collation elements for an array of integer code points. Returned value
73
+ # is an array of well-formed (including weights for all significant levels) integer arrays.
74
+ #
75
+ # NOTE (side-effect): all used code points are removed from the input array.
76
+ #
77
+ def code_point_collation_elements(integer_code_points)
78
+ explicit_collation_elements(integer_code_points) || implicit_collation_elements(integer_code_points)
79
+ end
80
+
81
+ # Tries to build explicit collation elements array for the longest code points prefix in the given sequence. When
82
+ # possible, combines this prefix with (not necessarily subsequent) non-starters that follow it in the sequence.
83
+ # That's necessary because canonical ordering (that is performed during normalization) can break contractions
84
+ # that existed in the original, de-normalized string.
85
+ #
86
+ # NOTE (side-effect): all used code points are removed from the input array.
87
+ #
88
+ # For more information see section '4.2 Produce Array' of the main algorithm at http://www.unicode.org/reports/tr10/#Main_Algorithm
89
+ #
90
+ def explicit_collation_elements(integer_code_points)
91
+ # find the longest prefix in the trie
92
+ collation_elements, suffixes, prefix_size = trie.find_prefix(integer_code_points)
93
+
94
+ return unless collation_elements
95
+
96
+ # remove prefix from the code points sequence
97
+ integer_code_points.shift(prefix_size)
98
+
99
+ non_starter_pos = 0
100
+
101
+ used_combining_classes = {}
102
+
103
+ while non_starter_pos < integer_code_points.size && !suffixes.empty?
104
+ # create a trie from a hash of suffixes available for the chosen prefix
105
+ subtrie = TwitterCldr::Collation::Trie.new(suffixes)
106
+
107
+ # get next code point (possibly non-starter)
108
+ non_starter_code_point = integer_code_points[non_starter_pos]
109
+ combining_class = TwitterCldr::Normalization::Base.combining_class_for(non_starter_code_point.to_s(16))
110
+
111
+ # code point is a starter or combining class has been already used (non-starter is 'blocked' from the prefix)
112
+ break if combining_class == 0 || used_combining_classes[combining_class]
113
+
114
+ used_combining_classes[combining_class] = true
115
+
116
+ # Try to find collation elements for [prefix + non-starter] code points sequence. As the subtrie contains
117
+ # suffixes (without prefix) we pass only non-starter itself.
118
+ new_collation_elements, new_suffixes = subtrie.find_prefix([non_starter_code_point]).first(2)
119
+
120
+ if new_collation_elements
121
+ # non-starter with a collation elements sequence corresponding to [prefix + non-starter] accepted
122
+ collation_elements = new_collation_elements
123
+ suffixes = new_suffixes
124
+
125
+ # Remove non-starter from its position in the sequence. Then we can move further from the same position.
126
+ integer_code_points.delete_at(non_starter_pos)
127
+ else
128
+ # move to the next code point
129
+ non_starter_pos += 1
130
+ end
131
+ end
132
+
133
+ collation_elements
134
+ end
135
+
136
+ def implicit_collation_elements(integer_code_points)
137
+ TwitterCldr::Collation::ImplicitCollationElements.for_code_point(integer_code_points.shift)
138
+ end
139
+
140
+ end
141
+
142
+ end
143
+ end
@@ -0,0 +1,188 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ module TwitterCldr
7
+ module Collation
8
+
9
+ # ImplicitCollationElements generates implicit collation elements for code points (including some CJK characters),
10
+ # that are not explicitly mentioned in the collation elements table.
11
+ #
12
+ # This module was ported from the ICU4J library (ImplicitCEGenerator class). See NOTICE file for license information.
13
+ #
14
+ module ImplicitCollationElements
15
+
16
+ DEFAULT_SECONDARY_AND_TERTIARY = 5
17
+
18
+ class << self
19
+
20
+ def for_code_point(code_point)
21
+ [[primary_weight(swapCJK(code_point) + 1), DEFAULT_SECONDARY_AND_TERTIARY, DEFAULT_SECONDARY_AND_TERTIARY]]
22
+ end
23
+
24
+ private
25
+
26
+ # Generates the primary weight of the implicit CE for a given code point.
27
+ #
28
+ def primary_weight(code_point)
29
+ byte0 = code_point - MIN_4_BOUNDARY
30
+
31
+ if byte0 < 0
32
+ byte1 = code_point / FINAL_3_COUNT
33
+ byte0 = code_point % FINAL_3_COUNT
34
+
35
+ byte2 = byte1 / MEDIAL_COUNT
36
+ byte1 %= MEDIAL_COUNT
37
+
38
+ # spread out, leaving gap at start
39
+ byte0 = MIN_TRAIL + byte0 * FINAL_3_MULTIPLIER
40
+
41
+ # offset
42
+ byte1 += MIN_TRAIL
43
+ byte2 += MIN_PRIMARY
44
+
45
+ (byte2 << 16) + (byte1 << 8) + byte0
46
+ else
47
+ byte1 = byte0 / FINAL_4_COUNT
48
+ byte0 %= FINAL_4_COUNT
49
+
50
+ byte2 = byte1 / MEDIAL_COUNT
51
+ byte1 %= MEDIAL_COUNT
52
+
53
+ byte3 = byte2 / MEDIAL_COUNT
54
+ byte2 %= MEDIAL_COUNT
55
+
56
+ # spread out, leaving gap at start
57
+ byte0 = MIN_TRAIL + byte0 * FINAL_4_MULTIPLIER
58
+
59
+ # offset
60
+ byte1 += MIN_TRAIL
61
+ byte2 += MIN_TRAIL
62
+ byte3 += MIN_4_PRIMARY
63
+
64
+ (byte3 << 24) + (byte2 << 16) + (byte1 << 8) + byte0
65
+ end
66
+ end
67
+
68
+ # Method used to:
69
+ # a) collapse two different Han ranges from UCA into one (in the right order)
70
+ # b) bump any non-CJK characters by NON_CJK_OFFSET.
71
+ #
72
+ # The relevant blocks are:
73
+ # A: 4E00..9FFF; CJK Unified Ideographs
74
+ # F900..FAFF; CJK Compatibility Ideographs
75
+ # B: 3400..4DBF; CJK Unified Ideographs Extension A
76
+ # 20000..XX; CJK Unified Ideographs Extension B (and others later on)
77
+ #
78
+ # As long as
79
+ # no new B characters are allocated between 4E00 and FAFF, and
80
+ # no new A characters are outside of this range,
81
+ # (very high probability) this simple code will work.
82
+ #
83
+ # The reordered blocks are:
84
+ # Block1 is CJK
85
+ # Block2 is CJK_COMPAT_USED
86
+ # Block3 is CJK_A
87
+ # (all contiguous)
88
+ #
89
+ # Any other CJK gets its normal code point.
90
+ #
91
+ # When we reorder Block1, we make sure that it is at the very start, so that it will use a 3-byte form.
92
+ #
93
+ def swapCJK(code_point)
94
+ if code_point >= CJK_BASE
95
+ return code_point - CJK_BASE if code_point < CJK_LIMIT
96
+ return code_point + NON_CJK_OFFSET if code_point < CJK_COMPAT_USED_BASE
97
+ return code_point - CJK_COMPAT_USED_BASE + (CJK_LIMIT - CJK_BASE) if code_point < CJK_COMPAT_USED_LIMIT
98
+ return code_point + NON_CJK_OFFSET if code_point < CJK_B_BASE
99
+ return code_point if code_point < CJK_B_LIMIT # non-BMP-CJK
100
+ return code_point + NON_CJK_OFFSET if code_point < CJK_C_BASE
101
+ return code_point if code_point < CJK_C_LIMIT # non-BMP-CJK
102
+ return code_point + NON_CJK_OFFSET if code_point < CJK_D_BASE
103
+ return code_point if code_point < CJK_D_LIMIT # non-BMP-CJK
104
+
105
+ return code_point + NON_CJK_OFFSET # non-CJK
106
+ end
107
+
108
+ return code_point + NON_CJK_OFFSET if code_point < CJK_A_BASE
109
+ return code_point - CJK_A_BASE + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE) if code_point < CJK_A_LIMIT
110
+
111
+ code_point + NON_CJK_OFFSET # non-CJK
112
+ end
113
+
114
+ end
115
+
116
+ # primary value
117
+ MIN_PRIMARY = 0xE0
118
+ MAX_PRIMARY = 0xe4
119
+
120
+ # final byte
121
+ MIN_TRAIL = 0x04
122
+ MAX_TRAIL = 0xFE
123
+
124
+ # gap for tailoring of 3-byte forms
125
+ GAP_3 = 1
126
+
127
+ # number of 3-byte primaries that can be used
128
+ PRIMARIES_3_COUNT = 1
129
+
130
+ # 2 * [Unicode range] + 2
131
+ MAX_INPUT = 0x220001
132
+
133
+ # medials can use full range
134
+ MEDIAL_COUNT = MAX_TRAIL - MIN_TRAIL + 1
135
+
136
+ # number of values we can use in trailing bytes
137
+ # leave room for empty values between AND above, e.g., if gap = 2
138
+ # range 3..7 => +3 -4 -5 -6 -7: so 1 value
139
+ # range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
140
+ # range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
141
+ FINAL_3_MULTIPLIER = GAP_3 + 1
142
+ FINAL_3_COUNT = MEDIAL_COUNT / FINAL_3_MULTIPLIER
143
+
144
+ # find out how many values fit in each form
145
+ THREE_BYTE_COUNT = MEDIAL_COUNT * FINAL_3_COUNT
146
+
147
+ # now determine where the 3/4 boundary is
148
+ # we use 3 bytes below the boundary, and 4 above
149
+ PRIMARIES_AVAILABLE = MAX_PRIMARY - MIN_PRIMARY + 1
150
+ PRIMARIES_4_COUNT = PRIMARIES_AVAILABLE - PRIMARIES_3_COUNT
151
+ MIN_4_PRIMARY = MIN_PRIMARY + PRIMARIES_3_COUNT
152
+ MIN_4_BOUNDARY = PRIMARIES_3_COUNT * THREE_BYTE_COUNT
153
+
154
+ TOTAL_NEEDED = MAX_INPUT - MIN_4_BOUNDARY
155
+ NEEDED_PER_PRIMARY_BYTE = (TOTAL_NEEDED - 1) / PRIMARIES_4_COUNT + 1
156
+ NEEDED_PER_FINAL_BYTE = (NEEDED_PER_PRIMARY_BYTE - 1) / (MEDIAL_COUNT * MEDIAL_COUNT) + 1
157
+
158
+ GAP_4 = (MAX_TRAIL - MIN_TRAIL - 1) / NEEDED_PER_FINAL_BYTE
159
+
160
+ FINAL_4_MULTIPLIER = GAP_4 + 1
161
+ FINAL_4_COUNT = NEEDED_PER_FINAL_BYTE
162
+
163
+ # CJK constants
164
+
165
+ NON_CJK_OFFSET = 0x110000
166
+
167
+ CJK_COMPAT_USED_BASE = 0xFA0E
168
+ CJK_COMPAT_USED_LIMIT = 0xFA2F + 1
169
+
170
+ CJK_BASE = 0x4E00 # 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
171
+ CJK_LIMIT = 0x9FCC + 1 # 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
172
+
173
+ CJK_A_BASE = 0x3400 # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
174
+ CJK_A_LIMIT = 0x4DB5 + 1 # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
175
+
176
+ CJK_B_BASE = 0x20000 # 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
177
+ CJK_B_LIMIT = 0x2A6D6 + 1 # 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
178
+
179
+ CJK_C_BASE = 0x2A700 # 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
180
+ CJK_C_LIMIT = 0x2B734 + 1 # 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
181
+
182
+ CJK_D_BASE = 0x2B740 # 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
183
+ CJK_D_LIMIT = 0x2B81D + 1 # 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
184
+
185
+ end
186
+
187
+ end
188
+ end
@@ -0,0 +1,199 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ module TwitterCldr
7
+ module Collation
8
+
9
+ # SortKey builds a collation sort key from an array of collation elements.
10
+ #
11
+ class SortKey
12
+
13
+ PRIMARY_LEVEL, SECONDARY_LEVEL, TERTIARY_LEVEL = 0, 1, 2
14
+
15
+ LEVEL_SEPARATOR = 1 # separate levels in a sort key '01' bytes
16
+
17
+ TERTIARY_LEVEL_MASK = 0x3F # mask for removing case bits from tertiary weight ('CC' bits in 'CC00 0000')
18
+
19
+ attr_reader :collation_elements
20
+
21
+ # Returns a sort key as an array of bytes.
22
+ #
23
+ # Arguments:
24
+ #
25
+ # collation_elements - an array of collation elements, represented as arrays of integer weights.
26
+ #
27
+ # An instance of the class is created only to prevent passing of @collation_elements and @bytes_array from one
28
+ # method into another while forming the sort key.
29
+ #
30
+ def self.build(collation_elements)
31
+ new(collation_elements).bytes_array
32
+ end
33
+
34
+ # Arguments:
35
+ #
36
+ # collation_elements - an array of collation elements, represented as arrays of integer weights.
37
+ #
38
+ def initialize(collation_elements)
39
+ @collation_elements = collation_elements
40
+ end
41
+
42
+ def bytes_array
43
+ @bytes_array ||= build_bytes_array
44
+ end
45
+
46
+ private
47
+
48
+ def build_bytes_array
49
+ @bytes_array = []
50
+
51
+ append_primary_bytes
52
+ append_secondary_bytes
53
+ append_tertiary_bytes
54
+
55
+ @bytes_array
56
+ end
57
+
58
+ def append_primary_bytes
59
+ @collation_elements.each do |collation_element|
60
+ append_weight(level_weight(collation_element, PRIMARY_LEVEL))
61
+ end
62
+ end
63
+
64
+ def append_secondary_bytes
65
+ @bytes_array << LEVEL_SEPARATOR
66
+
67
+ @common_count = 0
68
+
69
+ @collation_elements.each do |collation_element|
70
+ fixnum_to_bytes_array(level_weight(collation_element, SECONDARY_LEVEL)).each do |byte|
71
+ append_secondary_byte(byte)
72
+ end
73
+ end
74
+
75
+ # append compressed trailing common bytes
76
+ append_common_bytes(SECONDARY_BOTTOM, SECONDARY_BOTTOM_COUNT, false) if @common_count > 0
77
+ end
78
+
79
+ def append_tertiary_bytes
80
+ @bytes_array << LEVEL_SEPARATOR
81
+
82
+ @common_count = 0
83
+
84
+ @collation_elements.each do |collation_element|
85
+ fixnum_to_bytes_array(tertiary_weight(collation_element)).each do |byte|
86
+ append_tertiary_byte(byte)
87
+ end
88
+ end
89
+
90
+ # append compressed trailing common bytes
91
+ append_common_bytes(TERTIARY_BOTTOM, TERTIARY_BOTTOM_COUNT, false) if @common_count > 0
92
+ end
93
+
94
+ def append_secondary_byte(secondary)
95
+ if secondary == SECONDARY_COMMON
96
+ @common_count += 1
97
+ else
98
+ append_with_common_bytes(secondary, SECONDARY_COMMON_SPACE)
99
+ end
100
+ end
101
+
102
+ def append_tertiary_byte(tertiary)
103
+ if tertiary == TERTIARY_COMMON
104
+ @common_count += 1
105
+ else
106
+ tertiary += TERTIARY_TOP_ADDITION if tertiary > TERTIARY_COMMON # create a gap above TERTIARY_COMMON
107
+ append_with_common_bytes(tertiary, TERTIARY_COMMON_SPACE)
108
+ end
109
+ end
110
+
111
+ def append_with_common_bytes(byte, options)
112
+ if @common_count > 0
113
+ if byte < options[:common]
114
+ append_common_bytes(options[:bottom], options[:bottom_count], false)
115
+ else
116
+ append_common_bytes(options[:top], options[:top_count], true)
117
+ end
118
+ end
119
+
120
+ @bytes_array << byte
121
+ end
122
+
123
+ def append_common_bytes(boundary, count_limit, top)
124
+ sign = top ? -1 : +1
125
+
126
+ while @common_count > count_limit
127
+ @bytes_array << boundary + sign * count_limit
128
+ @common_count -= count_limit
129
+ end
130
+
131
+ @bytes_array << boundary + sign * (@common_count - 1)
132
+ @common_count = 0
133
+ end
134
+
135
+ def tertiary_weight(collation_element)
136
+ level_weight(collation_element, TERTIARY_LEVEL) & TERTIARY_LEVEL_MASK
137
+ end
138
+
139
+ def append_weight(weight)
140
+ @bytes_array.concat(fixnum_to_bytes_array(weight))
141
+ end
142
+
143
+ def level_weight(collation_element, level)
144
+ collation_element[level] || 0
145
+ end
146
+
147
+ def fixnum_to_bytes_array(number)
148
+ bytes = []
149
+
150
+ while number > 0
151
+ bytes.unshift(number & 0xFF)
152
+ number >>= 8
153
+ end
154
+
155
+ bytes
156
+ end
157
+
158
+ # Secondary level compression constants
159
+
160
+ SECONDARY_BOTTOM = 0x05
161
+ SECONDARY_TOP = 0x86
162
+ SECONDARY_PROPORTION = 0.5
163
+ SECONDARY_COMMON = SECONDARY_BOTTOM
164
+ SECONDARY_TOTAL_COUNT = SECONDARY_TOP - SECONDARY_BOTTOM - 1
165
+ SECONDARY_TOP_COUNT = (SECONDARY_PROPORTION * SECONDARY_TOTAL_COUNT).to_i
166
+ SECONDARY_BOTTOM_COUNT = SECONDARY_TOTAL_COUNT - SECONDARY_TOP_COUNT
167
+
168
+ SECONDARY_COMMON_SPACE = {
169
+ :common => SECONDARY_COMMON,
170
+ :bottom => SECONDARY_BOTTOM,
171
+ :bottom_count => SECONDARY_BOTTOM_COUNT,
172
+ :top => SECONDARY_TOP,
173
+ :top_count => SECONDARY_TOP_COUNT
174
+ }
175
+
176
+ # Tertiary level compression constants
177
+
178
+ TERTIARY_TOP_ADDITION = 0x80
179
+
180
+ TERTIARY_BOTTOM = 0x05
181
+ TERTIARY_TOP = 0x85
182
+ TERTIARY_PROPORTION = 0.667
183
+ TERTIARY_COMMON = TERTIARY_BOTTOM
184
+ TERTIARY_TOTAL_COUNT = TERTIARY_TOP - TERTIARY_BOTTOM - 1
185
+ TERTIARY_TOP_COUNT = (TERTIARY_PROPORTION * TERTIARY_TOTAL_COUNT).to_i
186
+ TERTIARY_BOTTOM_COUNT = TERTIARY_TOTAL_COUNT - TERTIARY_TOP_COUNT
187
+
188
+ TERTIARY_COMMON_SPACE = {
189
+ :common => TERTIARY_COMMON,
190
+ :bottom => TERTIARY_BOTTOM,
191
+ :bottom_count => TERTIARY_BOTTOM_COUNT,
192
+ :top => TERTIARY_TOP,
193
+ :top_count => TERTIARY_TOP_COUNT
194
+ }
195
+
196
+ end
197
+
198
+ end
199
+ end
@@ -0,0 +1,73 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ module TwitterCldr
7
+ module Collation
8
+
9
+ # This class represents a trie - a tree data structure, also known as a prefix tree.
10
+ #
11
+ # Every node corresponds to a single character of the key. To find the value by key one goes down the trie
12
+ # starting from the root and descending one character at a time. If at some level the current node doesn't have a
13
+ # child corresponding to the next character of the key, then the trie doesn't contain a value with the given key.
14
+ # Otherwise, the final node, corresponding to the last character of the key, should contain the value. If it's
15
+ # nil, then the trie doesn't contain a value with the given key (or the value itself is nil).
16
+ #
17
+ class Trie
18
+
19
+ # Initializes a new trie. If `trie_hash` value is passed it's used as the initial data for the trie. Usually,
20
+ # `trie_hash` is extracted from another trie and represents its sub-trie.
21
+ #
22
+ def initialize(trie_hash = {})
23
+ @root = [nil, trie_hash]
24
+ end
25
+
26
+ def add(key, value)
27
+ final = key.inject(@root) do |node, key_element|
28
+ node[1][key_element] ||= [nil, {}]
29
+ end
30
+
31
+ final[0] = value
32
+ end
33
+
34
+ def get(key)
35
+ final = key.inject(@root) do |node, key_element|
36
+ subtree = node[1][key_element]
37
+ return unless subtree
38
+ subtree
39
+ end
40
+
41
+ final[0]
42
+ end
43
+
44
+ # Finds the longest substring of the `key` that matches, as a key, a node in the trie.
45
+ #
46
+ # Returns a three-element array:
47
+ #
48
+ # 1. value in the last node that was visited
49
+ # 2. sub-trie of this node (as a hash)
50
+ # 3. size of the `key` prefix that matches this node
51
+ #
52
+ def find_prefix(key)
53
+ prefix_size = 0
54
+ node = @root
55
+
56
+ key.each do |key_element|
57
+ subtree = node[1][key_element]
58
+
59
+ if subtree
60
+ prefix_size += 1
61
+ node = subtree
62
+ else
63
+ break
64
+ end
65
+ end
66
+
67
+ node + [prefix_size]
68
+ end
69
+
70
+ end
71
+
72
+ end
73
+ end