twitter_cldr 1.6.1 → 1.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. data/History.txt +4 -0
  2. data/Rakefile +8 -3
  3. data/lib/twitter_cldr/collation/collator.rb +9 -11
  4. data/lib/twitter_cldr/collation/trie.rb +8 -0
  5. data/lib/twitter_cldr/collation/trie_builder.rb +12 -15
  6. data/lib/twitter_cldr/collation/trie_loader.rb +50 -0
  7. data/lib/twitter_cldr/collation/trie_with_fallback.rb +6 -4
  8. data/lib/twitter_cldr/collation.rb +1 -0
  9. data/lib/twitter_cldr/resources/tailoring_importer.rb +203 -0
  10. data/lib/twitter_cldr/resources/tries_dumper.rb +43 -0
  11. data/lib/twitter_cldr/resources.rb +3 -2
  12. data/lib/twitter_cldr/version.rb +1 -1
  13. data/resources/collation/tries/af.dump +0 -0
  14. data/resources/collation/tries/ar.dump +0 -0
  15. data/resources/collation/tries/ca.dump +0 -0
  16. data/resources/collation/tries/cs.dump +0 -0
  17. data/resources/collation/tries/da.dump +0 -0
  18. data/resources/collation/tries/de.dump +0 -0
  19. data/resources/collation/tries/default.dump +0 -0
  20. data/resources/collation/tries/el.dump +0 -0
  21. data/resources/collation/tries/en.dump +0 -0
  22. data/resources/collation/tries/es.dump +0 -0
  23. data/resources/collation/tries/eu.dump +0 -0
  24. data/resources/collation/tries/fa.dump +0 -0
  25. data/resources/collation/tries/fi.dump +0 -0
  26. data/resources/collation/tries/fil.dump +0 -0
  27. data/resources/collation/tries/fr.dump +0 -0
  28. data/resources/collation/tries/he.dump +0 -0
  29. data/resources/collation/tries/hi.dump +0 -0
  30. data/resources/collation/tries/hu.dump +0 -0
  31. data/resources/collation/tries/id.dump +0 -0
  32. data/resources/collation/tries/it.dump +0 -0
  33. data/resources/collation/tries/ja.dump +0 -0
  34. data/resources/collation/tries/ko.dump +0 -0
  35. data/resources/collation/tries/ms.dump +0 -0
  36. data/resources/collation/tries/nb.dump +0 -0
  37. data/resources/collation/tries/nl.dump +0 -0
  38. data/resources/collation/tries/pl.dump +0 -0
  39. data/resources/collation/tries/pt.dump +0 -0
  40. data/resources/collation/tries/ru.dump +0 -0
  41. data/resources/collation/tries/sv.dump +0 -0
  42. data/resources/collation/tries/th.dump +0 -0
  43. data/resources/collation/tries/tr.dump +0 -0
  44. data/resources/collation/tries/uk.dump +0 -0
  45. data/resources/collation/tries/ur.dump +0 -0
  46. data/resources/collation/tries/zh-Hant.dump +0 -0
  47. data/resources/collation/tries/zh.dump +0 -0
  48. data/spec/collation/collation_spec.rb +4 -2
  49. data/spec/collation/collator_spec.rb +36 -30
  50. data/spec/collation/tailoring_spec.rb +3 -1
  51. data/spec/collation/tailoring_tests/ja.txt +6 -5
  52. data/spec/collation/trie_builder_spec.rb +21 -26
  53. data/spec/collation/trie_dumps_spec.rb +26 -0
  54. data/spec/collation/trie_loader_spec.rb +72 -0
  55. data/spec/collation/trie_spec.rb +14 -0
  56. data/spec/collation/trie_with_fallback_spec.rb +6 -0
  57. data/spec/normalization/normalization_spec.rb +2 -2
  58. metadata +43 -21
  59. data/lib/twitter_cldr/resources/import/tailoring.rb +0 -202
  60. data/lib/twitter_cldr/resources/import.rb +0 -12
@@ -1,202 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- # Copyright 2012 Twitter, Inc
4
- # http://www.apache.org/licenses/LICENSE-2.0
5
-
6
- require 'nokogiri'
7
- require 'yaml'
8
- require 'java'
9
-
10
- module TwitterCldr
11
- module Resources
12
- module Import
13
-
14
- # This class should be used with JRuby 1.7 in 1.9 mode and ICU4J version 49.1 (available at
15
- # http://download.icu-project.org/files/icu4j/49.1/icu4j-49_1.jar).
16
- #
17
- class Tailoring
18
-
19
- SUPPORTED_RULES = %w[p s t i pc sc tc ic x]
20
- SIMPLE_RULES = %w[p s t i]
21
- LEVEL_RULE_REGEXP = /^(p|s|t|i)(c?)$/
22
-
23
- IGNORED_TAGS = %w[reset text #comment]
24
-
25
- LAST_BYTE_MASK = 0xFF
26
-
27
- LOCALES_MAP = {
28
- :'zh-Hant' => :'zh_Hant',
29
- :id => :root,
30
- :it => :root,
31
- :ms => :root,
32
- :nl => :root,
33
- :pt => :root
34
- }
35
-
36
- EMPTY_TAILORING_DATA = { 'collator_options' => {}, 'tailored_table' => '', 'suppressed_contractions' => '' }
37
-
38
- class ImportError < RuntimeError; end
39
-
40
- # Arguments:
41
- #
42
- # input_path - path to a directory containing CLDR tailoring data (available at
43
- # http://unicode.org/cldr/trac/browser/tags/release-21/common/collation/
44
- # or as a part of CLDR release at http://cldr.unicode.org/index/downloads)
45
- #
46
- # output_path - output directory for imported YAML files
47
- #
48
- # icu4j_path - path to ICU4J jar file
49
- #
50
- def initialize(input_path, output_path, icu4j_path)
51
- require icu4j_path
52
-
53
- @input_path = input_path
54
- @output_path = output_path
55
- end
56
-
57
- def import(locale)
58
- print "Importing %8s\t--\t" % locale
59
-
60
- if tailoring_present?(locale)
61
- YAML.dump(tailoring_data(locale), open(resource_file_path(locale), 'w'))
62
- puts "Done."
63
- else
64
- YAML.dump(EMPTY_TAILORING_DATA, open(resource_file_path(locale), 'w'))
65
- puts "Missing (generated empty tailoring resource)."
66
- end
67
- rescue ImportError => e
68
- puts "Error: #{e.message}"
69
- end
70
-
71
- private
72
-
73
- def tailoring_present?(locale)
74
- File.file?(locale_file_path(locale))
75
- end
76
-
77
- def translated_locale(locale)
78
- LOCALES_MAP.fetch(locale, locale)
79
- end
80
-
81
- def locale_file_path(locale)
82
- File.join(@input_path, "#{translated_locale(locale)}.xml")
83
- end
84
-
85
- def resource_file_path(locale)
86
- File.join(@output_path, "#{locale}.yml")
87
- end
88
-
89
- def tailoring_data(locale)
90
- doc = Nokogiri::XML(open(locale_file_path(locale)))
91
- collations = doc.at_xpath('//collations')
92
-
93
- collation_alias = collations.at_xpath('alias[@path="//ldml/collations"]')
94
- aliased_locale = collation_alias && collation_alias.attr('source')
95
-
96
- return tailoring_data(aliased_locale) if aliased_locale
97
-
98
- standard_tailoring = collations.at_xpath('collation[@type="standard"]')
99
-
100
- {
101
- 'collator_options' => parse_collator_options(standard_tailoring),
102
- 'tailored_table' => parse_tailorings(standard_tailoring, locale),
103
- 'suppressed_contractions' => parse_suppressed_contractions(standard_tailoring)
104
- }
105
- end
106
-
107
- def parse_tailorings(data, locale)
108
- rules = data && data.at_xpath('rules')
109
-
110
- return '' unless rules
111
-
112
- collator = Java::ComIbmIcuText::Collator.get_instance(Java::JavaUtil::Locale.new(locale.to_s))
113
-
114
- rules.children.map do |child|
115
- validate_tailoring_rule(child)
116
-
117
- if child.name =~ LEVEL_RULE_REGEXP
118
- if $2.empty?
119
- table_entry_for_rule(collator, child.text)
120
- else
121
- child.text.chars.map { |char| table_entry_for_rule(collator, char) }
122
- end
123
- elsif child.name == 'x'
124
- context = ''
125
- child.children.each_with_object([]) do |c, memo|
126
- if SIMPLE_RULES.include?(c.name)
127
- memo << table_entry_for_rule(collator, context + c.text)
128
- elsif c.name == 'context'
129
- context = c.text
130
- elsif c.name != 'extend'
131
- raise ImportError, "Rule '#{c.name}' inside <x></x> is not supported."
132
- end
133
- end
134
- else
135
- raise ImportError, "Tag '#{child.name}' is not supported." unless IGNORED_TAGS.include?(child.name)
136
- end
137
- end.flatten.compact.join("\n")
138
- end
139
-
140
- def table_entry_for_rule(collator, tailored_value)
141
- code_points = get_code_points(tailored_value)
142
-
143
- collation_elements = get_collation_elements(collator, tailored_value).map do |ce|
144
- ce.map { |l| l.to_s(16).upcase }.join(', ')
145
- end
146
-
147
- "#{code_points.join(' ')}; [#{collation_elements.join('][')}]"
148
- end
149
-
150
- def parse_suppressed_contractions(data)
151
- node = data && data.at_xpath('suppress_contractions')
152
- node ? Java::ComIbmIcuText::UnicodeSet.to_array(Java::ComIbmIcuText::UnicodeSet.new(node.text)).to_a.join : ''
153
- end
154
-
155
- def parse_collator_options(data)
156
- options = {}
157
-
158
- if data
159
- case_first_setting = data.at_xpath('settings[@caseFirst]')
160
- options['case_first'] = case_first_setting.attr('caseFirst').to_sym if case_first_setting
161
- end
162
-
163
- options
164
- end
165
-
166
- def validate_tailoring_rule(rule)
167
- return if IGNORED_TAGS.include?(rule.name)
168
-
169
- raise ImportError, "Rule '#{rule.name}' is not supported." unless SUPPORTED_RULES.include?(rule.name)
170
- end
171
-
172
- def get_collation_elements(collator, string)
173
- iter = collator.get_collation_element_iterator(string)
174
-
175
- collation_elements = []
176
- ce = iter.next
177
-
178
- while ce != Java::ComIbmIcuText::CollationElementIterator::NULLORDER
179
- p1 = (ce >> 24) & LAST_BYTE_MASK
180
- p2 = (ce >> 16) & LAST_BYTE_MASK
181
-
182
- primary = p2.zero? ? p1 : (p1 << 8) + p2
183
- secondary = (ce >> 8) & LAST_BYTE_MASK
184
- tertiarly = ce & LAST_BYTE_MASK
185
-
186
- collation_elements << [primary, secondary, tertiarly]
187
-
188
- ce = iter.next
189
- end
190
-
191
- collation_elements
192
- end
193
-
194
- def get_code_points(string)
195
- TwitterCldr::Normalization::NFD.normalize_code_points(TwitterCldr::Utils::CodePoints.from_string(string))
196
- end
197
-
198
- end
199
-
200
- end
201
- end
202
- end
@@ -1,12 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- # Copyright 2012 Twitter, Inc
4
- # http://www.apache.org/licenses/LICENSE-2.0
5
-
6
- module TwitterCldr
7
- module Resources
8
- module Import
9
- autoload :Tailoring, 'twitter_cldr/resources/import/tailoring'
10
- end
11
- end
12
- end