twitter_cldr 1.6.1 → 1.6.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60)
  1. data/History.txt +4 -0
  2. data/Rakefile +8 -3
  3. data/lib/twitter_cldr/collation/collator.rb +9 -11
  4. data/lib/twitter_cldr/collation/trie.rb +8 -0
  5. data/lib/twitter_cldr/collation/trie_builder.rb +12 -15
  6. data/lib/twitter_cldr/collation/trie_loader.rb +50 -0
  7. data/lib/twitter_cldr/collation/trie_with_fallback.rb +6 -4
  8. data/lib/twitter_cldr/collation.rb +1 -0
  9. data/lib/twitter_cldr/resources/tailoring_importer.rb +203 -0
  10. data/lib/twitter_cldr/resources/tries_dumper.rb +43 -0
  11. data/lib/twitter_cldr/resources.rb +3 -2
  12. data/lib/twitter_cldr/version.rb +1 -1
  13. data/resources/collation/tries/af.dump +0 -0
  14. data/resources/collation/tries/ar.dump +0 -0
  15. data/resources/collation/tries/ca.dump +0 -0
  16. data/resources/collation/tries/cs.dump +0 -0
  17. data/resources/collation/tries/da.dump +0 -0
  18. data/resources/collation/tries/de.dump +0 -0
  19. data/resources/collation/tries/default.dump +0 -0
  20. data/resources/collation/tries/el.dump +0 -0
  21. data/resources/collation/tries/en.dump +0 -0
  22. data/resources/collation/tries/es.dump +0 -0
  23. data/resources/collation/tries/eu.dump +0 -0
  24. data/resources/collation/tries/fa.dump +0 -0
  25. data/resources/collation/tries/fi.dump +0 -0
  26. data/resources/collation/tries/fil.dump +0 -0
  27. data/resources/collation/tries/fr.dump +0 -0
  28. data/resources/collation/tries/he.dump +0 -0
  29. data/resources/collation/tries/hi.dump +0 -0
  30. data/resources/collation/tries/hu.dump +0 -0
  31. data/resources/collation/tries/id.dump +0 -0
  32. data/resources/collation/tries/it.dump +0 -0
  33. data/resources/collation/tries/ja.dump +0 -0
  34. data/resources/collation/tries/ko.dump +0 -0
  35. data/resources/collation/tries/ms.dump +0 -0
  36. data/resources/collation/tries/nb.dump +0 -0
  37. data/resources/collation/tries/nl.dump +0 -0
  38. data/resources/collation/tries/pl.dump +0 -0
  39. data/resources/collation/tries/pt.dump +0 -0
  40. data/resources/collation/tries/ru.dump +0 -0
  41. data/resources/collation/tries/sv.dump +0 -0
  42. data/resources/collation/tries/th.dump +0 -0
  43. data/resources/collation/tries/tr.dump +0 -0
  44. data/resources/collation/tries/uk.dump +0 -0
  45. data/resources/collation/tries/ur.dump +0 -0
  46. data/resources/collation/tries/zh-Hant.dump +0 -0
  47. data/resources/collation/tries/zh.dump +0 -0
  48. data/spec/collation/collation_spec.rb +4 -2
  49. data/spec/collation/collator_spec.rb +36 -30
  50. data/spec/collation/tailoring_spec.rb +3 -1
  51. data/spec/collation/tailoring_tests/ja.txt +6 -5
  52. data/spec/collation/trie_builder_spec.rb +21 -26
  53. data/spec/collation/trie_dumps_spec.rb +26 -0
  54. data/spec/collation/trie_loader_spec.rb +72 -0
  55. data/spec/collation/trie_spec.rb +14 -0
  56. data/spec/collation/trie_with_fallback_spec.rb +6 -0
  57. data/spec/normalization/normalization_spec.rb +2 -2
  58. metadata +43 -21
  59. data/lib/twitter_cldr/resources/import/tailoring.rb +0 -202
  60. data/lib/twitter_cldr/resources/import.rb +0 -12
@@ -1,202 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- # Copyright 2012 Twitter, Inc
4
- # http://www.apache.org/licenses/LICENSE-2.0
5
-
6
- require 'nokogiri'
7
- require 'yaml'
8
- require 'java'
9
-
10
- module TwitterCldr
11
- module Resources
12
- module Import
13
-
14
- # This class should be used with JRuby 1.7 in 1.9 mode and ICU4J version 49.1 (available at
15
- # http://download.icu-project.org/files/icu4j/49.1/icu4j-49_1.jar).
16
- #
17
- class Tailoring
18
-
19
- SUPPORTED_RULES = %w[p s t i pc sc tc ic x]
20
- SIMPLE_RULES = %w[p s t i]
21
- LEVEL_RULE_REGEXP = /^(p|s|t|i)(c?)$/
22
-
23
- IGNORED_TAGS = %w[reset text #comment]
24
-
25
- LAST_BYTE_MASK = 0xFF
26
-
27
- LOCALES_MAP = {
28
- :'zh-Hant' => :'zh_Hant',
29
- :id => :root,
30
- :it => :root,
31
- :ms => :root,
32
- :nl => :root,
33
- :pt => :root
34
- }
35
-
36
- EMPTY_TAILORING_DATA = { 'collator_options' => {}, 'tailored_table' => '', 'suppressed_contractions' => '' }
37
-
38
- class ImportError < RuntimeError; end
39
-
40
- # Arguments:
41
- #
42
- # input_path - path to a directory containing CLDR tailoring data (available at
43
- # http://unicode.org/cldr/trac/browser/tags/release-21/common/collation/
44
- # or as a part of CLDR release at http://cldr.unicode.org/index/downloads)
45
- #
46
- # output_path - output directory for imported YAML files
47
- #
48
- # icu4j_path - path to ICU4J jar file
49
- #
50
- def initialize(input_path, output_path, icu4j_path)
51
- require icu4j_path
52
-
53
- @input_path = input_path
54
- @output_path = output_path
55
- end
56
-
57
- def import(locale)
58
- print "Importing %8s\t--\t" % locale
59
-
60
- if tailoring_present?(locale)
61
- YAML.dump(tailoring_data(locale), open(resource_file_path(locale), 'w'))
62
- puts "Done."
63
- else
64
- YAML.dump(EMPTY_TAILORING_DATA, open(resource_file_path(locale), 'w'))
65
- puts "Missing (generated empty tailoring resource)."
66
- end
67
- rescue ImportError => e
68
- puts "Error: #{e.message}"
69
- end
70
-
71
- private
72
-
73
- def tailoring_present?(locale)
74
- File.file?(locale_file_path(locale))
75
- end
76
-
77
- def translated_locale(locale)
78
- LOCALES_MAP.fetch(locale, locale)
79
- end
80
-
81
- def locale_file_path(locale)
82
- File.join(@input_path, "#{translated_locale(locale)}.xml")
83
- end
84
-
85
- def resource_file_path(locale)
86
- File.join(@output_path, "#{locale}.yml")
87
- end
88
-
89
- def tailoring_data(locale)
90
- doc = Nokogiri::XML(open(locale_file_path(locale)))
91
- collations = doc.at_xpath('//collations')
92
-
93
- collation_alias = collations.at_xpath('alias[@path="//ldml/collations"]')
94
- aliased_locale = collation_alias && collation_alias.attr('source')
95
-
96
- return tailoring_data(aliased_locale) if aliased_locale
97
-
98
- standard_tailoring = collations.at_xpath('collation[@type="standard"]')
99
-
100
- {
101
- 'collator_options' => parse_collator_options(standard_tailoring),
102
- 'tailored_table' => parse_tailorings(standard_tailoring, locale),
103
- 'suppressed_contractions' => parse_suppressed_contractions(standard_tailoring)
104
- }
105
- end
106
-
107
- def parse_tailorings(data, locale)
108
- rules = data && data.at_xpath('rules')
109
-
110
- return '' unless rules
111
-
112
- collator = Java::ComIbmIcuText::Collator.get_instance(Java::JavaUtil::Locale.new(locale.to_s))
113
-
114
- rules.children.map do |child|
115
- validate_tailoring_rule(child)
116
-
117
- if child.name =~ LEVEL_RULE_REGEXP
118
- if $2.empty?
119
- table_entry_for_rule(collator, child.text)
120
- else
121
- child.text.chars.map { |char| table_entry_for_rule(collator, char) }
122
- end
123
- elsif child.name == 'x'
124
- context = ''
125
- child.children.each_with_object([]) do |c, memo|
126
- if SIMPLE_RULES.include?(c.name)
127
- memo << table_entry_for_rule(collator, context + c.text)
128
- elsif c.name == 'context'
129
- context = c.text
130
- elsif c.name != 'extend'
131
- raise ImportError, "Rule '#{c.name}' inside <x></x> is not supported."
132
- end
133
- end
134
- else
135
- raise ImportError, "Tag '#{child.name}' is not supported." unless IGNORED_TAGS.include?(child.name)
136
- end
137
- end.flatten.compact.join("\n")
138
- end
139
-
140
- def table_entry_for_rule(collator, tailored_value)
141
- code_points = get_code_points(tailored_value)
142
-
143
- collation_elements = get_collation_elements(collator, tailored_value).map do |ce|
144
- ce.map { |l| l.to_s(16).upcase }.join(', ')
145
- end
146
-
147
- "#{code_points.join(' ')}; [#{collation_elements.join('][')}]"
148
- end
149
-
150
- def parse_suppressed_contractions(data)
151
- node = data && data.at_xpath('suppress_contractions')
152
- node ? Java::ComIbmIcuText::UnicodeSet.to_array(Java::ComIbmIcuText::UnicodeSet.new(node.text)).to_a.join : ''
153
- end
154
-
155
- def parse_collator_options(data)
156
- options = {}
157
-
158
- if data
159
- case_first_setting = data.at_xpath('settings[@caseFirst]')
160
- options['case_first'] = case_first_setting.attr('caseFirst').to_sym if case_first_setting
161
- end
162
-
163
- options
164
- end
165
-
166
- def validate_tailoring_rule(rule)
167
- return if IGNORED_TAGS.include?(rule.name)
168
-
169
- raise ImportError, "Rule '#{rule.name}' is not supported." unless SUPPORTED_RULES.include?(rule.name)
170
- end
171
-
172
- def get_collation_elements(collator, string)
173
- iter = collator.get_collation_element_iterator(string)
174
-
175
- collation_elements = []
176
- ce = iter.next
177
-
178
- while ce != Java::ComIbmIcuText::CollationElementIterator::NULLORDER
179
- p1 = (ce >> 24) & LAST_BYTE_MASK
180
- p2 = (ce >> 16) & LAST_BYTE_MASK
181
-
182
- primary = p2.zero? ? p1 : (p1 << 8) + p2
183
- secondary = (ce >> 8) & LAST_BYTE_MASK
184
- tertiarly = ce & LAST_BYTE_MASK
185
-
186
- collation_elements << [primary, secondary, tertiarly]
187
-
188
- ce = iter.next
189
- end
190
-
191
- collation_elements
192
- end
193
-
194
- def get_code_points(string)
195
- TwitterCldr::Normalization::NFD.normalize_code_points(TwitterCldr::Utils::CodePoints.from_string(string))
196
- end
197
-
198
- end
199
-
200
- end
201
- end
202
- end
@@ -1,12 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- # Copyright 2012 Twitter, Inc
4
- # http://www.apache.org/licenses/LICENSE-2.0
5
-
6
- module TwitterCldr
7
- module Resources
8
- module Import
9
- autoload :Tailoring, 'twitter_cldr/resources/import/tailoring'
10
- end
11
- end
12
- end