twitter_cldr 1.6.1 → 1.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/Rakefile +8 -3
- data/lib/twitter_cldr/collation/collator.rb +9 -11
- data/lib/twitter_cldr/collation/trie.rb +8 -0
- data/lib/twitter_cldr/collation/trie_builder.rb +12 -15
- data/lib/twitter_cldr/collation/trie_loader.rb +50 -0
- data/lib/twitter_cldr/collation/trie_with_fallback.rb +6 -4
- data/lib/twitter_cldr/collation.rb +1 -0
- data/lib/twitter_cldr/resources/tailoring_importer.rb +203 -0
- data/lib/twitter_cldr/resources/tries_dumper.rb +43 -0
- data/lib/twitter_cldr/resources.rb +3 -2
- data/lib/twitter_cldr/version.rb +1 -1
- data/resources/collation/tries/af.dump +0 -0
- data/resources/collation/tries/ar.dump +0 -0
- data/resources/collation/tries/ca.dump +0 -0
- data/resources/collation/tries/cs.dump +0 -0
- data/resources/collation/tries/da.dump +0 -0
- data/resources/collation/tries/de.dump +0 -0
- data/resources/collation/tries/default.dump +0 -0
- data/resources/collation/tries/el.dump +0 -0
- data/resources/collation/tries/en.dump +0 -0
- data/resources/collation/tries/es.dump +0 -0
- data/resources/collation/tries/eu.dump +0 -0
- data/resources/collation/tries/fa.dump +0 -0
- data/resources/collation/tries/fi.dump +0 -0
- data/resources/collation/tries/fil.dump +0 -0
- data/resources/collation/tries/fr.dump +0 -0
- data/resources/collation/tries/he.dump +0 -0
- data/resources/collation/tries/hi.dump +0 -0
- data/resources/collation/tries/hu.dump +0 -0
- data/resources/collation/tries/id.dump +0 -0
- data/resources/collation/tries/it.dump +0 -0
- data/resources/collation/tries/ja.dump +0 -0
- data/resources/collation/tries/ko.dump +0 -0
- data/resources/collation/tries/ms.dump +0 -0
- data/resources/collation/tries/nb.dump +0 -0
- data/resources/collation/tries/nl.dump +0 -0
- data/resources/collation/tries/pl.dump +0 -0
- data/resources/collation/tries/pt.dump +0 -0
- data/resources/collation/tries/ru.dump +0 -0
- data/resources/collation/tries/sv.dump +0 -0
- data/resources/collation/tries/th.dump +0 -0
- data/resources/collation/tries/tr.dump +0 -0
- data/resources/collation/tries/uk.dump +0 -0
- data/resources/collation/tries/ur.dump +0 -0
- data/resources/collation/tries/zh-Hant.dump +0 -0
- data/resources/collation/tries/zh.dump +0 -0
- data/spec/collation/collation_spec.rb +4 -2
- data/spec/collation/collator_spec.rb +36 -30
- data/spec/collation/tailoring_spec.rb +3 -1
- data/spec/collation/tailoring_tests/ja.txt +6 -5
- data/spec/collation/trie_builder_spec.rb +21 -26
- data/spec/collation/trie_dumps_spec.rb +26 -0
- data/spec/collation/trie_loader_spec.rb +72 -0
- data/spec/collation/trie_spec.rb +14 -0
- data/spec/collation/trie_with_fallback_spec.rb +6 -0
- data/spec/normalization/normalization_spec.rb +2 -2
- metadata +43 -21
- data/lib/twitter_cldr/resources/import/tailoring.rb +0 -202
- data/lib/twitter_cldr/resources/import.rb +0 -12
@@ -1,202 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
# Copyright 2012 Twitter, Inc
|
4
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
-
|
6
|
-
require 'nokogiri'
|
7
|
-
require 'yaml'
|
8
|
-
require 'java'
|
9
|
-
|
10
|
-
module TwitterCldr
  module Resources
    module Import

      # Converts CLDR collation tailoring XML files into YAML resources, one file per locale.
      #
      # Each generated resource is a hash with three keys: 'collator_options',
      # 'tailored_table' and 'suppressed_contractions'.
      #
      # This class should be used with JRuby 1.7 in 1.9 mode and ICU4J version 49.1 (available at
      # http://download.icu-project.org/files/icu4j/49.1/icu4j-49_1.jar).
      #
      class Tailoring

        # Tag names accepted as direct children of a <rules> element.
        SUPPORTED_RULES   = %w[p s t i pc sc tc ic x].freeze
        # Rules accepted inside an <x> element.
        SIMPLE_RULES      = %w[p s t i].freeze
        # Matches a level rule; the second group is 'c' for the compact form (pc, sc, tc, ic),
        # whose text is expanded into one table entry per character.
        LEVEL_RULE_REGEXP = /^(p|s|t|i)(c?)$/

        # Node names that are skipped silently while walking <rules> children.
        IGNORED_TAGS = %w[reset text #comment].freeze

        LAST_BYTE_MASK = 0xFF

        # Maps a requested locale to the name of the CLDR file that is read for it.
        LOCALES_MAP = {
          :'zh-Hant' => :'zh_Hant',
          :id        => :root,
          :it        => :root,
          :ms        => :root,
          :nl        => :root,
          :pt        => :root
        }.freeze

        # Resource written for locales that have no CLDR tailoring file.
        EMPTY_TAILORING_DATA = { 'collator_options' => {}, 'tailored_table' => '', 'suppressed_contractions' => '' }.freeze

        # Raised for tailoring data this importer cannot handle; rescued and printed by #import.
        class ImportError < RuntimeError; end

        # Arguments:
        #
        #   input_path  - path to a directory containing CLDR tailoring data (available at
        #                 http://unicode.org/cldr/trac/browser/tags/release-21/common/collation/
        #                 or as a part of CLDR release at http://cldr.unicode.org/index/downloads)
        #
        #   output_path - output directory for imported YAML files
        #
        #   icu4j_path  - path to ICU4J jar file
        #
        def initialize(input_path, output_path, icu4j_path)
          # Loaded here rather than at the top of the file because the jar location is only
          # known once the caller supplies it.
          require icu4j_path

          @input_path  = input_path
          @output_path = output_path
        end

        # Imports the tailoring for +locale+ and writes it to "<output_path>/<locale>.yml".
        #
        # When no CLDR file exists for the locale an empty tailoring resource is written
        # instead. Progress and ImportError messages are printed to standard output; an
        # ImportError is not propagated to the caller.
        def import(locale)
          print "Importing %8s\t--\t" % locale

          if tailoring_present?(locale)
            dump_resource(locale, tailoring_data(locale))
            puts "Done."
          else
            dump_resource(locale, EMPTY_TAILORING_DATA)
            puts "Missing (generated empty tailoring resource)."
          end
        rescue ImportError => e
          puts "Error: #{e.message}"
        end

        private

        # Serializes +data+ as YAML into the resource file for +locale+.
        # The block form of File.open guarantees the file is flushed and closed.
        def dump_resource(locale, data)
          File.open(resource_file_path(locale), 'w') { |file| YAML.dump(data, file) }
        end

        # True when a CLDR tailoring file exists for +locale+.
        def tailoring_present?(locale)
          File.file?(locale_file_path(locale))
        end

        # Returns the CLDR file name for +locale+, or +locale+ itself when it is not remapped.
        def translated_locale(locale)
          LOCALES_MAP.fetch(locale, locale)
        end

        # Path of the CLDR XML file that is read for +locale+.
        def locale_file_path(locale)
          File.join(@input_path, "#{translated_locale(locale)}.xml")
        end

        # Path of the YAML file that is written for +locale+.
        def resource_file_path(locale)
          File.join(@output_path, "#{locale}.yml")
        end

        # Parses the CLDR file for +locale+ and returns the resource hash.
        #
        # If the file aliases its whole <collations> section to another locale, the data
        # of that locale is returned instead.
        #
        # Raises ImportError when the file has no <collations> element.
        def tailoring_data(locale)
          path = locale_file_path(locale)
          # The document is parsed completely inside the block, so the handle can be closed right away.
          doc = File.open(path) { |file| Nokogiri::XML(file) }

          collations = doc.at_xpath('//collations')
          raise ImportError, "No <collations> element found in '#{path}'." unless collations

          collation_alias = collations.at_xpath('alias[@path="//ldml/collations"]')
          aliased_locale  = collation_alias && collation_alias.attr('source')

          return tailoring_data(aliased_locale) if aliased_locale

          standard_tailoring = collations.at_xpath('collation[@type="standard"]')

          {
            'collator_options'        => parse_collator_options(standard_tailoring),
            'tailored_table'          => parse_tailorings(standard_tailoring, locale),
            'suppressed_contractions' => parse_suppressed_contractions(standard_tailoring)
          }
        end

        # Builds the tailored table text: one entry per line for every supported rule found
        # under <rules>. Returns '' when +data+ is nil or has no <rules> element.
        #
        # Raises ImportError for unsupported rules.
        def parse_tailorings(data, locale)
          rules = data && data.at_xpath('rules')

          return '' unless rules

          collator = Java::ComIbmIcuText::Collator.get_instance(Java::JavaUtil::Locale.new(locale.to_s))

          rules.children.map { |rule| table_entries_for(collator, rule) }.flatten.compact.join("\n")
        end

        # Returns the table entry (or array of entries) for a single child of <rules>,
        # or nil for ignored nodes.
        def table_entries_for(collator, rule)
          validate_tailoring_rule(rule)

          level_rule = LEVEL_RULE_REGEXP.match(rule.name)

          if level_rule
            if level_rule[2].empty?
              table_entry_for_rule(collator, rule.text)
            else
              # Compact form: every character of the text is a separate tailored value.
              rule.text.chars.map { |char| table_entry_for_rule(collator, char) }
            end
          elsif rule.name == 'x'
            extension_entries(collator, rule)
          end
          # Anything else that passed validation is an ignored node and yields nil.
        end

        # Returns the table entries for the simple rules nested in an <x> element. The text
        # of a preceding <context> child is prepended to the text of each following rule;
        # <extend> children are skipped.
        #
        # Raises ImportError for any other nested tag.
        def extension_entries(collator, extension)
          context = ''

          extension.children.each_with_object([]) do |nested, entries|
            if SIMPLE_RULES.include?(nested.name)
              entries << table_entry_for_rule(collator, context + nested.text)
            elsif nested.name == 'context'
              context = nested.text
            elsif nested.name != 'extend'
              raise ImportError, "Rule '#{nested.name}' inside <x></x> is not supported."
            end
          end
        end

        # Formats one table line: "<code points>; [<collation element>][...]", with every
        # collation element weight written as upper-case hexadecimal.
        def table_entry_for_rule(collator, tailored_value)
          code_points = get_code_points(tailored_value)

          collation_elements = get_collation_elements(collator, tailored_value).map do |weights|
            weights.map { |weight| weight.to_s(16).upcase }.join(', ')
          end

          "#{code_points.join(' ')}; [#{collation_elements.join('][')}]"
        end

        # Expands the <suppress_contractions> UnicodeSet pattern into a string of its
        # members; '' when +data+ is nil or has no such element.
        def parse_suppressed_contractions(data)
          node = data && data.at_xpath('suppress_contractions')
          return '' unless node

          Java::ComIbmIcuText::UnicodeSet.to_array(Java::ComIbmIcuText::UnicodeSet.new(node.text)).to_a.join
        end

        # Returns the collator options hash; only the caseFirst setting is extracted.
        def parse_collator_options(data)
          options = {}
          return options unless data

          case_first_setting = data.at_xpath('settings[@caseFirst]')
          options['case_first'] = case_first_setting.attr('caseFirst').to_sym if case_first_setting

          options
        end

        # Raises ImportError unless +rule+ is an ignored node or a supported rule.
        def validate_tailoring_rule(rule)
          return if IGNORED_TAGS.include?(rule.name)

          raise ImportError, "Rule '#{rule.name}' is not supported." unless SUPPORTED_RULES.include?(rule.name)
        end

        # Returns the collation elements +collator+ produces for +string+ as an array of
        # [primary, secondary, tertiary] integer triples.
        def get_collation_elements(collator, string)
          iter = collator.get_collation_element_iterator(string)

          collation_elements = []

          while (ce = iter.next) != Java::ComIbmIcuText::CollationElementIterator::NULLORDER
            high_byte = (ce >> 24) & LAST_BYTE_MASK
            low_byte  = (ce >> 16) & LAST_BYTE_MASK

            # The two upper bytes form the primary weight; a zero second byte is dropped.
            primary   = low_byte.zero? ? high_byte : (high_byte << 8) + low_byte
            secondary = (ce >> 8) & LAST_BYTE_MASK
            tertiary  = ce & LAST_BYTE_MASK

            collation_elements << [primary, secondary, tertiary]
          end

          collation_elements
        end

        # Returns the NFD-normalized code points of +string+.
        def get_code_points(string)
          TwitterCldr::Normalization::NFD.normalize_code_points(TwitterCldr::Utils::CodePoints.from_string(string))
        end

      end

    end
  end
end
|