twitter_cldr 1.6.1 → 1.6.2
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +4 -0
- data/Rakefile +8 -3
- data/lib/twitter_cldr/collation/collator.rb +9 -11
- data/lib/twitter_cldr/collation/trie.rb +8 -0
- data/lib/twitter_cldr/collation/trie_builder.rb +12 -15
- data/lib/twitter_cldr/collation/trie_loader.rb +50 -0
- data/lib/twitter_cldr/collation/trie_with_fallback.rb +6 -4
- data/lib/twitter_cldr/collation.rb +1 -0
- data/lib/twitter_cldr/resources/tailoring_importer.rb +203 -0
- data/lib/twitter_cldr/resources/tries_dumper.rb +43 -0
- data/lib/twitter_cldr/resources.rb +3 -2
- data/lib/twitter_cldr/version.rb +1 -1
- data/resources/collation/tries/af.dump +0 -0
- data/resources/collation/tries/ar.dump +0 -0
- data/resources/collation/tries/ca.dump +0 -0
- data/resources/collation/tries/cs.dump +0 -0
- data/resources/collation/tries/da.dump +0 -0
- data/resources/collation/tries/de.dump +0 -0
- data/resources/collation/tries/default.dump +0 -0
- data/resources/collation/tries/el.dump +0 -0
- data/resources/collation/tries/en.dump +0 -0
- data/resources/collation/tries/es.dump +0 -0
- data/resources/collation/tries/eu.dump +0 -0
- data/resources/collation/tries/fa.dump +0 -0
- data/resources/collation/tries/fi.dump +0 -0
- data/resources/collation/tries/fil.dump +0 -0
- data/resources/collation/tries/fr.dump +0 -0
- data/resources/collation/tries/he.dump +0 -0
- data/resources/collation/tries/hi.dump +0 -0
- data/resources/collation/tries/hu.dump +0 -0
- data/resources/collation/tries/id.dump +0 -0
- data/resources/collation/tries/it.dump +0 -0
- data/resources/collation/tries/ja.dump +0 -0
- data/resources/collation/tries/ko.dump +0 -0
- data/resources/collation/tries/ms.dump +0 -0
- data/resources/collation/tries/nb.dump +0 -0
- data/resources/collation/tries/nl.dump +0 -0
- data/resources/collation/tries/pl.dump +0 -0
- data/resources/collation/tries/pt.dump +0 -0
- data/resources/collation/tries/ru.dump +0 -0
- data/resources/collation/tries/sv.dump +0 -0
- data/resources/collation/tries/th.dump +0 -0
- data/resources/collation/tries/tr.dump +0 -0
- data/resources/collation/tries/uk.dump +0 -0
- data/resources/collation/tries/ur.dump +0 -0
- data/resources/collation/tries/zh-Hant.dump +0 -0
- data/resources/collation/tries/zh.dump +0 -0
- data/spec/collation/collation_spec.rb +4 -2
- data/spec/collation/collator_spec.rb +36 -30
- data/spec/collation/tailoring_spec.rb +3 -1
- data/spec/collation/tailoring_tests/ja.txt +6 -5
- data/spec/collation/trie_builder_spec.rb +21 -26
- data/spec/collation/trie_dumps_spec.rb +26 -0
- data/spec/collation/trie_loader_spec.rb +72 -0
- data/spec/collation/trie_spec.rb +14 -0
- data/spec/collation/trie_with_fallback_spec.rb +6 -0
- data/spec/normalization/normalization_spec.rb +2 -2
- metadata +43 -21
- data/lib/twitter_cldr/resources/import/tailoring.rb +0 -202
- data/lib/twitter_cldr/resources/import.rb +0 -12
@@ -1,202 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
# Copyright 2012 Twitter, Inc
|
4
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
-
|
6
|
-
require 'nokogiri'
|
7
|
-
require 'yaml'
|
8
|
-
require 'java'
|
9
|
-
|
10
|
-
module TwitterCldr
  module Resources
    module Import

      # Imports CLDR collation tailoring data for a locale and writes it out as a YAML resource
      # with three keys: 'collator_options', 'tailored_table' and 'suppressed_contractions'.
      #
      # This class should be used with JRuby 1.7 in 1.9 mode and ICU4J version 49.1 (available at
      # http://download.icu-project.org/files/icu4j/49.1/icu4j-49_1.jar).
      #
      class Tailoring

        # Names of all rule tags the importer knows how to handle.
        SUPPORTED_RULES   = %w[p s t i pc sc tc ic x].freeze
        # Rule tags that may appear inside an <x> rule.
        SIMPLE_RULES      = %w[p s t i].freeze
        # Matches level rules; the second group is 'c' for the per-character variants (pc, sc, tc, ic).
        LEVEL_RULE_REGEXP = /^(p|s|t|i)(c?)$/

        # Node names that are skipped without producing a table entry.
        IGNORED_TAGS = %w[reset text #comment].freeze

        LAST_BYTE_MASK = 0xFF

        # Maps a requested locale to the name of the CLDR file its tailoring is read from.
        LOCALES_MAP = {
          :'zh-Hant' => :'zh_Hant',
          :id        => :root,
          :it        => :root,
          :ms        => :root,
          :nl        => :root,
          :pt        => :root
        }.freeze

        # Written out when no CLDR tailoring file exists for a locale.
        EMPTY_TAILORING_DATA = { 'collator_options' => {}, 'tailored_table' => '', 'suppressed_contractions' => '' }.freeze

        class ImportError < RuntimeError; end

        # Arguments:
        #
        #   input_path  - path to a directory containing CLDR tailoring data (available at
        #                 http://unicode.org/cldr/trac/browser/tags/release-21/common/collation/
        #                 or as a part of CLDR release at http://cldr.unicode.org/index/downloads)
        #
        #   output_path - output directory for imported YAML files
        #
        #   icu4j_path  - path to ICU4J jar file
        #
        def initialize(input_path, output_path, icu4j_path)
          require icu4j_path

          @input_path  = input_path
          @output_path = output_path
        end

        # Imports the tailoring for +locale+ and writes "<output_path>/<locale>.yml".
        #
        # When no source file exists for the locale an empty tailoring resource is written instead.
        # An ImportError is reported on standard output rather than propagated; in that case the
        # output file is left untouched because the data is built before the file is opened.
        #
        def import(locale)
          print "Importing %8s\t--\t" % locale

          if tailoring_present?(locale)
            write_resource(locale, tailoring_data(locale))
            puts "Done."
          else
            write_resource(locale, EMPTY_TAILORING_DATA)
            puts "Missing (generated empty tailoring resource)."
          end
        rescue ImportError => e
          puts "Error: #{e.message}"
        end

        private

        # Dumps +data+ as YAML into the resource file of +locale+. The block form of File.open
        # guarantees that the file is flushed and closed as soon as the dump is finished.
        def write_resource(locale, data)
          File.open(resource_file_path(locale), 'w') { |file| YAML.dump(data, file) }
        end

        # Returns true if a CLDR source file exists for +locale+.
        def tailoring_present?(locale)
          File.file?(locale_file_path(locale))
        end

        # Returns the CLDR file name for +locale+ (the locale itself unless LOCALES_MAP overrides it).
        def translated_locale(locale)
          LOCALES_MAP.fetch(locale, locale)
        end

        # Path of the CLDR XML file the tailoring of +locale+ is read from.
        def locale_file_path(locale)
          File.join(@input_path, "#{translated_locale(locale)}.xml")
        end

        # Path of the YAML file the imported tailoring of +locale+ is written to.
        def resource_file_path(locale)
          File.join(@output_path, "#{locale}.yml")
        end

        # Parses the CLDR file of +locale+ and returns the tailoring data hash.
        #
        # If the file only aliases the collations of another locale, the data of that locale is
        # returned instead. Raises ImportError when the file has no <collations> element.
        #
        def tailoring_data(locale)
          # File.open (rather than Kernel#open) never spawns a subprocess for a path starting
          # with '|', and the block form closes the handle once the document has been parsed.
          doc = File.open(locale_file_path(locale)) { |file| Nokogiri::XML(file) }

          collations = doc.at_xpath('//collations')
          raise ImportError, "No <collations> element found in '#{locale_file_path(locale)}'." unless collations

          collation_alias = collations.at_xpath('alias[@path="//ldml/collations"]')
          aliased_locale  = collation_alias && collation_alias.attr('source')

          return tailoring_data(aliased_locale) if aliased_locale

          standard_tailoring = collations.at_xpath('collation[@type="standard"]')

          {
            'collator_options'        => parse_collator_options(standard_tailoring),
            'tailored_table'          => parse_tailorings(standard_tailoring, locale),
            'suppressed_contractions' => parse_suppressed_contractions(standard_tailoring)
          }
        end

        # Builds the tailored collation table (one entry per line) from the <rules> element of
        # +data+. Returns an empty string when +data+ is nil or has no <rules> element.
        #
        # Raises ImportError if an unsupported rule is encountered.
        #
        def parse_tailorings(data, locale)
          rules = data && data.at_xpath('rules')

          return '' unless rules

          collator = Java::ComIbmIcuText::Collator.get_instance(Java::JavaUtil::Locale.new(locale.to_s))

          rules.children.map do |child|
            validate_tailoring_rule(child)
            table_entries_for_node(collator, child)
          end.flatten.compact.join("\n")
        end

        # Returns the table entry (or array of entries) produced by a single child of <rules>,
        # or nil for ignored nodes.
        def table_entries_for_node(collator, node)
          level_rule = LEVEL_RULE_REGEXP.match(node.name)

          if level_rule
            if level_rule[2].empty?
              table_entry_for_rule(collator, node.text)
            else
              # pc/sc/tc/ic rules: every character of the text gets an entry of its own.
              node.text.chars.map { |char| table_entry_for_rule(collator, char) }
            end
          elsif node.name == 'x'
            table_entries_for_extension(collator, node)
          elsif !IGNORED_TAGS.include?(node.name)
            raise ImportError, "Tag '#{node.name}' is not supported."
          end
        end

        # Returns the table entries for the simple rules nested in an <x> rule. The text of a
        # <context> child is prepended to every simple rule that follows it; <extend> children
        # are skipped and any other child raises ImportError.
        def table_entries_for_extension(collator, node)
          context = ''

          node.children.each_with_object([]) do |child, entries|
            if SIMPLE_RULES.include?(child.name)
              entries << table_entry_for_rule(collator, context + child.text)
            elsif child.name == 'context'
              context = child.text
            elsif child.name != 'extend'
              raise ImportError, "Rule '#{child.name}' inside <x></x> is not supported."
            end
          end
        end

        # Formats one table line: the NFD code points of +tailored_value+, then its collation
        # elements as upper-case hex weights, e.g. "0061 0062; [15, 5, 1][16, 5, 1]".
        def table_entry_for_rule(collator, tailored_value)
          code_points = get_code_points(tailored_value)

          collation_elements = get_collation_elements(collator, tailored_value).map do |ce|
            ce.map { |level| level.to_s(16).upcase }.join(', ')
          end

          "#{code_points.join(' ')}; [#{collation_elements.join('][')}]"
        end

        # Expands the <suppress_contractions> set of +data+ with ICU's UnicodeSet and joins its
        # elements into one string. Returns '' when +data+ is nil or has no such element.
        def parse_suppressed_contractions(data)
          node = data && data.at_xpath('suppress_contractions')
          node ? Java::ComIbmIcuText::UnicodeSet.to_array(Java::ComIbmIcuText::UnicodeSet.new(node.text)).to_a.join : ''
        end

        # Extracts collator options from +data+. Only the caseFirst setting is read; it is stored
        # as a symbol under 'case_first'. Returns an empty hash when nothing is set.
        def parse_collator_options(data)
          options = {}

          if data
            case_first_setting = data.at_xpath('settings[@caseFirst]')
            options['case_first'] = case_first_setting.attr('caseFirst').to_sym if case_first_setting
          end

          options
        end

        # Raises ImportError unless +rule+ is an ignored node or one of SUPPORTED_RULES.
        def validate_tailoring_rule(rule)
          return if IGNORED_TAGS.include?(rule.name)

          raise ImportError, "Rule '#{rule.name}' is not supported." unless SUPPORTED_RULES.include?(rule.name)
        end

        # Returns the collation elements of +string+ as an array of [primary, secondary, tertiary]
        # triples, read from the collator's element iterator until it yields NULLORDER.
        def get_collation_elements(collator, string)
          iter = collator.get_collation_element_iterator(string)

          collation_elements = []
          ce = iter.next

          while ce != Java::ComIbmIcuText::CollationElementIterator::NULLORDER
            p1 = (ce >> 24) & LAST_BYTE_MASK
            p2 = (ce >> 16) & LAST_BYTE_MASK

            # The primary weight is the top two bytes, or only the first one when the second is zero.
            primary   = p2.zero? ? p1 : (p1 << 8) + p2
            secondary = (ce >> 8) & LAST_BYTE_MASK
            tertiary  = ce & LAST_BYTE_MASK

            collation_elements << [primary, secondary, tertiary]

            ce = iter.next
          end

          collation_elements
        end

        # Returns the NFD-normalized code points of +string+.
        def get_code_points(string)
          TwitterCldr::Normalization::NFD.normalize_code_points(TwitterCldr::Utils::CodePoints.from_string(string))
        end

      end

    end
  end
end
|