twitter_cldr 4.0.0 → 4.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +3 -2
- data/README.md +18 -2
- data/Rakefile +39 -122
- data/lib/twitter_cldr.rb +3 -0
- data/lib/twitter_cldr/formatters/numbers/rbnf.rb +5 -1
- data/lib/twitter_cldr/resources.rb +86 -5
- data/lib/twitter_cldr/resources/bidi_test_importer.rb +50 -44
- data/lib/twitter_cldr/resources/casefolder_class_generator.rb +22 -13
- data/lib/twitter_cldr/resources/collation_tries_importer.rb +44 -0
- data/lib/twitter_cldr/resources/hyphenation_importer.rb +16 -42
- data/lib/twitter_cldr/resources/import_resolver.rb +71 -0
- data/lib/twitter_cldr/resources/importer.rb +107 -0
- data/lib/twitter_cldr/resources/language_codes_importer.rb +35 -38
- data/lib/twitter_cldr/resources/loader.rb +3 -3
- data/lib/twitter_cldr/resources/locales_resources_importer.rb +48 -35
- data/lib/twitter_cldr/resources/phone_codes_importer.rb +24 -23
- data/lib/twitter_cldr/resources/postal_codes_importer.rb +10 -11
- data/lib/twitter_cldr/resources/properties.rb +0 -4
- data/lib/twitter_cldr/resources/properties/age_property_importer.rb +13 -9
- data/lib/twitter_cldr/resources/properties/arabic_shaping_property_importer.rb +9 -11
- data/lib/twitter_cldr/resources/properties/bidi_brackets_property_importer.rb +11 -9
- data/lib/twitter_cldr/resources/properties/blocks_property_importer.rb +13 -9
- data/lib/twitter_cldr/resources/properties/derived_core_properties_importer.rb +9 -11
- data/lib/twitter_cldr/resources/properties/east_asian_width_property_importer.rb +13 -9
- data/lib/twitter_cldr/resources/properties/grapheme_break_property_importer.rb +13 -9
- data/lib/twitter_cldr/resources/properties/hangul_syllable_type_property_importer.rb +13 -9
- data/lib/twitter_cldr/resources/properties/indic_positional_category_property_importer.rb +13 -9
- data/lib/twitter_cldr/resources/properties/indic_syllabic_category_property_importer.rb +13 -9
- data/lib/twitter_cldr/resources/properties/jamo_property_importer.rb +13 -9
- data/lib/twitter_cldr/resources/properties/line_break_property_importer.rb +13 -9
- data/lib/twitter_cldr/resources/properties/prop_list_importer.rb +9 -11
- data/lib/twitter_cldr/resources/properties/property_importer.rb +13 -22
- data/lib/twitter_cldr/resources/properties/script_extensions_property_importer.rb +12 -10
- data/lib/twitter_cldr/resources/properties/script_property_importer.rb +13 -9
- data/lib/twitter_cldr/resources/properties/sentence_break_property_importer.rb +13 -9
- data/lib/twitter_cldr/resources/properties/unicode_data_properties_importer.rb +11 -9
- data/lib/twitter_cldr/resources/properties/word_break_property_importer.rb +13 -9
- data/lib/twitter_cldr/resources/rbnf_test_importer.rb +41 -38
- data/lib/twitter_cldr/resources/readme_renderer.rb +1 -2
- data/lib/twitter_cldr/resources/requirements.rb +18 -0
- data/lib/twitter_cldr/resources/requirements/cldr_requirement.rb +66 -0
- data/lib/twitter_cldr/resources/requirements/dependency_requirement.rb +23 -0
- data/lib/twitter_cldr/resources/requirements/git_requirement.rb +66 -0
- data/lib/twitter_cldr/resources/requirements/icu_requirement.rb +111 -0
- data/lib/twitter_cldr/resources/requirements/unicode_requirement.rb +51 -0
- data/lib/twitter_cldr/resources/segment_tests_importer.rb +15 -30
- data/lib/twitter_cldr/resources/tailoring_importer.rb +33 -26
- data/lib/twitter_cldr/resources/transform_test_importer.rb +15 -17
- data/lib/twitter_cldr/resources/uli/segment_exceptions_importer.rb +29 -17
- data/lib/twitter_cldr/resources/unicode_data_importer.rb +38 -31
- data/lib/twitter_cldr/resources/unicode_file_parser.rb +37 -0
- data/lib/twitter_cldr/resources/unicode_property_aliases_importer.rb +23 -27
- data/lib/twitter_cldr/shared/casefolder.rb +139 -115
- data/lib/twitter_cldr/version.rb +1 -1
- data/lib/twitter_cldr/versions.rb +0 -4
- data/resources/collation/tailoring/bo.yml +4 -0
- data/resources/collation/tries/bo.dump +0 -0
- data/resources/locales/bo/calendars.yml +247 -0
- data/resources/locales/bo/currencies.yml +208 -0
- data/resources/locales/bo/fields.yml +31 -0
- data/resources/locales/bo/languages.yml +24 -0
- data/resources/locales/bo/layout.yml +5 -0
- data/resources/locales/bo/lists.yml +12 -0
- data/resources/locales/bo/numbers.yml +111 -0
- data/resources/locales/bo/plural_rules.yml +6 -0
- data/resources/locales/bo/plurals.yml +12 -0
- data/resources/locales/bo/territories.yml +14 -0
- data/resources/locales/bo/units.yml +283 -0
- data/resources/shared/transforms/Arab-Latn.yml +109 -0
- data/resources/shared/transforms/Beng-Deva.yml +13 -0
- data/resources/shared/transforms/Beng-Gujr.yml +13 -0
- data/resources/shared/transforms/Beng-Guru.yml +13 -0
- data/resources/shared/transforms/Beng-Knda.yml +13 -0
- data/resources/shared/transforms/Beng-Latn.yml +13 -0
- data/resources/shared/transforms/Beng-Mlym.yml +13 -0
- data/resources/shared/transforms/Beng-Orya.yml +13 -0
- data/resources/shared/transforms/Beng-Taml.yml +13 -0
- data/resources/shared/transforms/Beng-Telu.yml +13 -0
- data/resources/shared/transforms/Cyrl-Latn.yml +128 -0
- data/resources/shared/transforms/Deva-Beng.yml +13 -0
- data/resources/shared/transforms/Deva-Gujr.yml +13 -0
- data/resources/shared/transforms/Deva-Guru.yml +13 -0
- data/resources/shared/transforms/Deva-Knda.yml +13 -0
- data/resources/shared/transforms/Deva-Latn.yml +13 -0
- data/resources/shared/transforms/Deva-Mlym.yml +13 -0
- data/resources/shared/transforms/Deva-Orya.yml +13 -0
- data/resources/shared/transforms/Deva-Taml.yml +13 -0
- data/resources/shared/transforms/Deva-Telu.yml +13 -0
- data/resources/shared/transforms/Geor-Latn.yml +43 -0
- data/resources/shared/transforms/Grek-Latn-UNGEGN.yml +160 -0
- data/resources/shared/transforms/Grek-Latn.yml +206 -0
- data/resources/shared/transforms/Gujr-Beng.yml +13 -0
- data/resources/shared/transforms/Gujr-Deva.yml +13 -0
- data/resources/shared/transforms/Gujr-Guru.yml +13 -0
- data/resources/shared/transforms/Gujr-Knda.yml +13 -0
- data/resources/shared/transforms/Gujr-Latn.yml +13 -0
- data/resources/shared/transforms/Gujr-Mlym.yml +13 -0
- data/resources/shared/transforms/Gujr-Orya.yml +13 -0
- data/resources/shared/transforms/Gujr-Taml.yml +13 -0
- data/resources/shared/transforms/Gujr-Telu.yml +13 -0
- data/resources/shared/transforms/Guru-Beng.yml +13 -0
- data/resources/shared/transforms/Guru-Deva.yml +13 -0
- data/resources/shared/transforms/Guru-Gujr.yml +13 -0
- data/resources/shared/transforms/Guru-Knda.yml +13 -0
- data/resources/shared/transforms/Guru-Latn.yml +13 -0
- data/resources/shared/transforms/Guru-Mlym.yml +13 -0
- data/resources/shared/transforms/Guru-Orya.yml +13 -0
- data/resources/shared/transforms/Guru-Taml.yml +13 -0
- data/resources/shared/transforms/Guru-Telu.yml +13 -0
- data/resources/shared/transforms/Han-Spacedhan.yml +1 -1
- data/resources/shared/transforms/Hang-Latn.yml +12 -0
- data/resources/shared/transforms/Hani-Latn.yml +1605 -0
- data/resources/shared/transforms/Hans-Hant.yml +3982 -0
- data/resources/shared/transforms/Hebr-Latn.yml +72 -0
- data/resources/shared/transforms/Hira-Kana.yml +114 -0
- data/resources/shared/transforms/Hira-Latn.yml +15 -0
- data/resources/shared/transforms/InterIndic-Latin.yml +2 -2
- data/resources/shared/transforms/Jamo-Latn.yml +12 -0
- data/resources/shared/transforms/Knda-Beng.yml +13 -0
- data/resources/shared/transforms/Knda-Deva.yml +13 -0
- data/resources/shared/transforms/Knda-Gujr.yml +13 -0
- data/resources/shared/transforms/Knda-Guru.yml +13 -0
- data/resources/shared/transforms/Knda-Latn.yml +13 -0
- data/resources/shared/transforms/Knda-Mlym.yml +13 -0
- data/resources/shared/transforms/Knda-Orya.yml +13 -0
- data/resources/shared/transforms/Knda-Taml.yml +13 -0
- data/resources/shared/transforms/Knda-Telu.yml +13 -0
- data/resources/shared/transforms/Latin-ASCII.yml +16 -1
- data/resources/shared/transforms/Latin-InterIndic.yml +2 -2
- data/resources/shared/transforms/Latn-Armn.yml +90 -0
- data/resources/shared/transforms/Latn-Beng.yml +14 -0
- data/resources/shared/transforms/Latn-Bopo.yml +1336 -0
- data/resources/shared/transforms/Latn-Cans.yml +190 -0
- data/resources/shared/transforms/Latn-Deva.yml +14 -0
- data/resources/shared/transforms/Latn-Ethi.yml +278 -0
- data/resources/shared/transforms/Latn-Gujr.yml +14 -0
- data/resources/shared/transforms/Latn-Guru.yml +14 -0
- data/resources/shared/transforms/Latn-Hang.yml +13 -0
- data/resources/shared/transforms/Latn-Jamo.yml +13 -0
- data/resources/shared/transforms/Latn-Kana.yml +274 -0
- data/resources/shared/transforms/Latn-Knda.yml +14 -0
- data/resources/shared/transforms/Latn-Mlym.yml +14 -0
- data/resources/shared/transforms/Latn-Orya.yml +14 -0
- data/resources/shared/transforms/Latn-Taml.yml +14 -0
- data/resources/shared/transforms/Latn-Telu.yml +14 -0
- data/resources/shared/transforms/Latn-Thaa.yml +439 -0
- data/resources/shared/transforms/Latn-Thai.yml +13 -0
- data/resources/shared/transforms/Mlym-Beng.yml +13 -0
- data/resources/shared/transforms/Mlym-Deva.yml +13 -0
- data/resources/shared/transforms/Mlym-Gujr.yml +13 -0
- data/resources/shared/transforms/Mlym-Guru.yml +13 -0
- data/resources/shared/transforms/Mlym-Knda.yml +13 -0
- data/resources/shared/transforms/Mlym-Latn.yml +13 -0
- data/resources/shared/transforms/Mlym-Orya.yml +13 -0
- data/resources/shared/transforms/Mlym-Taml.yml +13 -0
- data/resources/shared/transforms/Mlym-Telu.yml +13 -0
- data/resources/shared/transforms/Orya-Beng.yml +13 -0
- data/resources/shared/transforms/Orya-Deva.yml +13 -0
- data/resources/shared/transforms/Orya-Gujr.yml +13 -0
- data/resources/shared/transforms/Orya-Guru.yml +13 -0
- data/resources/shared/transforms/Orya-Knda.yml +13 -0
- data/resources/shared/transforms/Orya-Latn.yml +13 -0
- data/resources/shared/transforms/Orya-Mlym.yml +13 -0
- data/resources/shared/transforms/Orya-Taml.yml +13 -0
- data/resources/shared/transforms/Orya-Telu.yml +13 -0
- data/resources/shared/transforms/Syrc-Latn.yml +55 -0
- data/resources/shared/transforms/Taml-Beng.yml +13 -0
- data/resources/shared/transforms/Taml-Deva.yml +13 -0
- data/resources/shared/transforms/Taml-Gujr.yml +13 -0
- data/resources/shared/transforms/Taml-Guru.yml +13 -0
- data/resources/shared/transforms/Taml-Knda.yml +13 -0
- data/resources/shared/transforms/Taml-Latn.yml +13 -0
- data/resources/shared/transforms/Taml-Mlym.yml +13 -0
- data/resources/shared/transforms/Taml-Orya.yml +13 -0
- data/resources/shared/transforms/Taml-Telu.yml +13 -0
- data/resources/shared/transforms/Telu-Beng.yml +13 -0
- data/resources/shared/transforms/Telu-Deva.yml +13 -0
- data/resources/shared/transforms/Telu-Gujr.yml +13 -0
- data/resources/shared/transforms/Telu-Guru.yml +13 -0
- data/resources/shared/transforms/Telu-Knda.yml +13 -0
- data/resources/shared/transforms/Telu-Latn.yml +13 -0
- data/resources/shared/transforms/Telu-Mlym.yml +13 -0
- data/resources/shared/transforms/Telu-Orya.yml +13 -0
- data/resources/shared/transforms/Telu-Taml.yml +13 -0
- data/resources/shared/transforms/Thai-Latn.yml +15 -0
- data/resources/shared/transforms/am-am_FONIPA.yml +609 -0
- data/resources/shared/transforms/am-am_Latn-BGN.yml +336 -0
- data/resources/shared/transforms/am-ar.yml +11 -0
- data/resources/shared/transforms/am-fa.yml +10 -0
- data/resources/shared/transforms/ar-ar_Latn-BGN.yml +122 -0
- data/resources/shared/transforms/az_Cyrl-az-BGN.yml +93 -0
- data/resources/shared/transforms/be-be_Latn-BGN.yml +108 -0
- data/resources/shared/transforms/bg-bg_Latn-BGN.yml +99 -0
- data/resources/shared/transforms/ch-am.yml +10 -0
- data/resources/shared/transforms/ch-ar.yml +10 -0
- data/resources/shared/transforms/ch-ch_FONIPA.yml +0 -8
- data/resources/shared/transforms/ch-fa.yml +10 -0
- data/resources/shared/transforms/cs-am.yml +10 -0
- data/resources/shared/transforms/cs-ar.yml +10 -0
- data/resources/shared/transforms/cs-fa.yml +10 -0
- data/resources/shared/transforms/dsb-dsb_FONIPA.yml +0 -5
- data/resources/shared/transforms/dv-dv_Latn-BGN.yml +112 -0
- data/resources/shared/transforms/el-el_Latn-BGN.yml +208 -0
- data/resources/shared/transforms/eo-am.yml +10 -0
- data/resources/shared/transforms/eo-ar.yml +10 -0
- data/resources/shared/transforms/eo-eo_FONIPA.yml +52 -0
- data/resources/shared/transforms/eo-fa.yml +10 -0
- data/resources/shared/transforms/es-ar.yml +13 -0
- data/resources/shared/transforms/es-fa.yml +13 -0
- data/resources/shared/transforms/es_419-am.yml +11 -0
- data/resources/shared/transforms/es_419-ar.yml +14 -0
- data/resources/shared/transforms/es_419-fa.yml +14 -0
- data/resources/shared/transforms/fa-fa_Latn-BGN.yml +123 -0
- data/resources/shared/transforms/he-he_Latn-BGN.yml +62 -0
- data/resources/shared/transforms/hy-am.yml +10 -0
- data/resources/shared/transforms/hy-ar.yml +10 -0
- data/resources/shared/transforms/hy-fa.yml +10 -0
- data/resources/shared/transforms/hy-hy_FONIPA.yml +56 -0
- data/resources/shared/transforms/hy-hy_Latn-BGN.yml +133 -0
- data/resources/shared/transforms/hy_AREVMDA-am.yml +10 -0
- data/resources/shared/transforms/hy_AREVMDA-ar.yml +10 -0
- data/resources/shared/transforms/hy_AREVMDA-fa.yml +10 -0
- data/resources/shared/transforms/hy_AREVMDA-hy_AREVMDA_FONIPA.yml +82 -0
- data/resources/shared/transforms/ia-am.yml +10 -0
- data/resources/shared/transforms/ia-ar.yml +10 -0
- data/resources/shared/transforms/ia-fa.yml +10 -0
- data/resources/shared/transforms/ia-ia_FONIPA.yml +69 -0
- data/resources/shared/transforms/ja_Hrkt-ja_Latn-BGN.yml +310 -0
- data/resources/shared/transforms/ka-ka_Latn-BGN.yml +44 -0
- data/resources/shared/transforms/kk-am.yml +10 -0
- data/resources/shared/transforms/kk-ar.yml +10 -0
- data/resources/shared/transforms/kk-fa.yml +10 -0
- data/resources/shared/transforms/kk-kk_FONIPA.yml +53 -0
- data/resources/shared/transforms/kk-kk_Latn-BGN.yml +136 -0
- data/resources/shared/transforms/ko-ko_Latn-BGN.yml +282 -0
- data/resources/shared/transforms/ky-am.yml +10 -0
- data/resources/shared/transforms/ky-ar.yml +10 -0
- data/resources/shared/transforms/ky-fa.yml +10 -0
- data/resources/shared/transforms/ky-ky_FONIPA.yml +73 -0
- data/resources/shared/transforms/ky-ky_Latn-BGN.yml +107 -0
- data/resources/shared/transforms/la-la_FONIPA.yml +0 -8
- data/resources/shared/transforms/mk-mk_Latn-BGN.yml +89 -0
- data/resources/shared/transforms/mn-mn_Latn-BGN.yml +101 -0
- data/resources/shared/transforms/mn-mn_Latn-MNS.yml +89 -0
- data/resources/shared/transforms/my-am.yml +10 -0
- data/resources/shared/transforms/my-ar.yml +10 -0
- data/resources/shared/transforms/my-fa.yml +10 -0
- data/resources/shared/transforms/my-my_FONIPA.yml +260 -0
- data/resources/shared/transforms/pl-am.yml +10 -0
- data/resources/shared/transforms/pl-ar.yml +10 -0
- data/resources/shared/transforms/pl-fa.yml +10 -0
- data/resources/shared/transforms/ps-ps_Latn-BGN.yml +151 -0
- data/resources/shared/transforms/rm_SURSILV-am.yml +10 -0
- data/resources/shared/transforms/rm_SURSILV-ar.yml +10 -0
- data/resources/shared/transforms/rm_SURSILV-fa.yml +10 -0
- data/resources/shared/transforms/rm_SURSILV-rm_FONIPA_SURSILV.yml +84 -0
- data/resources/shared/transforms/ro-am.yml +10 -0
- data/resources/shared/transforms/ro-ar.yml +10 -0
- data/resources/shared/transforms/ro-fa.yml +10 -0
- data/resources/shared/transforms/ro-ro_FONIPA.yml +38 -6
- data/resources/shared/transforms/ro_FONIPA-ja.yml +1 -0
- data/resources/shared/transforms/ru-ru_Latn-BGN.yml +121 -0
- data/resources/shared/transforms/ru_Latn-ru-BGN.yml +101 -0
- data/resources/shared/transforms/sat-am.yml +10 -0
- data/resources/shared/transforms/sat-ar.yml +10 -0
- data/resources/shared/transforms/sat-fa.yml +10 -0
- data/resources/shared/transforms/sat_Olck-sat_FONIPA.yml +132 -0
- data/resources/shared/transforms/si-am.yml +10 -0
- data/resources/shared/transforms/si-ar.yml +10 -0
- data/resources/shared/transforms/si-fa.yml +10 -0
- data/resources/shared/transforms/si-si_FONIPA.yml +128 -0
- data/resources/shared/transforms/si-si_Latn.yml +96 -0
- data/resources/shared/transforms/sk-am.yml +10 -0
- data/resources/shared/transforms/sk-ar.yml +10 -0
- data/resources/shared/transforms/sk-fa.yml +10 -0
- data/resources/shared/transforms/sk-sk_FONIPA.yml +18 -2
- data/resources/shared/transforms/sk_FONIPA-ja.yml +2 -0
- data/resources/shared/transforms/sr-sr_Latn-BGN.yml +81 -0
- data/resources/shared/transforms/tk_Cyrl-tk-BGN.yml +122 -0
- data/resources/shared/transforms/tlh-am.yml +10 -0
- data/resources/shared/transforms/tlh-ar.yml +10 -0
- data/resources/shared/transforms/tlh-fa.yml +10 -0
- data/resources/shared/transforms/tlh-tlh_FONIPA.yml +0 -8
- data/resources/shared/transforms/uk-uk_Latn-BGN.yml +115 -0
- data/resources/shared/transforms/und_FONIPA-ar.yml +96 -0
- data/resources/shared/transforms/und_FONIPA-fa.yml +88 -0
- data/resources/shared/transforms/und_FONIPA-und_FONXSAMP.yml +198 -0
- data/resources/shared/transforms/uz_Cyrl-uz-BGN.yml +117 -0
- data/resources/shared/transforms/xh-am.yml +10 -0
- data/resources/shared/transforms/xh-ar.yml +10 -0
- data/resources/shared/transforms/xh-fa.yml +10 -0
- data/resources/shared/transforms/xh-xh_FONIPA.yml +71 -0
- data/resources/shared/transforms/zu-am.yml +10 -0
- data/resources/shared/transforms/zu-ar.yml +10 -0
- data/resources/shared/transforms/zu-fa.yml +10 -0
- data/resources/shared/transforms/zu-zu_FONIPA.yml +58 -0
- data/spec/formatters/numbers/rbnf/rbnf_spec.rb +3 -1
- data/spec/resources/loader_spec.rb +12 -5
- data/spec/spec_helper.rb +1 -1
- metadata +242 -10
- data/History.txt +0 -282
- data/lib/twitter_cldr/resources/collation_tries_dumper.rb +0 -43
- data/lib/twitter_cldr/resources/custom_locales_resources_importer.rb +0 -80
- data/lib/twitter_cldr/resources/download.rb +0 -64
- data/lib/twitter_cldr/resources/icu_based_importer.rb +0 -18
- data/lib/twitter_cldr/resources/properties/properties_importer.rb +0 -59
- data/lib/twitter_cldr/resources/unicode_importer.rb +0 -37
@@ -3,29 +3,22 @@
|
|
3
3
|
# Copyright 2012 Twitter, Inc
|
4
4
|
# http://www.apache.org/licenses/LICENSE-2.0
|
5
5
|
|
6
|
-
require 'twitter_cldr/resources/download'
|
7
6
|
require 'fileutils'
|
8
7
|
|
9
8
|
module TwitterCldr
|
10
9
|
module Resources
|
11
|
-
class SegmentTestsImporter <
|
10
|
+
class SegmentTestsImporter < Importer
|
12
11
|
|
13
|
-
URL_ROOT = "ucd/auxiliary"
|
14
12
|
TEST_FILES = [
|
15
|
-
'WordBreakTest.txt',
|
13
|
+
'ucd/auxiliary/WordBreakTest.txt',
|
14
|
+
'ucd/auxiliary/SentenceBreakTest.txt'
|
16
15
|
]
|
17
16
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
@input_path = input_path
|
22
|
-
@output_path = output_path
|
23
|
-
end
|
24
|
-
|
25
|
-
def import
|
26
|
-
FileUtils.mkdir_p(input_path)
|
27
|
-
FileUtils.mkdir_p(output_path)
|
17
|
+
requirement :unicode, Versions.unicode_version, TEST_FILES
|
18
|
+
output_path 'shared/segments/tests'
|
19
|
+
ruby_engine :mri
|
28
20
|
|
21
|
+
def execute
|
29
22
|
TEST_FILES.each do |test_file|
|
30
23
|
import_test_file(test_file)
|
31
24
|
end
|
@@ -34,27 +27,19 @@ module TwitterCldr
|
|
34
27
|
private
|
35
28
|
|
36
29
|
def import_test_file(test_file)
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
result = parse_standard_file(input_file).map(&:first)
|
42
|
-
File.write(output_file, YAML.dump(result))
|
30
|
+
source_file = source_path_for(test_file)
|
31
|
+
FileUtils.mkdir_p(File.dirname(source_file))
|
32
|
+
result = UnicodeFileParser.parse_standard_file(source_file).map(&:first)
|
33
|
+
File.write(output_path_for(test_file), YAML.dump(result))
|
43
34
|
end
|
44
35
|
|
45
|
-
def
|
46
|
-
|
36
|
+
def source_path_for(test_file)
|
37
|
+
requirements[:unicode].source_path_for(test_file)
|
47
38
|
end
|
48
39
|
|
49
40
|
def output_path_for(test_file)
|
50
|
-
|
51
|
-
File.join(output_path, "#{
|
52
|
-
end
|
53
|
-
|
54
|
-
def download(input_file, url)
|
55
|
-
TwitterCldr::Resources.download_unicode_data_if_necessary(
|
56
|
-
input_file, url
|
57
|
-
)
|
41
|
+
file = underscore(File.basename(test_file).chomp(File.extname(test_file)))
|
42
|
+
File.join(params.fetch(:output_path), "#{file}.yml")
|
58
43
|
end
|
59
44
|
|
60
45
|
def underscore(str)
|
@@ -4,16 +4,19 @@
|
|
4
4
|
# http://www.apache.org/licenses/LICENSE-2.0
|
5
5
|
|
6
6
|
require 'nokogiri'
|
7
|
-
require 'java'
|
8
|
-
|
9
|
-
require 'twitter_cldr/resources/download'
|
10
7
|
|
11
8
|
module TwitterCldr
|
12
9
|
module Resources
|
13
10
|
# This class should be used with JRuby 1.7 in 1.9 mode, ICU4J version >= 49.1,
|
14
11
|
# and CLDR version <= 23 (v24 syntax is not supported yet).
|
15
12
|
#
|
16
|
-
class TailoringImporter <
|
13
|
+
class TailoringImporter < Importer
|
14
|
+
|
15
|
+
requirement :icu, '51.2'
|
16
|
+
requirement :cldr, '23.1'
|
17
|
+
output_path 'collation/tailoring'
|
18
|
+
locales TwitterCldr.supported_locales
|
19
|
+
ruby_engine :jruby
|
17
20
|
|
18
21
|
SUPPORTED_RULES = %w[p s t i pc sc tc ic x]
|
19
22
|
SIMPLE_RULES = %w[p s t i]
|
@@ -40,26 +43,12 @@ module TwitterCldr
|
|
40
43
|
|
41
44
|
class ImportError < RuntimeError; end
|
42
45
|
|
43
|
-
|
44
|
-
#
|
45
|
-
# input_path - path to a directory containing CLDR data
|
46
|
-
# output_path - output directory for imported YAML files
|
47
|
-
# icu4j_path - path to ICU4J jar file
|
48
|
-
#
|
49
|
-
def initialize(input_path, output_path, icu4j_path)
|
50
|
-
require_icu4j(icu4j_path)
|
51
|
-
|
52
|
-
@input_path = input_path
|
53
|
-
@output_path = output_path
|
54
|
-
end
|
46
|
+
private
|
55
47
|
|
56
|
-
def
|
57
|
-
|
58
|
-
locales.each { |locale| import_locale(locale) }
|
48
|
+
def execute
|
49
|
+
params[:locales].each { |locale| import_locale(locale) }
|
59
50
|
end
|
60
51
|
|
61
|
-
private
|
62
|
-
|
63
52
|
def import_locale(locale)
|
64
53
|
print "Importing %8s\t--\t" % locale
|
65
54
|
|
@@ -87,11 +76,13 @@ module TwitterCldr
|
|
87
76
|
end
|
88
77
|
|
89
78
|
def locale_file_path(locale)
|
90
|
-
File.join(
|
79
|
+
File.join(
|
80
|
+
requirements[:cldr].common_path, 'collation', "#{translated_locale(locale)}.xml"
|
81
|
+
)
|
91
82
|
end
|
92
83
|
|
93
84
|
def resource_file_path(locale)
|
94
|
-
File.join(
|
85
|
+
File.join(params[:output_path], "#{locale}.yml")
|
95
86
|
end
|
96
87
|
|
97
88
|
def tailoring_data(locale)
|
@@ -136,12 +127,28 @@ module TwitterCldr
|
|
136
127
|
default_type_node && default_type_node.attr('type')
|
137
128
|
end
|
138
129
|
|
130
|
+
def get_class(name)
|
131
|
+
requirements[:icu].get_class(name)
|
132
|
+
end
|
133
|
+
|
134
|
+
def collator_class
|
135
|
+
@collator_class ||= get_class('com.ibm.icu.text.Collator')
|
136
|
+
end
|
137
|
+
|
138
|
+
def unicode_set_class
|
139
|
+
@unicode_set_class ||= get_class('com.ibm.icu.text.UnicodeSet')
|
140
|
+
end
|
141
|
+
|
142
|
+
def collation_element_iterator_class
|
143
|
+
@collation_element_iterator_class ||= get_class('com.ibm.icu.text.CollationElementIterator')
|
144
|
+
end
|
145
|
+
|
139
146
|
def parse_tailorings(data, locale)
|
140
147
|
rules = data && data.at_xpath('rules')
|
141
148
|
|
142
149
|
return '' unless rules
|
143
150
|
|
144
|
-
collator =
|
151
|
+
collator = collator_class.get_instance(Java::JavaUtil::Locale.new(locale.to_s))
|
145
152
|
|
146
153
|
rules.children.map do |child|
|
147
154
|
validate_tailoring_rule(child)
|
@@ -183,7 +190,7 @@ module TwitterCldr
|
|
183
190
|
|
184
191
|
def parse_suppressed_contractions(data)
|
185
192
|
node = data && data.at_xpath('suppress_contractions')
|
186
|
-
node ?
|
193
|
+
node ? unicode_set_class.to_array(unicode_set_class.new(node.text)).to_a.join : ''
|
187
194
|
end
|
188
195
|
|
189
196
|
def parse_collator_options(data)
|
@@ -209,7 +216,7 @@ module TwitterCldr
|
|
209
216
|
collation_elements = []
|
210
217
|
ce = iter.next
|
211
218
|
|
212
|
-
while ce !=
|
219
|
+
while ce != collation_element_iterator_class::NULLORDER
|
213
220
|
p1 = (ce >> 24) & LAST_BYTE_MASK
|
214
221
|
p2 = (ce >> 16) & LAST_BYTE_MASK
|
215
222
|
|
@@ -3,17 +3,13 @@
|
|
3
3
|
# Copyright 2012 Twitter, Inc
|
4
4
|
# http://www.apache.org/licenses/LICENSE-2.0
|
5
5
|
|
6
|
-
require 'java'
|
7
6
|
require 'fileutils'
|
8
7
|
|
9
8
|
module TwitterCldr
|
10
9
|
module Resources
|
11
10
|
|
12
11
|
# This class should be used with JRuby in 1.9 mode
|
13
|
-
class TransformTestImporter <
|
14
|
-
|
15
|
-
attr_reader :output_file, :icu4j_path
|
16
|
-
|
12
|
+
class TransformTestImporter < Importer
|
17
13
|
# most of these were taken from wikipedia, lol
|
18
14
|
TEXT_SAMPLES = {
|
19
15
|
latin: ["From today's featured article"], # @TODO test capital letters,
|
@@ -23,7 +19,7 @@ module TwitterCldr
|
|
23
19
|
gurmukhi: ["ਅੱਜ ਇਤਿਹਾਸ ਵਿੱਚ"],
|
24
20
|
gujarati: ["આ માસનો ઉમદા લેખ"],
|
25
21
|
bengali: ["নির্বাচিত নিবন্ধ"],
|
26
|
-
hangul: ["김창옥"],
|
22
|
+
hangul: ["김창옥", '모든 사용자는 위키백과에 직접 참여해 확인 가능'],
|
27
23
|
arabic: ["مقالة اليوم المختارة"],
|
28
24
|
han: ["因此只有两场风暴因造成"],
|
29
25
|
hiragana: ["くろねこさま"],
|
@@ -37,7 +33,6 @@ module TwitterCldr
|
|
37
33
|
malayalam: ['ഇടുക്കിയിലെ സൂര്യനെല്ലി സ്വദേശിനിയായ'],
|
38
34
|
tamil: ['சென்னையில் வாழும் உலோ.செந்தமிழ்க்கோதை'],
|
39
35
|
interindic: [' '],
|
40
|
-
hangul: ['모든 사용자는 위키백과에 직접 참여해 확인 가능'],
|
41
36
|
hebrew: ['על שמן של המיילדות במצרים, שפרה ופועה, נקראו'],
|
42
37
|
simplified: ['系统源于墨西哥以西的扰动天气区,并且位于更大规模的天气系统以内'],
|
43
38
|
traditional: ['系統源於墨西哥以西的擾動天氣區,並且位於更大規模的天氣系統以內'],
|
@@ -50,15 +45,12 @@ module TwitterCldr
|
|
50
45
|
|
51
46
|
BGN_SAMPLES = [:armenian, :katakana, :korean]
|
52
47
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
end
|
48
|
+
requirement :icu, Versions.icu_version
|
49
|
+
output_path File.join(TwitterCldr::SPEC_DIR, 'transforms', 'test_data.yml')
|
50
|
+
ruby_engine :jruby
|
57
51
|
|
58
|
-
def
|
59
|
-
|
60
|
-
|
61
|
-
File.open(output_file, 'w+') do |f|
|
52
|
+
def execute
|
53
|
+
File.open(params.fetch(:output_path), 'w+') do |f|
|
62
54
|
f.write(
|
63
55
|
YAML.dump(
|
64
56
|
generate_test_data(transformer.each_transform)
|
@@ -67,6 +59,8 @@ module TwitterCldr
|
|
67
59
|
end
|
68
60
|
end
|
69
61
|
|
62
|
+
private
|
63
|
+
|
70
64
|
def generate_test_data(transforms)
|
71
65
|
transforms.each_with_object([]) do |transform_id_str, ret|
|
72
66
|
forward_id = transform_id.parse(transform_id_str)
|
@@ -104,8 +98,12 @@ module TwitterCldr
|
|
104
98
|
TwitterCldr::Transforms::Transformer.exists?(id)
|
105
99
|
end
|
106
100
|
|
101
|
+
def transliterator_class
|
102
|
+
@transliterator_class ||= requirements[:icu].get_class('com.ibm.icu.text.Transliterator')
|
103
|
+
end
|
104
|
+
|
107
105
|
def generate_transform_samples(id, samples)
|
108
|
-
trans =
|
106
|
+
trans = transliterator_class.getInstance(id.to_s)
|
109
107
|
samples.each_with_object({}) do |sample, ret|
|
110
108
|
ret[sample] = trans.transliterate(sample)
|
111
109
|
end
|
@@ -131,7 +129,7 @@ module TwitterCldr
|
|
131
129
|
def transform_id
|
132
130
|
TwitterCldr::Transforms::TransformId
|
133
131
|
end
|
134
|
-
|
135
132
|
end
|
133
|
+
|
136
134
|
end
|
137
135
|
end
|
@@ -3,41 +3,42 @@
|
|
3
3
|
# Copyright 2012 Twitter, Inc
|
4
4
|
# http://www.apache.org/licenses/LICENSE-2.0
|
5
5
|
|
6
|
-
require 'twitter_cldr/resources/download'
|
7
6
|
require 'fileutils'
|
7
|
+
require 'open-uri'
|
8
8
|
require 'json'
|
9
9
|
|
10
10
|
module TwitterCldr
|
11
11
|
module Resources
|
12
12
|
module Uli
|
13
|
-
class SegmentExceptionsImporter
|
13
|
+
class SegmentExceptionsImporter < Resources::Importer
|
14
14
|
|
15
15
|
URL = "http://unicode.org/uli/trac/export/58/trunk/abbrs/json/%{locale}.json"
|
16
|
+
LOCALES = [:de, :en, :es, :fr, :it, :pt, :ru] # these are the only locales ULI supports at the moment
|
16
17
|
|
17
|
-
|
18
|
+
output_path 'uli/segments'
|
19
|
+
ruby_engine :mri
|
18
20
|
|
19
|
-
def
|
20
|
-
@input_path = input_path
|
21
|
-
@output_path = output_path
|
22
|
-
end
|
23
|
-
|
24
|
-
def import(locales)
|
21
|
+
def execute
|
25
22
|
FileUtils.mkdir_p(input_path)
|
26
23
|
FileUtils.mkdir_p(output_path)
|
27
|
-
|
24
|
+
LOCALES.each { |locale| import_locale(locale) }
|
28
25
|
end
|
29
26
|
|
30
27
|
private
|
31
28
|
|
29
|
+
def output_path
|
30
|
+
params.fetch(:output_path)
|
31
|
+
end
|
32
|
+
|
32
33
|
def import_locale(locale)
|
33
34
|
if input_file = download_resource_for(locale)
|
34
35
|
output_file = File.join(output_path, "#{locale}.yml")
|
35
36
|
exceptions = JSON.parse(File.read(input_file))
|
36
37
|
|
37
|
-
File.open(output_file,
|
38
|
+
File.open(output_file, 'w+') do |f|
|
38
39
|
YAML.dump({
|
39
40
|
locale => {
|
40
|
-
exceptions: exceptions[
|
41
|
+
exceptions: exceptions['data']['abbrs']
|
41
42
|
}
|
42
43
|
}, f)
|
43
44
|
end
|
@@ -45,14 +46,25 @@ module TwitterCldr
|
|
45
46
|
end
|
46
47
|
|
47
48
|
def download_resource_for(locale)
|
48
|
-
input_file =
|
49
|
-
|
50
|
-
|
51
|
-
)
|
49
|
+
input_file = input_file_for(locale)
|
50
|
+
url = URL % { locale: locale }
|
51
|
+
|
52
|
+
unless File.file?(input_file)
|
53
|
+
STDOUT.write("Downloading #{url}... ")
|
54
|
+
open(input_file, 'wb') { |file| file << open(url).read }
|
55
|
+
puts 'done'
|
56
|
+
end
|
57
|
+
|
52
58
|
input_file
|
53
59
|
end
|
54
60
|
|
55
|
-
def
|
61
|
+
def input_path
|
62
|
+
@input_path ||= File.join(
|
63
|
+
TwitterCldr::VENDOR_DIR, 'uli', 'segments'
|
64
|
+
)
|
65
|
+
end
|
66
|
+
|
67
|
+
def input_file_for(locale)
|
56
68
|
File.join(input_path, "#{locale}.json")
|
57
69
|
end
|
58
70
|
|
@@ -3,52 +3,53 @@
|
|
3
3
|
# Copyright 2012 Twitter, Inc
|
4
4
|
# http://www.apache.org/licenses/LICENSE-2.0
|
5
5
|
|
6
|
-
require 'twitter_cldr/resources/download'
|
7
|
-
|
8
6
|
module TwitterCldr
|
9
7
|
module Resources
|
10
8
|
|
11
|
-
class UnicodeDataImporter <
|
9
|
+
class UnicodeDataImporter < Importer
|
12
10
|
|
13
|
-
|
14
|
-
|
15
|
-
|
11
|
+
BLOCKS_FILE = 'ucd/Blocks.txt'
|
12
|
+
UNICODE_DATA_FILE = 'ucd/UnicodeData.txt'
|
13
|
+
CASEFOLDING_DATA_FILE = 'ucd/CaseFolding.txt'
|
16
14
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
# output_path - output directory for imported YAML files
|
21
|
-
#
|
22
|
-
def initialize(input_path, output_path)
|
23
|
-
@input_path = input_path
|
24
|
-
@output_path = output_path
|
25
|
-
end
|
15
|
+
requirement :unicode, Versions.unicode_version, [BLOCKS_FILE, UNICODE_DATA_FILE, CASEFOLDING_DATA_FILE]
|
16
|
+
output_path 'unicode_data'
|
17
|
+
ruby_engine :mri
|
26
18
|
|
27
|
-
def
|
19
|
+
def execute
|
28
20
|
blocks = import_blocks
|
29
21
|
unicode_data = import_unicode_data(blocks)
|
30
22
|
casefolding_data = import_casefolding_data
|
31
23
|
|
32
|
-
|
24
|
+
STDOUT.write('Writing data to disk... ')
|
25
|
+
|
26
|
+
File.open(File.join(output_path, 'blocks.yml'), 'w') do |output|
|
33
27
|
YAML.dump(blocks, output)
|
34
28
|
end
|
35
29
|
|
36
|
-
FileUtils.mkdir_p(File.join(
|
30
|
+
FileUtils.mkdir_p(File.join(output_path, 'blocks'))
|
37
31
|
|
38
32
|
unicode_data.each do |block_name, code_points|
|
39
|
-
File.open(File.join(
|
33
|
+
File.open(File.join(output_path, 'blocks', "#{block_name}.yml"), 'w') do |output|
|
40
34
|
YAML.dump(code_points, output)
|
41
35
|
end
|
42
36
|
end
|
43
37
|
|
44
|
-
File.open(File.join(
|
38
|
+
File.open(File.join(output_path, 'casefolding.yml'), 'w') do |output|
|
45
39
|
YAML.dump(casefolding_data, output)
|
46
40
|
end
|
41
|
+
|
42
|
+
puts 'done'
|
47
43
|
end
|
48
44
|
|
49
45
|
private
|
50
46
|
|
47
|
+
def output_path
|
48
|
+
params.fetch(:output_path)
|
49
|
+
end
|
50
|
+
|
51
51
|
def import_blocks
|
52
|
+
STDOUT.write('Importing blocks... ')
|
52
53
|
blocks = {}
|
53
54
|
|
54
55
|
File.open(blocks_file) do |input|
|
@@ -62,48 +63,54 @@ module TwitterCldr
|
|
62
63
|
end
|
63
64
|
end
|
64
65
|
|
66
|
+
puts 'done'
|
65
67
|
blocks
|
66
68
|
end
|
67
69
|
|
70
|
+
def parse_file(file, &block)
|
71
|
+
UnicodeFileParser.parse_standard_file(file, &block)
|
72
|
+
end
|
73
|
+
|
68
74
|
def import_unicode_data(blocks)
|
75
|
+
STDOUT.write('Importing Unicode data... ')
|
69
76
|
unicode_data = Hash.new do |hash, key|
|
70
77
|
hash[key] = Hash.new { |h, k| h[k] = {} }
|
71
78
|
end
|
72
79
|
|
73
|
-
|
80
|
+
parse_file(unicode_data_file) do |data|
|
74
81
|
data[0] = data[0].hex
|
75
82
|
unicode_data[find_block(blocks, data[0]).first][data[0]] = data
|
76
83
|
end
|
77
84
|
|
85
|
+
puts 'done'
|
78
86
|
unicode_data
|
79
87
|
end
|
80
88
|
|
81
89
|
def import_casefolding_data
|
82
|
-
|
90
|
+
STDOUT.write('Importing casefolding data... ')
|
91
|
+
|
92
|
+
casefolding_data = parse_file(casefold_data_file).map do |data|
|
83
93
|
{
|
84
94
|
source: data[0].hex,
|
85
95
|
target: data[2].split(" ").map(&:hex),
|
86
96
|
status: data[1]
|
87
97
|
}
|
88
98
|
end
|
99
|
+
|
100
|
+
puts 'done'
|
101
|
+
casefolding_data
|
89
102
|
end
|
90
103
|
|
91
104
|
def casefold_data_file
|
92
|
-
|
93
|
-
File.join(@input_path, 'CaseFolding.txt'), CASEFOLDING_DATA_URL
|
94
|
-
)
|
105
|
+
requirements[:unicode].source_path_for(CASEFOLDING_DATA_FILE)
|
95
106
|
end
|
96
107
|
|
97
108
|
def unicode_data_file
|
98
|
-
|
99
|
-
File.join(@input_path, 'UnicodeData.txt'), UNICODE_DATA_URL
|
100
|
-
)
|
109
|
+
requirements[:unicode].source_path_for(UNICODE_DATA_FILE)
|
101
110
|
end
|
102
111
|
|
103
112
|
def blocks_file
|
104
|
-
|
105
|
-
File.join(@input_path, 'Blocks.txt'), BLOCKS_URL
|
106
|
-
)
|
113
|
+
requirements[:unicode].source_path_for(BLOCKS_FILE)
|
107
114
|
end
|
108
115
|
|
109
116
|
def find_block(blocks, code_point)
|