twitter_cldr 5.2.0 → 5.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +0 -4
- data/Rakefile +19 -8
- data/lib/twitter_cldr/normalization.rb +18 -5
- data/lib/twitter_cldr/resources.rb +3 -1
- data/lib/twitter_cldr/resources/import_resolver.rb +11 -3
- data/lib/twitter_cldr/resources/loader.rb +22 -1
- data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -9
- data/lib/twitter_cldr/resources/postal_codes_importer.rb +19 -23
- data/lib/twitter_cldr/resources/segment_dictionaries_importer.rb +75 -0
- data/lib/twitter_cldr/resources/segment_tests_importer.rb +130 -13
- data/lib/twitter_cldr/segmentation.rb +25 -10
- data/lib/twitter_cldr/segmentation/brahmic_break_engine.rb +200 -0
- data/lib/twitter_cldr/segmentation/break_iterator.rb +22 -22
- data/lib/twitter_cldr/segmentation/burmese_break_engine.rb +83 -0
- data/lib/twitter_cldr/segmentation/category_table.rb +5 -1
- data/lib/twitter_cldr/segmentation/cj_break_engine.rb +163 -0
- data/lib/twitter_cldr/segmentation/cursor.rb +1 -1
- data/lib/twitter_cldr/segmentation/dictionary.rb +84 -0
- data/lib/twitter_cldr/segmentation/dictionary_break_engine.rb +34 -0
- data/lib/twitter_cldr/segmentation/khmer_break_engine.rb +83 -0
- data/lib/twitter_cldr/segmentation/korean_break_engine.rb +30 -0
- data/lib/twitter_cldr/segmentation/lao_break_engine.rb +85 -0
- data/lib/twitter_cldr/segmentation/line_iterator.rb +23 -0
- data/lib/twitter_cldr/segmentation/possible_word.rb +74 -0
- data/lib/twitter_cldr/segmentation/possible_word_list.rb +23 -0
- data/lib/twitter_cldr/segmentation/rule_set.rb +3 -12
- data/lib/twitter_cldr/segmentation/segment_iterator.rb +40 -0
- data/lib/twitter_cldr/segmentation/state_machine.rb +2 -8
- data/lib/twitter_cldr/segmentation/thai_break_engine.rb +141 -0
- data/lib/twitter_cldr/segmentation/unhandled_break_engine.rb +21 -0
- data/lib/twitter_cldr/segmentation/word_iterator.rb +170 -0
- data/lib/twitter_cldr/shared.rb +1 -0
- data/lib/twitter_cldr/shared/caser.rb +3 -3
- data/lib/twitter_cldr/shared/unicode_set.rb +77 -0
- data/lib/twitter_cldr/utils/range_set.rb +10 -1
- data/lib/twitter_cldr/version.rb +1 -1
- data/resources/collation/tailoring/km.yml +82 -0
- data/resources/collation/tailoring/lo.yml +4 -0
- data/resources/collation/tailoring/my.yml +940 -0
- data/resources/collation/tries/km.dump +0 -0
- data/resources/collation/tries/lo.dump +0 -0
- data/resources/collation/tries/my.dump +0 -0
- data/resources/locales/km/calendars.yml +373 -0
- data/resources/locales/km/currencies.yml +654 -0
- data/resources/locales/km/day_periods.yml +96 -0
- data/resources/locales/km/fields.yml +495 -0
- data/resources/locales/km/languages.yml +397 -0
- data/resources/locales/km/layout.yml +5 -0
- data/resources/locales/km/lists.yml +37 -0
- data/resources/locales/km/numbers.yml +402 -0
- data/resources/locales/km/plural_rules.yml +6 -0
- data/resources/locales/km/plurals.yml +12 -0
- data/resources/locales/km/rbnf.yml +131 -0
- data/resources/locales/km/territories.yml +267 -0
- data/resources/locales/km/timezones.yml +1471 -0
- data/resources/locales/km/units.yml +721 -0
- data/resources/locales/lo/calendars.yml +368 -0
- data/resources/locales/lo/currencies.yml +918 -0
- data/resources/locales/lo/day_periods.yml +96 -0
- data/resources/locales/lo/fields.yml +437 -0
- data/resources/locales/lo/languages.yml +529 -0
- data/resources/locales/lo/layout.yml +5 -0
- data/resources/locales/lo/lists.yml +42 -0
- data/resources/locales/lo/numbers.yml +476 -0
- data/resources/locales/lo/plural_rules.yml +7 -0
- data/resources/locales/lo/plurals.yml +14 -0
- data/resources/locales/lo/rbnf.yml +119 -0
- data/resources/locales/lo/territories.yml +265 -0
- data/resources/locales/lo/timezones.yml +1513 -0
- data/resources/locales/lo/units.yml +750 -0
- data/resources/locales/my/calendars.yml +374 -0
- data/resources/locales/my/currencies.yml +697 -0
- data/resources/locales/my/day_periods.yml +96 -0
- data/resources/locales/my/fields.yml +459 -0
- data/resources/locales/my/languages.yml +420 -0
- data/resources/locales/my/layout.yml +5 -0
- data/resources/locales/my/lists.yml +43 -0
- data/resources/locales/my/numbers.yml +417 -0
- data/resources/locales/my/plural_rules.yml +6 -0
- data/resources/locales/my/plurals.yml +12 -0
- data/resources/locales/my/rbnf.yml +145 -0
- data/resources/locales/my/territories.yml +265 -0
- data/resources/locales/my/timezones.yml +1479 -0
- data/resources/locales/my/units.yml +759 -0
- data/resources/locales/th/plurals.yml +1 -1
- data/resources/shared/segments/dictionaries/burmesedict.dump +0 -0
- data/resources/shared/segments/dictionaries/cjdict.dump +0 -0
- data/resources/shared/segments/dictionaries/khmerdict.dump +0 -0
- data/resources/shared/segments/dictionaries/laodict.dump +0 -0
- data/resources/shared/segments/dictionaries/thaidict.dump +0 -0
- data/resources/shared/segments/tests/dictionary_tests/combined.yml +1253 -0
- data/resources/shared/segments/tests/dictionary_tests/km.yml +204 -0
- data/resources/shared/segments/tests/dictionary_tests/ko.yml +171 -0
- data/resources/shared/segments/tests/dictionary_tests/lo.yml +236 -0
- data/resources/shared/segments/tests/dictionary_tests/my.yml +249 -0
- data/resources/shared/segments/tests/dictionary_tests/th.yml +201 -0
- data/resources/shared/segments/tests/dictionary_tests/zh.yml +206 -0
- data/resources/shared/segments/tests/line_break_test.yml +68 -68
- data/resources/shared/segments/tests/sentence_break_test.yml +52 -52
- data/resources/supported_locales.yml +3 -0
- data/spec/formatters/numbers/rbnf/locales/km/rbnf_test.yml +706 -0
- data/spec/formatters/numbers/rbnf/locales/lo/rbnf_test.yml +706 -0
- data/spec/formatters/numbers/rbnf/locales/my/rbnf_test.yml +706 -0
- data/spec/segmentation/dictionary_break_spec.rb +42 -0
- data/spec/segmentation/rule_set_spec.rb +3 -1
- data/spec/timezones/tests/km.yml +12475 -0
- data/spec/timezones/tests/lo.yml +12475 -0
- data/spec/timezones/tests/my.yml +12475 -0
- metadata +87 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2783ec225d4f260deb8038237125dbc97b78840e951cbff630f3da502e31f41d
|
4
|
+
data.tar.gz: 751cc8931ee11db35a533584a1b9f3d8a946e9505641d33caad6a5a5dbc6e866
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7cdc1ec2718ac86b0645813fc0bdfe13b1c2bee075ca90e13231a3d566d915e7ef9bcdf6d69f8d0a7cbcdaf6868eae5c27bfb4b32b52efbd7dcf1adabc20c39d
|
7
|
+
data.tar.gz: 20b7e24ec990cc00fb77d60d62c75247108748629c5478a0a315d96e04a76c443f8abc2a4df21b7c4093ad7cc46ee81c4276ca15f97c8698073b7ac9bca98c65
|
data/Gemfile
CHANGED
data/Rakefile
CHANGED
@@ -57,16 +57,22 @@ task :update do
|
|
57
57
|
end
|
58
58
|
|
59
59
|
task :add_locale, :locale do |_, args|
|
60
|
+
locales = [args[:locale]] + args.extras
|
61
|
+
|
60
62
|
File.write(
|
61
63
|
TwitterCldr::SUPPORTED_LOCALES_FILE,
|
62
64
|
YAML.dump(
|
63
|
-
(TwitterCldr::SUPPORTED_LOCALES +
|
65
|
+
(TwitterCldr::SUPPORTED_LOCALES + locales).map(&:to_sym).uniq.sort
|
64
66
|
)
|
65
67
|
)
|
66
68
|
|
67
69
|
klasses = TwitterCldr::Resources.locale_based_importer_classes_for_ruby_engine
|
68
|
-
instances = klasses.map { |klass| klass.new(locales:
|
69
|
-
TwitterCldr::Resources::ImportResolver.new(
|
70
|
+
instances = klasses.map { |klass| klass.new(locales: locales) }
|
71
|
+
resolver = TwitterCldr::Resources::ImportResolver.new(
|
72
|
+
instances, allow_missing_dependencies: true
|
73
|
+
)
|
74
|
+
|
75
|
+
resolver.import
|
70
76
|
end
|
71
77
|
|
72
78
|
# add_locale and update_locale do the same thing
|
@@ -160,6 +166,16 @@ namespace :update do
|
|
160
166
|
TwitterCldr::Resources::SegmentRulesImporter.new.import
|
161
167
|
end
|
162
168
|
|
169
|
+
desc 'Import segmentation dictionaries'
|
170
|
+
task :segment_dictionaries do
|
171
|
+
TwitterCldr::Resources::SegmentDictionariesImporter.new.import
|
172
|
+
end
|
173
|
+
|
174
|
+
desc 'Import segment tests'
|
175
|
+
task :segment_tests do
|
176
|
+
TwitterCldr::Resources::SegmentTestsImporter.new.import
|
177
|
+
end
|
178
|
+
|
163
179
|
desc 'Import (generate) bidi tests (should be executed using JRuby 1.7 in 1.9 mode)'
|
164
180
|
task :bidi_tests do
|
165
181
|
TwitterCldr::Resources::BidiTestImporter.new.import
|
@@ -200,11 +216,6 @@ namespace :update do
|
|
200
216
|
TwitterCldr::Resources::TransformTestsImporter.new.import
|
201
217
|
end
|
202
218
|
|
203
|
-
desc 'Import segment tests'
|
204
|
-
task :segment_tests do
|
205
|
-
TwitterCldr::Resources::SegmentTestsImporter.new.import
|
206
|
-
end
|
207
|
-
|
208
219
|
desc 'Import hyphenation dictionaries'
|
209
220
|
task :hyphenation_dictionaries do
|
210
221
|
TwitterCldr::Resources::HyphenationImporter.new.import
|
@@ -14,12 +14,25 @@ module TwitterCldr
|
|
14
14
|
class << self
|
15
15
|
|
16
16
|
def normalize(string, options = {})
|
17
|
-
form = options
|
17
|
+
validate_form(form = extract_form_from(options))
|
18
|
+
Eprun.normalize(string, form)
|
19
|
+
end
|
20
|
+
|
21
|
+
def normalized?(string, options = {})
|
22
|
+
validate_form(form = extract_form_from(options))
|
23
|
+
Eprun.normalized?(string, form)
|
24
|
+
end
|
25
|
+
|
26
|
+
private
|
27
|
+
|
28
|
+
def extract_form_from(options)
|
29
|
+
options.fetch(:using, DEFAULT_NORMALIZER).to_s.downcase.to_sym
|
30
|
+
end
|
18
31
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
32
|
+
def validate_form(form)
|
33
|
+
unless VALID_NORMALIZERS.include?(form)
|
34
|
+
raise ArgumentError.new("#{form.inspect} is not a valid normalizer "\
|
35
|
+
"(valid normalizers are #{VALID_NORMALIZERS.join(', ')})")
|
23
36
|
end
|
24
37
|
end
|
25
38
|
|
@@ -22,12 +22,13 @@ module TwitterCldr
|
|
22
22
|
autoload :NumberFormatsImporter, 'twitter_cldr/resources/number_formats_importer'
|
23
23
|
autoload :PostalCodesImporter, 'twitter_cldr/resources/postal_codes_importer'
|
24
24
|
autoload :Properties, 'twitter_cldr/resources/properties'
|
25
|
+
autoload :SegmentDictionariesImporter, 'twitter_cldr/resources/segment_dictionaries_importer'
|
25
26
|
autoload :SegmentRulesImporter, 'twitter_cldr/resources/segment_rules_importer'
|
27
|
+
autoload :SegmentTestsImporter, 'twitter_cldr/resources/segment_tests_importer'
|
26
28
|
autoload :RbnfTestImporter, 'twitter_cldr/resources/rbnf_test_importer'
|
27
29
|
autoload :ReadmeRenderer, 'twitter_cldr/resources/readme_renderer'
|
28
30
|
autoload :RegexpAstGenerator, 'twitter_cldr/resources/regexp_ast_generator'
|
29
31
|
autoload :Requirements, 'twitter_cldr/resources/requirements'
|
30
|
-
autoload :SegmentTestsImporter, 'twitter_cldr/resources/segment_tests_importer'
|
31
32
|
autoload :TailoringImporter, 'twitter_cldr/resources/tailoring_importer'
|
32
33
|
autoload :TerritoriesImporter, 'twitter_cldr/resources/territories_importer'
|
33
34
|
autoload :TimezonesImporter, 'twitter_cldr/resources/timezones_importer'
|
@@ -60,6 +61,7 @@ module TwitterCldr
|
|
60
61
|
NumberFormatsImporter,
|
61
62
|
PostalCodesImporter,
|
62
63
|
RbnfTestImporter,
|
64
|
+
SegmentDictionariesImporter,
|
63
65
|
SegmentRulesImporter,
|
64
66
|
SegmentTestsImporter,
|
65
67
|
TailoringImporter,
|
@@ -5,10 +5,11 @@ module TwitterCldr
|
|
5
5
|
class ImportResolver
|
6
6
|
include TSort
|
7
7
|
|
8
|
-
attr_reader :importers
|
8
|
+
attr_reader :importers, :options
|
9
9
|
|
10
|
-
def initialize(importers = Resources.importer_classes_for_ruby_engine)
|
10
|
+
def initialize(importers = Resources.importer_classes_for_ruby_engine, options = {})
|
11
11
|
@importers = importers
|
12
|
+
@options = options
|
12
13
|
end
|
13
14
|
|
14
15
|
def import
|
@@ -28,7 +29,12 @@ module TwitterCldr
|
|
28
29
|
|
29
30
|
def tsort_each_child(instance, &block)
|
30
31
|
deps_for(instance).map do |dep_class|
|
31
|
-
|
32
|
+
dep = instances.find { |ins| ins.class == dep_class }
|
33
|
+
yield dep if dep
|
34
|
+
|
35
|
+
unless options[:allow_missing_dependencies]
|
36
|
+
raise "Could not find dependency #{dep_class.name}"
|
37
|
+
end
|
32
38
|
end
|
33
39
|
end
|
34
40
|
|
@@ -39,6 +45,8 @@ module TwitterCldr
|
|
39
45
|
end
|
40
46
|
|
41
47
|
def check_unmet_instance_deps(instance)
|
48
|
+
return if options[:allow_missing_dependencies]
|
49
|
+
|
42
50
|
unmet_deps = unmet_deps_for(instance)
|
43
51
|
|
44
52
|
unless unmet_deps.empty?
|
@@ -10,6 +10,8 @@ module TwitterCldr
|
|
10
10
|
|
11
11
|
class Loader
|
12
12
|
|
13
|
+
VALID_EXTS = %w(.yml .dump).freeze
|
14
|
+
|
13
15
|
def get_resource(*path)
|
14
16
|
resources_cache[resource_file_path(path)]
|
15
17
|
end
|
@@ -75,7 +77,7 @@ module TwitterCldr
|
|
75
77
|
|
76
78
|
def resource_file_path(path)
|
77
79
|
file = File.join(*path.map(&:to_s))
|
78
|
-
file << '.yml' unless
|
80
|
+
file << '.yml' unless VALID_EXTS.include?(File.extname(file))
|
79
81
|
file
|
80
82
|
end
|
81
83
|
|
@@ -92,6 +94,17 @@ module TwitterCldr
|
|
92
94
|
end
|
93
95
|
|
94
96
|
def load_resource(path, merge_custom = true)
|
97
|
+
case File.extname(path)
|
98
|
+
when '.yml'
|
99
|
+
load_yaml_resource(path, merge_custom)
|
100
|
+
when '.dump'
|
101
|
+
load_marshalled_resource(path, merge_custom)
|
102
|
+
else
|
103
|
+
load_raw_resource(path, merge_custom)
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
def load_yaml_resource(path, merge_custom = true)
|
95
108
|
base = YAML.load(read_resource_file(path))
|
96
109
|
custom_path = File.join("custom", path)
|
97
110
|
|
@@ -102,6 +115,14 @@ module TwitterCldr
|
|
102
115
|
base
|
103
116
|
end
|
104
117
|
|
118
|
+
def load_marshalled_resource(path, _merge_custom = :unused)
|
119
|
+
Marshal.load(read_resource_file(path))
|
120
|
+
end
|
121
|
+
|
122
|
+
def load_raw_resource(path, _merge_custom = :unused)
|
123
|
+
read_resource_file(path)
|
124
|
+
end
|
125
|
+
|
105
126
|
def custom_resource_exists?(custom_path)
|
106
127
|
File.exist?(
|
107
128
|
File.join(TwitterCldr::RESOURCES_DIR, custom_path)
|
@@ -56,13 +56,6 @@ module TwitterCldr
|
|
56
56
|
params.fetch(:output_path)
|
57
57
|
end
|
58
58
|
|
59
|
-
def move_segments_root_file
|
60
|
-
old_file_path = File.join(output_path, *%w(shared segments_root.yml))
|
61
|
-
new_file_path = File.join(output_path, *%w(shared segments segments_root.yml))
|
62
|
-
FileUtils.mkdir_p(File.dirname(new_file_path))
|
63
|
-
FileUtils.move(old_file_path, new_file_path)
|
64
|
-
end
|
65
|
-
|
66
59
|
def import_components
|
67
60
|
locales = Set.new
|
68
61
|
|
@@ -100,8 +93,6 @@ module TwitterCldr
|
|
100
93
|
Cldr::Export.export(export_args) do |component, locale, path|
|
101
94
|
deep_symbolize(path)
|
102
95
|
end
|
103
|
-
|
104
|
-
move_segments_root_file
|
105
96
|
end
|
106
97
|
|
107
98
|
def components_for(locale)
|
@@ -3,8 +3,8 @@
|
|
3
3
|
# Copyright 2012 Twitter, Inc
|
4
4
|
# http://www.apache.org/licenses/LICENSE-2.0
|
5
5
|
|
6
|
-
require 'rest-client'
|
7
6
|
require 'json'
|
7
|
+
require 'open-uri'
|
8
8
|
require 'set'
|
9
9
|
require 'yaml'
|
10
10
|
|
@@ -21,38 +21,36 @@ module TwitterCldr
|
|
21
21
|
private
|
22
22
|
|
23
23
|
def execute
|
24
|
-
|
25
|
-
|
26
|
-
|
24
|
+
data = YAML.dump(fetch_data)
|
25
|
+
File.write(File.join(output_path, 'postal_codes.yml'), data)
|
26
|
+
puts
|
27
27
|
end
|
28
28
|
|
29
29
|
def output_path
|
30
30
|
params.fetch(:output_path)
|
31
31
|
end
|
32
32
|
|
33
|
-
def
|
33
|
+
def fetch_data
|
34
34
|
territories = Set.new
|
35
35
|
|
36
36
|
each_territory.each_with_object({}) do |territory, ret|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
37
|
+
if regex = get_regex_for(territory)
|
38
|
+
ret[territory] = {
|
39
|
+
regex: Regexp.compile(regex),
|
40
|
+
ast: TwitterCldr::Utils::RegexpAst.dump(
|
41
|
+
RegexpAstGenerator.generate(regex)
|
42
|
+
)
|
43
|
+
}
|
44
|
+
end
|
45
45
|
|
46
46
|
territories.add(territory)
|
47
47
|
STDOUT.write("\rImported postal codes for #{territory}, #{territories.size} of #{territory_count} total")
|
48
48
|
end
|
49
|
-
|
50
|
-
puts
|
51
49
|
end
|
52
50
|
|
53
51
|
def get_regex_for(territory)
|
54
|
-
result =
|
55
|
-
data = JSON.parse(result
|
52
|
+
result = URI.open("#{BASE_URL}#{territory.to_s.upcase}").read
|
53
|
+
data = JSON.parse(result)
|
56
54
|
data['zip']
|
57
55
|
end
|
58
56
|
|
@@ -61,12 +59,10 @@ module TwitterCldr
|
|
61
59
|
end
|
62
60
|
|
63
61
|
def each_territory
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
else
|
69
|
-
to_enum(__method__)
|
62
|
+
return to_enum(__method__) unless block_given?
|
63
|
+
|
64
|
+
TwitterCldr::Shared::Territories.all.each_pair do |territory, _|
|
65
|
+
yield territory
|
70
66
|
end
|
71
67
|
end
|
72
68
|
|
@@ -0,0 +1,75 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright 2012 Twitter, Inc
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
5
|
+
|
6
|
+
require 'fileutils'
|
7
|
+
require 'open-uri'
|
8
|
+
|
9
|
+
module TwitterCldr
|
10
|
+
module Resources
|
11
|
+
class SegmentDictionariesImporter < Importer
|
12
|
+
|
13
|
+
URL_TEMPLATE = 'https://raw.githubusercontent.com/unicode-org/icu/%{icu_version}/%{path}'
|
14
|
+
|
15
|
+
DICTIONARY_FILES = [
|
16
|
+
'icu4c/source/data/brkitr/dictionaries/burmesedict.txt',
|
17
|
+
'icu4c/source/data/brkitr/dictionaries/cjdict.txt',
|
18
|
+
'icu4c/source/data/brkitr/dictionaries/khmerdict.txt',
|
19
|
+
'icu4c/source/data/brkitr/dictionaries/laodict.txt',
|
20
|
+
'icu4c/source/data/brkitr/dictionaries/thaidict.txt'
|
21
|
+
]
|
22
|
+
|
23
|
+
output_path File.join(*%w(shared segments dictionaries))
|
24
|
+
ruby_engine :mri
|
25
|
+
|
26
|
+
def execute
|
27
|
+
FileUtils.mkdir_p(output_path)
|
28
|
+
|
29
|
+
DICTIONARY_FILES.each do |test_file|
|
30
|
+
import_dictionary_file(test_file)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
private
|
35
|
+
|
36
|
+
def import_dictionary_file(dictionary_file)
|
37
|
+
source_url = url_for(dictionary_file)
|
38
|
+
source = open(source_url).read
|
39
|
+
lines = source.split("\n")
|
40
|
+
trie = TwitterCldr::Utils::Trie.new
|
41
|
+
space_regexp = TwitterCldr::Shared::UnicodeRegex.compile('\A[[:Z:][:C:]]+').to_regexp
|
42
|
+
|
43
|
+
lines.each do |line|
|
44
|
+
line.sub!(space_regexp, '')
|
45
|
+
next if line.start_with?('#')
|
46
|
+
|
47
|
+
characters, frequency = line.split("\t")
|
48
|
+
frequency = frequency ? frequency.to_i : 0
|
49
|
+
|
50
|
+
trie.add(characters.unpack('U*'), frequency)
|
51
|
+
end
|
52
|
+
|
53
|
+
output_path = output_path_for(dictionary_file)
|
54
|
+
File.write(output_path, Marshal.dump(trie))
|
55
|
+
end
|
56
|
+
|
57
|
+
def url_for(dictionary_file)
|
58
|
+
URL_TEMPLATE % {
|
59
|
+
icu_version: "release-#{Versions.icu_version.gsub('.', '-')}",
|
60
|
+
path: dictionary_file
|
61
|
+
}
|
62
|
+
end
|
63
|
+
|
64
|
+
def output_path_for(dictionary_file)
|
65
|
+
file = File.basename(dictionary_file).chomp(File.extname(dictionary_file))
|
66
|
+
File.join(output_path, "#{file}.dump")
|
67
|
+
end
|
68
|
+
|
69
|
+
def output_path
|
70
|
+
params.fetch(:output_path)
|
71
|
+
end
|
72
|
+
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
@@ -9,47 +9,164 @@ module TwitterCldr
|
|
9
9
|
module Resources
|
10
10
|
class SegmentTestsImporter < Importer
|
11
11
|
|
12
|
-
|
12
|
+
CONFORMANCE_FILES = [
|
13
13
|
'ucd/auxiliary/WordBreakTest.txt',
|
14
14
|
'ucd/auxiliary/SentenceBreakTest.txt',
|
15
15
|
'ucd/auxiliary/GraphemeBreakTest.txt',
|
16
16
|
'ucd/auxiliary/LineBreakTest.txt'
|
17
17
|
]
|
18
18
|
|
19
|
-
|
19
|
+
DICTIONARY_BREAK_SAMPLES = {
|
20
|
+
# Chinese
|
21
|
+
zh: '無畏號航空母艦是一艘隸屬於美國海軍的航空母艦,為艾塞克斯級航空母艦的三號艦。'\
|
22
|
+
'無畏號於1941年開始建造,1943年下水服役,開始參與太平洋戰爭。戰後無畏號退役封存,'\
|
23
|
+
'在韓戰後開始進行SCB-27C改建,又在期間重編為攻擊航母,於1954年在大西洋艦隊重新服役。'\
|
24
|
+
'稍後無畏號又進行SCB-125現代化改建,增設斜角飛行甲板。1962年無畏號重編為反潛航母,'\
|
25
|
+
'舷號改為CVS-11,繼續留在大西洋及地中海執勤。稍後無畏號參與美國的太空計畫,'\
|
26
|
+
'分別擔任水星-宇宙神7號及雙子座3號的救援船。1966年至1969年,無畏號曾三次前往西太平洋,'\
|
27
|
+
'參與越戰。無畏號在1974年退役,並一度預備出售拆解;但在民間組織努力下,'\
|
28
|
+
'海軍在1981年將無畏號捐贈到紐約作博物館艦。1986年,無畏號獲評為美國國家歷史地標。',
|
29
|
+
|
30
|
+
ko: '파일은 이용자가 공용 또는 위키백과 한국어판에 업로드하여 라이선스에 따라 사용 가능한 형태로 제공됩니다. '\
|
31
|
+
'업로드된 파일은 간단한 조작으로 페이지에 삽입할 수 있습니다. 업로드는 신규 이용자를 제외한 등록 이용자라면 '\
|
32
|
+
'가능합니다. 파일을 업로드하기 전에 다음 문단의 업로드를 할 수 없는 파일을 반드시 읽어 보시기 바랍니다. '\
|
33
|
+
'공용 이용 방법 및 업로드에 대해서는 Commons:초보자 길라잡이/업로드를 읽어 보시기 바랍니다. 업로드하는 '\
|
34
|
+
'페이지는 위키백과:파일 올리기를 참조하십시오. 파일의 라이선스가 삽입되는 위키백과의 문서와는 별도로 '\
|
35
|
+
'개별적으로 설정해야 합니다. 파일을 업로드할 때 적절한 라이선스를 선택하고 반드시 표시하십시오.',
|
36
|
+
|
37
|
+
# Thai
|
38
|
+
th: 'ธงไชย แมคอินไตย์ ชื่อเล่น เบิร์ด (เกิด 8 ธันวาคม พ.ศ. 2501) เป็นนักร้อง นักแสดงชาวไทย '\
|
39
|
+
'ได้รับขนานนามว่าเป็น "ซูเปอร์สตาร์เมืองไทย" โดยคนไทยรู้จักกันดี เรียกกันว่า : พี่เบิร์ด '\
|
40
|
+
'แรกเข้าวงการบันเทิงเป็นนักแสดงสมทบ ต่อมาได้รับบทพระเอก โดยภาพยนตร์ที่สร้างชื่อเสียงให้กับเขาที่สุดเรื่อง '\
|
41
|
+
'ด้วยรักคือรัก ส่วนละครที่สร้างชื่อเสียงที่สุดของเขาคือบท "โกโบริ" ในละครคู่กรรม '\
|
42
|
+
'ด้านวงการเพลงซึ่งเป็นอาชีพหลักเขาเริ่มต้นจากการประกวดร้องเพลงของสยามกลการ '\
|
43
|
+
'ต่อมาเป็นนักร้องในสังกัดบริษัท จีเอ็มเอ็ม แกรมมี่ จำกัด (มหาชน) ซึ่งประสบความสำเร็จสูงสุดของประเทศไทย'\
|
44
|
+
'มียอดจำหน่ายอยู่ในระดับแนวหน้าของทวีปเอเชียยอดรวมกว่า 25 ล้านชุด',
|
45
|
+
|
46
|
+
# Khmer
|
47
|
+
km: 'វីគីភីឌា (អង់គ្លេស ៖ Wikipedia) ជាសព្វវចនាធិប្បាយសេរីច្រើនភាសានៅលើអ៊ីនធឺណិត '\
|
48
|
+
'ដែលមនុស្សគ្រប់គ្នាអាចអាននិងធ្វើឱ្យមាតិកាទាន់សម័យបន្ថែមទៀត '\
|
49
|
+
'ធ្វើឱ្យវីគីភីឌាសព្វវចនាធិប្បាយបានក្លាយទៅជាការកែប្រែ '\
|
50
|
+
'ការប្រមូលនិងការអភិរក្សរាប់រយរាប់ពាន់នាក់នៃអ្នកស្ម័គ្រចិត្តនៅជុំវិញពិភពលោក '\
|
51
|
+
'តាមរយៈកម្មវិធីដែលគេហៅថាមេឌាវិគី ។ វីគីភីឌាចាប់ផ្តើមនៅថ្ងៃទី ១៥ មករា ឆ្នាំ ២០០១ '\
|
52
|
+
'ដោយចាប់ផ្តើមគម្រោងពីឈ្មោះសព្វវចនាធិប្បាយណូ៉ភីឌាដែលសរសេរដោយហ្ស៊ីម្ម៊ី '\
|
53
|
+
'វេល្ស និងឡែរ្រី សែងក័រ ។ នៅបច្ចុប្បន្ននេះ វីគីភីឌាមានទាំង់អស់ ២៩៣ ភាសា[៤] ដោយវីគីភីឌាភាសាខ្មែរមាន '\
|
54
|
+
'៧៨៩៨ អត្ថបទ ។ មានវីគីភីឌាច្រើនជាង ៥០ ភាសាដែលមានអត្ថបទច្រើនជាង ១០០.០០០ អត្ថបទ ។ '\
|
55
|
+
'វីគីភីឌាភាសាអាល្លឺម៉ងត្រូវបានគេចែកចាយនៅក្នុងទ្រង់ទ្រាយឌីវីឌី-រ៉ូម ។',
|
56
|
+
|
57
|
+
# Lao
|
58
|
+
lo: 'ວິກິພີເດຍ (ອັງກິດ: Wikipedia) ເປັນສາລະນຸກົມເນື້ອຫາເສລີຫຼາຍພາສາໃນເວັບໄຊ້ '\
|
59
|
+
'ເຊິ່ງໄດ້ຮັບການສະໜັບສະໜຸນຈາກມູນລະນິທິວິກິພີເດຍ ອົງກອນບໍ່ສະແຫວງຫາຜົນກຳໄລ ເນື້ອຫາກວ່າ 35 ລ້ານບົດຄວາມ '\
|
60
|
+
'(ສະເພາະວິກິພີເດຍພາສາອັງກິດມີເນື້ອຫາກວ່າ 4.9 ລ້ານບົດຄວາມ) ເກີດຂຶ້ນຈາກການຮ່ວມຂຽນຂອງອາສາສະໝັກທົ່ວໂລກ '\
|
61
|
+
'ທຸກຄົນທີ່ສາມາດເຂົ້າເຖິງວິກິພີເດຍສາມາດຮ່ວມແກ້ໄຂເກືອບທຸກບົດຄວາມໄດ້ຢ່າງເສລີ ໂດຍມີຜູ້ຂຽນປະມານ 100,000ຄົນ '\
|
62
|
+
'ຈົນເຖິງເດືອນເມສາ ຄ.ສ. 2013 ວິກິພີເດຍມີ 286 ຮຸ່ນພາສາ ແລະ '\
|
63
|
+
'ໄດ້ກາຍມາເປັນງານອ້າງອິງທົ່ວໄປທີ່ໃກຍ່ທີ່ສຸດແລະໄດ້ຮັບຄວາມນິຍົມຫຼາຍທີ່ສຸດຢູ່ອິນເຕີເນັດ ຈົນຖືກຈັດເປັນເວັບໄຊ້ ອັນດັບທີ 6 '\
|
64
|
+
'ທີ່ມີຜູ້ເຂົ້າເບິ່ງຫຼາຍທີ່ສຸດໃນໂລກ ຕາມການຈັດອັນດັບຂອງອາເລັກຊ້າ ດ້ວຍຈຳນວນຜູ້ອ່ານກວ່າ 365 ລ້ານຄົນ '\
|
65
|
+
'ມີການປະເມີນວ່າວິກິພີເດຍມີການຄົ້ນຫາຂໍ້ມູນໃນວິກິພີເດຍກວ່າ 2,700 ລ້ານເທື່ອຕໍ່ເດືອນໃນສະຫະລັດ ອາເມຣິກາ',
|
66
|
+
|
67
|
+
# Burmese
|
68
|
+
my: 'ကိန်းဆိုသည်မှာ ရေတွက်ရန်နှင့် တိုင်းတာရန် အတွက် အသုံးပြုသော သင်္ချာဆိုင်ရာ အရာဝတ္ထုတစ်ခု '\
|
69
|
+
'ဖြစ်သည်။ သင်္ချာပညာတွင် ကိန်းဂဏန်းများ၏ အဓိပ္ပာယ်ဖွင့်ဆိုချက်ကို တဖြည်းဖြည်း ချဲ့ကားလာခဲ့သဖြင့် '\
|
70
|
+
'နှစ်ပေါင်းများစွာ ကြာသောအခါတွင် သုည၊ အနှုတ်ကိန်းများ (negative numbers)၊ ရာရှင်နယ်ကိန်း '\
|
71
|
+
'(rational number) ခေါ် အပိုင်းကိန်းများ၊ အီရာရှင်နယ်ကိန်း (irrational number) ခေါ် '\
|
72
|
+
'အပိုင်းကိန်းမဟုတ်သောကိန်းများ နှင့် ကွန်ပလက်စ်ကိန်း (complex number) ခေါ် ကိန်းရှုပ်များ စသည်ဖြင့် '\
|
73
|
+
'ပါဝင်လာကြသည်။ သင်္ချာဆိုင်ရာ တွက်ချက်မှုများ (mathematical operations) တွင် ဂဏန်းတစ်ခု '\
|
74
|
+
'သို့မဟုတ် တစ်ခုထက်ပိုသော ဂဏန်းများကို အဝင်ကိန်းအဖြစ် လက်ခံကြပြီး ဂဏန်းတစ်ခုကို အထွက်ကိန်း '\
|
75
|
+
'အဖြစ် ပြန်ထုတ်ပေးသည်။ ယူနရီ တွက်ချက်မှု (unary operation) ခေါ် တစ်လုံးသွင်းတွက်ချက်မှုတွင် '\
|
76
|
+
'ဂဏန်းတစ်ခုကို အဝင်ကိန်း အဖြစ် လက်ခံပြီး ဂဏန်းတစ်ခုကို အထွက်ကိန်း အဖြစ် ထုတ်ပေးသည်။ '
|
77
|
+
}.freeze
|
78
|
+
|
79
|
+
requirement :unicode, Versions.unicode_version, CONFORMANCE_FILES
|
80
|
+
requirement :icu, Versions.icu_version
|
20
81
|
output_path 'shared/segments/tests'
|
21
|
-
ruby_engine :
|
82
|
+
ruby_engine :jruby
|
22
83
|
|
23
84
|
def execute
|
24
|
-
|
25
|
-
|
26
|
-
|
85
|
+
import_conformance_files
|
86
|
+
import_dictionary_break_tests
|
87
|
+
import_combined_dictionary_break_test
|
27
88
|
end
|
28
89
|
|
29
90
|
private
|
30
91
|
|
31
|
-
def
|
32
|
-
|
92
|
+
def import_conformance_files
|
93
|
+
CONFORMANCE_FILES.each do |test_file|
|
94
|
+
import_conformance_file(test_file)
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
def import_conformance_file(conformance_file)
|
99
|
+
source_file = conformance_source_path_for(conformance_file)
|
33
100
|
FileUtils.mkdir_p(File.dirname(source_file))
|
34
101
|
result = UnicodeFileParser.parse_standard_file(source_file).map(&:first)
|
35
|
-
output_file =
|
102
|
+
output_file = conformance_output_path_for(conformance_file)
|
36
103
|
FileUtils.mkdir_p(File.dirname(output_file))
|
37
104
|
File.write(output_file, YAML.dump(result))
|
38
105
|
end
|
39
106
|
|
40
|
-
def
|
41
|
-
|
107
|
+
def import_dictionary_break_tests
|
108
|
+
DICTIONARY_BREAK_SAMPLES.each do |locale, text_sample|
|
109
|
+
data = create_dictionary_break_test(locale.to_s, text_sample)
|
110
|
+
dump_dictionary_break_test(locale, data)
|
111
|
+
end
|
42
112
|
end
|
43
113
|
|
44
|
-
def
|
45
|
-
|
114
|
+
def import_combined_dictionary_break_test
|
115
|
+
text_sample = DICTIONARY_BREAK_SAMPLES.values.join(' ')
|
116
|
+
data = create_dictionary_break_test('en', text_sample)
|
117
|
+
dump_dictionary_break_test('combined', data)
|
118
|
+
end
|
119
|
+
|
120
|
+
def create_dictionary_break_test(locale, text_sample)
|
121
|
+
done = break_iterator.const_get(:DONE)
|
122
|
+
iter = break_iterator.get_word_instance(ulocale_class.new(locale))
|
123
|
+
iter.set_text(text_sample)
|
124
|
+
start = iter.first
|
125
|
+
segments = []
|
126
|
+
|
127
|
+
until (stop = iter.next) == done
|
128
|
+
segments << text_sample[start...stop]
|
129
|
+
start = stop
|
130
|
+
end
|
131
|
+
|
132
|
+
{
|
133
|
+
locale: locale,
|
134
|
+
text: text_sample,
|
135
|
+
segments: segments
|
136
|
+
}
|
137
|
+
end
|
138
|
+
|
139
|
+
def dump_dictionary_break_test(name, data)
|
140
|
+
output_file = dictionary_test_output_path_for(name)
|
141
|
+
FileUtils.mkdir_p(File.dirname(output_file))
|
142
|
+
File.write(output_file, YAML.dump(data))
|
143
|
+
end
|
144
|
+
|
145
|
+
def conformance_source_path_for(conformance_file)
|
146
|
+
requirements[:unicode].source_path_for(conformance_file)
|
147
|
+
end
|
148
|
+
|
149
|
+
def conformance_output_path_for(conformance_file)
|
150
|
+
file = underscore(File.basename(conformance_file).chomp(File.extname(conformance_file)))
|
46
151
|
File.join(params.fetch(:output_path), "#{file}.yml")
|
47
152
|
end
|
48
153
|
|
154
|
+
def dictionary_test_output_path_for(locale)
|
155
|
+
File.join(params.fetch(:output_path), 'dictionary_tests', "#{locale}.yml")
|
156
|
+
end
|
157
|
+
|
49
158
|
def underscore(str)
|
50
159
|
str.gsub(/(.)([A-Z])/, '\1_\2').downcase
|
51
160
|
end
|
52
161
|
|
162
|
+
def ulocale_class
|
163
|
+
@ulocale_class ||= requirements[:icu].get_class('com.ibm.icu.util.ULocale')
|
164
|
+
end
|
165
|
+
|
166
|
+
def break_iterator
|
167
|
+
@break_iterator ||= requirements[:icu].get_class('com.ibm.icu.text.BreakIterator')
|
168
|
+
end
|
169
|
+
|
53
170
|
end
|
54
171
|
end
|
55
172
|
end
|