twitter_cldr 5.2.0 → 5.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +0 -4
- data/Rakefile +19 -8
- data/lib/twitter_cldr/normalization.rb +18 -5
- data/lib/twitter_cldr/resources.rb +3 -1
- data/lib/twitter_cldr/resources/import_resolver.rb +11 -3
- data/lib/twitter_cldr/resources/loader.rb +22 -1
- data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -9
- data/lib/twitter_cldr/resources/postal_codes_importer.rb +19 -23
- data/lib/twitter_cldr/resources/segment_dictionaries_importer.rb +75 -0
- data/lib/twitter_cldr/resources/segment_tests_importer.rb +130 -13
- data/lib/twitter_cldr/segmentation.rb +25 -10
- data/lib/twitter_cldr/segmentation/brahmic_break_engine.rb +200 -0
- data/lib/twitter_cldr/segmentation/break_iterator.rb +22 -22
- data/lib/twitter_cldr/segmentation/burmese_break_engine.rb +83 -0
- data/lib/twitter_cldr/segmentation/category_table.rb +5 -1
- data/lib/twitter_cldr/segmentation/cj_break_engine.rb +163 -0
- data/lib/twitter_cldr/segmentation/cursor.rb +1 -1
- data/lib/twitter_cldr/segmentation/dictionary.rb +84 -0
- data/lib/twitter_cldr/segmentation/dictionary_break_engine.rb +34 -0
- data/lib/twitter_cldr/segmentation/khmer_break_engine.rb +83 -0
- data/lib/twitter_cldr/segmentation/korean_break_engine.rb +30 -0
- data/lib/twitter_cldr/segmentation/lao_break_engine.rb +85 -0
- data/lib/twitter_cldr/segmentation/line_iterator.rb +23 -0
- data/lib/twitter_cldr/segmentation/possible_word.rb +74 -0
- data/lib/twitter_cldr/segmentation/possible_word_list.rb +23 -0
- data/lib/twitter_cldr/segmentation/rule_set.rb +3 -12
- data/lib/twitter_cldr/segmentation/segment_iterator.rb +40 -0
- data/lib/twitter_cldr/segmentation/state_machine.rb +2 -8
- data/lib/twitter_cldr/segmentation/thai_break_engine.rb +141 -0
- data/lib/twitter_cldr/segmentation/unhandled_break_engine.rb +21 -0
- data/lib/twitter_cldr/segmentation/word_iterator.rb +170 -0
- data/lib/twitter_cldr/shared.rb +1 -0
- data/lib/twitter_cldr/shared/caser.rb +3 -3
- data/lib/twitter_cldr/shared/unicode_set.rb +77 -0
- data/lib/twitter_cldr/utils/range_set.rb +10 -1
- data/lib/twitter_cldr/version.rb +1 -1
- data/resources/collation/tailoring/km.yml +82 -0
- data/resources/collation/tailoring/lo.yml +4 -0
- data/resources/collation/tailoring/my.yml +940 -0
- data/resources/collation/tries/km.dump +0 -0
- data/resources/collation/tries/lo.dump +0 -0
- data/resources/collation/tries/my.dump +0 -0
- data/resources/locales/km/calendars.yml +373 -0
- data/resources/locales/km/currencies.yml +654 -0
- data/resources/locales/km/day_periods.yml +96 -0
- data/resources/locales/km/fields.yml +495 -0
- data/resources/locales/km/languages.yml +397 -0
- data/resources/locales/km/layout.yml +5 -0
- data/resources/locales/km/lists.yml +37 -0
- data/resources/locales/km/numbers.yml +402 -0
- data/resources/locales/km/plural_rules.yml +6 -0
- data/resources/locales/km/plurals.yml +12 -0
- data/resources/locales/km/rbnf.yml +131 -0
- data/resources/locales/km/territories.yml +267 -0
- data/resources/locales/km/timezones.yml +1471 -0
- data/resources/locales/km/units.yml +721 -0
- data/resources/locales/lo/calendars.yml +368 -0
- data/resources/locales/lo/currencies.yml +918 -0
- data/resources/locales/lo/day_periods.yml +96 -0
- data/resources/locales/lo/fields.yml +437 -0
- data/resources/locales/lo/languages.yml +529 -0
- data/resources/locales/lo/layout.yml +5 -0
- data/resources/locales/lo/lists.yml +42 -0
- data/resources/locales/lo/numbers.yml +476 -0
- data/resources/locales/lo/plural_rules.yml +7 -0
- data/resources/locales/lo/plurals.yml +14 -0
- data/resources/locales/lo/rbnf.yml +119 -0
- data/resources/locales/lo/territories.yml +265 -0
- data/resources/locales/lo/timezones.yml +1513 -0
- data/resources/locales/lo/units.yml +750 -0
- data/resources/locales/my/calendars.yml +374 -0
- data/resources/locales/my/currencies.yml +697 -0
- data/resources/locales/my/day_periods.yml +96 -0
- data/resources/locales/my/fields.yml +459 -0
- data/resources/locales/my/languages.yml +420 -0
- data/resources/locales/my/layout.yml +5 -0
- data/resources/locales/my/lists.yml +43 -0
- data/resources/locales/my/numbers.yml +417 -0
- data/resources/locales/my/plural_rules.yml +6 -0
- data/resources/locales/my/plurals.yml +12 -0
- data/resources/locales/my/rbnf.yml +145 -0
- data/resources/locales/my/territories.yml +265 -0
- data/resources/locales/my/timezones.yml +1479 -0
- data/resources/locales/my/units.yml +759 -0
- data/resources/locales/th/plurals.yml +1 -1
- data/resources/shared/segments/dictionaries/burmesedict.dump +0 -0
- data/resources/shared/segments/dictionaries/cjdict.dump +0 -0
- data/resources/shared/segments/dictionaries/khmerdict.dump +0 -0
- data/resources/shared/segments/dictionaries/laodict.dump +0 -0
- data/resources/shared/segments/dictionaries/thaidict.dump +0 -0
- data/resources/shared/segments/tests/dictionary_tests/combined.yml +1253 -0
- data/resources/shared/segments/tests/dictionary_tests/km.yml +204 -0
- data/resources/shared/segments/tests/dictionary_tests/ko.yml +171 -0
- data/resources/shared/segments/tests/dictionary_tests/lo.yml +236 -0
- data/resources/shared/segments/tests/dictionary_tests/my.yml +249 -0
- data/resources/shared/segments/tests/dictionary_tests/th.yml +201 -0
- data/resources/shared/segments/tests/dictionary_tests/zh.yml +206 -0
- data/resources/shared/segments/tests/line_break_test.yml +68 -68
- data/resources/shared/segments/tests/sentence_break_test.yml +52 -52
- data/resources/supported_locales.yml +3 -0
- data/spec/formatters/numbers/rbnf/locales/km/rbnf_test.yml +706 -0
- data/spec/formatters/numbers/rbnf/locales/lo/rbnf_test.yml +706 -0
- data/spec/formatters/numbers/rbnf/locales/my/rbnf_test.yml +706 -0
- data/spec/segmentation/dictionary_break_spec.rb +42 -0
- data/spec/segmentation/rule_set_spec.rb +3 -1
- data/spec/timezones/tests/km.yml +12475 -0
- data/spec/timezones/tests/lo.yml +12475 -0
- data/spec/timezones/tests/my.yml +12475 -0
- metadata +87 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 2783ec225d4f260deb8038237125dbc97b78840e951cbff630f3da502e31f41d
|
|
4
|
+
data.tar.gz: 751cc8931ee11db35a533584a1b9f3d8a946e9505641d33caad6a5a5dbc6e866
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7cdc1ec2718ac86b0645813fc0bdfe13b1c2bee075ca90e13231a3d566d915e7ef9bcdf6d69f8d0a7cbcdaf6868eae5c27bfb4b32b52efbd7dcf1adabc20c39d
|
|
7
|
+
data.tar.gz: 20b7e24ec990cc00fb77d60d62c75247108748629c5478a0a315d96e04a76c443f8abc2a4df21b7c4093ad7cc46ee81c4276ca15f97c8698073b7ac9bca98c65
|
data/Gemfile
CHANGED
data/Rakefile
CHANGED
|
@@ -57,16 +57,22 @@ task :update do
|
|
|
57
57
|
end
|
|
58
58
|
|
|
59
59
|
task :add_locale, :locale do |_, args|
|
|
60
|
+
locales = [args[:locale]] + args.extras
|
|
61
|
+
|
|
60
62
|
File.write(
|
|
61
63
|
TwitterCldr::SUPPORTED_LOCALES_FILE,
|
|
62
64
|
YAML.dump(
|
|
63
|
-
(TwitterCldr::SUPPORTED_LOCALES +
|
|
65
|
+
(TwitterCldr::SUPPORTED_LOCALES + locales).map(&:to_sym).uniq.sort
|
|
64
66
|
)
|
|
65
67
|
)
|
|
66
68
|
|
|
67
69
|
klasses = TwitterCldr::Resources.locale_based_importer_classes_for_ruby_engine
|
|
68
|
-
instances = klasses.map { |klass| klass.new(locales:
|
|
69
|
-
TwitterCldr::Resources::ImportResolver.new(
|
|
70
|
+
instances = klasses.map { |klass| klass.new(locales: locales) }
|
|
71
|
+
resolver = TwitterCldr::Resources::ImportResolver.new(
|
|
72
|
+
instances, allow_missing_dependencies: true
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
resolver.import
|
|
70
76
|
end
|
|
71
77
|
|
|
72
78
|
# add_locale and update_locale do the same thing
|
|
@@ -160,6 +166,16 @@ namespace :update do
|
|
|
160
166
|
TwitterCldr::Resources::SegmentRulesImporter.new.import
|
|
161
167
|
end
|
|
162
168
|
|
|
169
|
+
desc 'Import segmentation dictionaries'
|
|
170
|
+
task :segment_dictionaries do
|
|
171
|
+
TwitterCldr::Resources::SegmentDictionariesImporter.new.import
|
|
172
|
+
end
|
|
173
|
+
|
|
174
|
+
desc 'Import segment tests'
|
|
175
|
+
task :segment_tests do
|
|
176
|
+
TwitterCldr::Resources::SegmentTestsImporter.new.import
|
|
177
|
+
end
|
|
178
|
+
|
|
163
179
|
desc 'Import (generate) bidi tests (should be executed using JRuby 1.7 in 1.9 mode)'
|
|
164
180
|
task :bidi_tests do
|
|
165
181
|
TwitterCldr::Resources::BidiTestImporter.new.import
|
|
@@ -200,11 +216,6 @@ namespace :update do
|
|
|
200
216
|
TwitterCldr::Resources::TransformTestsImporter.new.import
|
|
201
217
|
end
|
|
202
218
|
|
|
203
|
-
desc 'Import segment tests'
|
|
204
|
-
task :segment_tests do
|
|
205
|
-
TwitterCldr::Resources::SegmentTestsImporter.new.import
|
|
206
|
-
end
|
|
207
|
-
|
|
208
219
|
desc 'Import hyphenation dictionaries'
|
|
209
220
|
task :hyphenation_dictionaries do
|
|
210
221
|
TwitterCldr::Resources::HyphenationImporter.new.import
|
|
@@ -14,12 +14,25 @@ module TwitterCldr
|
|
|
14
14
|
class << self
|
|
15
15
|
|
|
16
16
|
def normalize(string, options = {})
|
|
17
|
-
form = options
|
|
17
|
+
validate_form(form = extract_form_from(options))
|
|
18
|
+
Eprun.normalize(string, form)
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def normalized?(string, options = {})
|
|
22
|
+
validate_form(form = extract_form_from(options))
|
|
23
|
+
Eprun.normalized?(string, form)
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
private
|
|
27
|
+
|
|
28
|
+
def extract_form_from(options)
|
|
29
|
+
options.fetch(:using, DEFAULT_NORMALIZER).to_s.downcase.to_sym
|
|
30
|
+
end
|
|
18
31
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
32
|
+
def validate_form(form)
|
|
33
|
+
unless VALID_NORMALIZERS.include?(form)
|
|
34
|
+
raise ArgumentError.new("#{form.inspect} is not a valid normalizer "\
|
|
35
|
+
"(valid normalizers are #{VALID_NORMALIZERS.join(', ')})")
|
|
23
36
|
end
|
|
24
37
|
end
|
|
25
38
|
|
|
@@ -22,12 +22,13 @@ module TwitterCldr
|
|
|
22
22
|
autoload :NumberFormatsImporter, 'twitter_cldr/resources/number_formats_importer'
|
|
23
23
|
autoload :PostalCodesImporter, 'twitter_cldr/resources/postal_codes_importer'
|
|
24
24
|
autoload :Properties, 'twitter_cldr/resources/properties'
|
|
25
|
+
autoload :SegmentDictionariesImporter, 'twitter_cldr/resources/segment_dictionaries_importer'
|
|
25
26
|
autoload :SegmentRulesImporter, 'twitter_cldr/resources/segment_rules_importer'
|
|
27
|
+
autoload :SegmentTestsImporter, 'twitter_cldr/resources/segment_tests_importer'
|
|
26
28
|
autoload :RbnfTestImporter, 'twitter_cldr/resources/rbnf_test_importer'
|
|
27
29
|
autoload :ReadmeRenderer, 'twitter_cldr/resources/readme_renderer'
|
|
28
30
|
autoload :RegexpAstGenerator, 'twitter_cldr/resources/regexp_ast_generator'
|
|
29
31
|
autoload :Requirements, 'twitter_cldr/resources/requirements'
|
|
30
|
-
autoload :SegmentTestsImporter, 'twitter_cldr/resources/segment_tests_importer'
|
|
31
32
|
autoload :TailoringImporter, 'twitter_cldr/resources/tailoring_importer'
|
|
32
33
|
autoload :TerritoriesImporter, 'twitter_cldr/resources/territories_importer'
|
|
33
34
|
autoload :TimezonesImporter, 'twitter_cldr/resources/timezones_importer'
|
|
@@ -60,6 +61,7 @@ module TwitterCldr
|
|
|
60
61
|
NumberFormatsImporter,
|
|
61
62
|
PostalCodesImporter,
|
|
62
63
|
RbnfTestImporter,
|
|
64
|
+
SegmentDictionariesImporter,
|
|
63
65
|
SegmentRulesImporter,
|
|
64
66
|
SegmentTestsImporter,
|
|
65
67
|
TailoringImporter,
|
|
@@ -5,10 +5,11 @@ module TwitterCldr
|
|
|
5
5
|
class ImportResolver
|
|
6
6
|
include TSort
|
|
7
7
|
|
|
8
|
-
attr_reader :importers
|
|
8
|
+
attr_reader :importers, :options
|
|
9
9
|
|
|
10
|
-
def initialize(importers = Resources.importer_classes_for_ruby_engine)
|
|
10
|
+
def initialize(importers = Resources.importer_classes_for_ruby_engine, options = {})
|
|
11
11
|
@importers = importers
|
|
12
|
+
@options = options
|
|
12
13
|
end
|
|
13
14
|
|
|
14
15
|
def import
|
|
@@ -28,7 +29,12 @@ module TwitterCldr
|
|
|
28
29
|
|
|
29
30
|
def tsort_each_child(instance, &block)
|
|
30
31
|
deps_for(instance).map do |dep_class|
|
|
31
|
-
|
|
32
|
+
dep = instances.find { |ins| ins.class == dep_class }
|
|
33
|
+
yield dep if dep
|
|
34
|
+
|
|
35
|
+
unless options[:allow_missing_dependencies]
|
|
36
|
+
raise "Could not find dependency #{dep_class.name}"
|
|
37
|
+
end
|
|
32
38
|
end
|
|
33
39
|
end
|
|
34
40
|
|
|
@@ -39,6 +45,8 @@ module TwitterCldr
|
|
|
39
45
|
end
|
|
40
46
|
|
|
41
47
|
def check_unmet_instance_deps(instance)
|
|
48
|
+
return if options[:allow_missing_dependencies]
|
|
49
|
+
|
|
42
50
|
unmet_deps = unmet_deps_for(instance)
|
|
43
51
|
|
|
44
52
|
unless unmet_deps.empty?
|
|
@@ -10,6 +10,8 @@ module TwitterCldr
|
|
|
10
10
|
|
|
11
11
|
class Loader
|
|
12
12
|
|
|
13
|
+
VALID_EXTS = %w(.yml .dump).freeze
|
|
14
|
+
|
|
13
15
|
def get_resource(*path)
|
|
14
16
|
resources_cache[resource_file_path(path)]
|
|
15
17
|
end
|
|
@@ -75,7 +77,7 @@ module TwitterCldr
|
|
|
75
77
|
|
|
76
78
|
def resource_file_path(path)
|
|
77
79
|
file = File.join(*path.map(&:to_s))
|
|
78
|
-
file << '.yml' unless
|
|
80
|
+
file << '.yml' unless VALID_EXTS.include?(File.extname(file))
|
|
79
81
|
file
|
|
80
82
|
end
|
|
81
83
|
|
|
@@ -92,6 +94,17 @@ module TwitterCldr
|
|
|
92
94
|
end
|
|
93
95
|
|
|
94
96
|
def load_resource(path, merge_custom = true)
|
|
97
|
+
case File.extname(path)
|
|
98
|
+
when '.yml'
|
|
99
|
+
load_yaml_resource(path, merge_custom)
|
|
100
|
+
when '.dump'
|
|
101
|
+
load_marshalled_resource(path, merge_custom)
|
|
102
|
+
else
|
|
103
|
+
load_raw_resource(path, merge_custom)
|
|
104
|
+
end
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
def load_yaml_resource(path, merge_custom = true)
|
|
95
108
|
base = YAML.load(read_resource_file(path))
|
|
96
109
|
custom_path = File.join("custom", path)
|
|
97
110
|
|
|
@@ -102,6 +115,14 @@ module TwitterCldr
|
|
|
102
115
|
base
|
|
103
116
|
end
|
|
104
117
|
|
|
118
|
+
def load_marshalled_resource(path, _merge_custom = :unused)
|
|
119
|
+
Marshal.load(read_resource_file(path))
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
def load_raw_resource(path, _merge_custom = :unused)
|
|
123
|
+
read_resource_file(path)
|
|
124
|
+
end
|
|
125
|
+
|
|
105
126
|
def custom_resource_exists?(custom_path)
|
|
106
127
|
File.exist?(
|
|
107
128
|
File.join(TwitterCldr::RESOURCES_DIR, custom_path)
|
|
@@ -56,13 +56,6 @@ module TwitterCldr
|
|
|
56
56
|
params.fetch(:output_path)
|
|
57
57
|
end
|
|
58
58
|
|
|
59
|
-
def move_segments_root_file
|
|
60
|
-
old_file_path = File.join(output_path, *%w(shared segments_root.yml))
|
|
61
|
-
new_file_path = File.join(output_path, *%w(shared segments segments_root.yml))
|
|
62
|
-
FileUtils.mkdir_p(File.dirname(new_file_path))
|
|
63
|
-
FileUtils.move(old_file_path, new_file_path)
|
|
64
|
-
end
|
|
65
|
-
|
|
66
59
|
def import_components
|
|
67
60
|
locales = Set.new
|
|
68
61
|
|
|
@@ -100,8 +93,6 @@ module TwitterCldr
|
|
|
100
93
|
Cldr::Export.export(export_args) do |component, locale, path|
|
|
101
94
|
deep_symbolize(path)
|
|
102
95
|
end
|
|
103
|
-
|
|
104
|
-
move_segments_root_file
|
|
105
96
|
end
|
|
106
97
|
|
|
107
98
|
def components_for(locale)
|
|
@@ -3,8 +3,8 @@
|
|
|
3
3
|
# Copyright 2012 Twitter, Inc
|
|
4
4
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
5
5
|
|
|
6
|
-
require 'rest-client'
|
|
7
6
|
require 'json'
|
|
7
|
+
require 'open-uri'
|
|
8
8
|
require 'set'
|
|
9
9
|
require 'yaml'
|
|
10
10
|
|
|
@@ -21,38 +21,36 @@ module TwitterCldr
|
|
|
21
21
|
private
|
|
22
22
|
|
|
23
23
|
def execute
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
24
|
+
data = YAML.dump(fetch_data)
|
|
25
|
+
File.write(File.join(output_path, 'postal_codes.yml'), data)
|
|
26
|
+
puts
|
|
27
27
|
end
|
|
28
28
|
|
|
29
29
|
def output_path
|
|
30
30
|
params.fetch(:output_path)
|
|
31
31
|
end
|
|
32
32
|
|
|
33
|
-
def
|
|
33
|
+
def fetch_data
|
|
34
34
|
territories = Set.new
|
|
35
35
|
|
|
36
36
|
each_territory.each_with_object({}) do |territory, ret|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
37
|
+
if regex = get_regex_for(territory)
|
|
38
|
+
ret[territory] = {
|
|
39
|
+
regex: Regexp.compile(regex),
|
|
40
|
+
ast: TwitterCldr::Utils::RegexpAst.dump(
|
|
41
|
+
RegexpAstGenerator.generate(regex)
|
|
42
|
+
)
|
|
43
|
+
}
|
|
44
|
+
end
|
|
45
45
|
|
|
46
46
|
territories.add(territory)
|
|
47
47
|
STDOUT.write("\rImported postal codes for #{territory}, #{territories.size} of #{territory_count} total")
|
|
48
48
|
end
|
|
49
|
-
|
|
50
|
-
puts
|
|
51
49
|
end
|
|
52
50
|
|
|
53
51
|
def get_regex_for(territory)
|
|
54
|
-
result =
|
|
55
|
-
data = JSON.parse(result
|
|
52
|
+
result = URI.open("#{BASE_URL}#{territory.to_s.upcase}").read
|
|
53
|
+
data = JSON.parse(result)
|
|
56
54
|
data['zip']
|
|
57
55
|
end
|
|
58
56
|
|
|
@@ -61,12 +59,10 @@ module TwitterCldr
|
|
|
61
59
|
end
|
|
62
60
|
|
|
63
61
|
def each_territory
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
else
|
|
69
|
-
to_enum(__method__)
|
|
62
|
+
return to_enum(__method__) unless block_given?
|
|
63
|
+
|
|
64
|
+
TwitterCldr::Shared::Territories.all.each_pair do |territory, _|
|
|
65
|
+
yield territory
|
|
70
66
|
end
|
|
71
67
|
end
|
|
72
68
|
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# encoding: UTF-8
|
|
2
|
+
|
|
3
|
+
# Copyright 2012 Twitter, Inc
|
|
4
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
5
|
+
|
|
6
|
+
require 'fileutils'
|
|
7
|
+
require 'open-uri'
|
|
8
|
+
|
|
9
|
+
module TwitterCldr
|
|
10
|
+
module Resources
|
|
11
|
+
class SegmentDictionariesImporter < Importer
|
|
12
|
+
|
|
13
|
+
URL_TEMPLATE = 'https://raw.githubusercontent.com/unicode-org/icu/%{icu_version}/%{path}'
|
|
14
|
+
|
|
15
|
+
DICTIONARY_FILES = [
|
|
16
|
+
'icu4c/source/data/brkitr/dictionaries/burmesedict.txt',
|
|
17
|
+
'icu4c/source/data/brkitr/dictionaries/cjdict.txt',
|
|
18
|
+
'icu4c/source/data/brkitr/dictionaries/khmerdict.txt',
|
|
19
|
+
'icu4c/source/data/brkitr/dictionaries/laodict.txt',
|
|
20
|
+
'icu4c/source/data/brkitr/dictionaries/thaidict.txt'
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
output_path File.join(*%w(shared segments dictionaries))
|
|
24
|
+
ruby_engine :mri
|
|
25
|
+
|
|
26
|
+
def execute
|
|
27
|
+
FileUtils.mkdir_p(output_path)
|
|
28
|
+
|
|
29
|
+
DICTIONARY_FILES.each do |test_file|
|
|
30
|
+
import_dictionary_file(test_file)
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
private
|
|
35
|
+
|
|
36
|
+
def import_dictionary_file(dictionary_file)
|
|
37
|
+
source_url = url_for(dictionary_file)
|
|
38
|
+
source = open(source_url).read
|
|
39
|
+
lines = source.split("\n")
|
|
40
|
+
trie = TwitterCldr::Utils::Trie.new
|
|
41
|
+
space_regexp = TwitterCldr::Shared::UnicodeRegex.compile('\A[[:Z:][:C:]]+').to_regexp
|
|
42
|
+
|
|
43
|
+
lines.each do |line|
|
|
44
|
+
line.sub!(space_regexp, '')
|
|
45
|
+
next if line.start_with?('#')
|
|
46
|
+
|
|
47
|
+
characters, frequency = line.split("\t")
|
|
48
|
+
frequency = frequency ? frequency.to_i : 0
|
|
49
|
+
|
|
50
|
+
trie.add(characters.unpack('U*'), frequency)
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
output_path = output_path_for(dictionary_file)
|
|
54
|
+
File.write(output_path, Marshal.dump(trie))
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def url_for(dictionary_file)
|
|
58
|
+
URL_TEMPLATE % {
|
|
59
|
+
icu_version: "release-#{Versions.icu_version.gsub('.', '-')}",
|
|
60
|
+
path: dictionary_file
|
|
61
|
+
}
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
def output_path_for(dictionary_file)
|
|
65
|
+
file = File.basename(dictionary_file).chomp(File.extname(dictionary_file))
|
|
66
|
+
File.join(output_path, "#{file}.dump")
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
def output_path
|
|
70
|
+
params.fetch(:output_path)
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
end
|
|
74
|
+
end
|
|
75
|
+
end
|
|
@@ -9,47 +9,164 @@ module TwitterCldr
|
|
|
9
9
|
module Resources
|
|
10
10
|
class SegmentTestsImporter < Importer
|
|
11
11
|
|
|
12
|
-
|
|
12
|
+
CONFORMANCE_FILES = [
|
|
13
13
|
'ucd/auxiliary/WordBreakTest.txt',
|
|
14
14
|
'ucd/auxiliary/SentenceBreakTest.txt',
|
|
15
15
|
'ucd/auxiliary/GraphemeBreakTest.txt',
|
|
16
16
|
'ucd/auxiliary/LineBreakTest.txt'
|
|
17
17
|
]
|
|
18
18
|
|
|
19
|
-
|
|
19
|
+
DICTIONARY_BREAK_SAMPLES = {
|
|
20
|
+
# Chinese
|
|
21
|
+
zh: '無畏號航空母艦是一艘隸屬於美國海軍的航空母艦,為艾塞克斯級航空母艦的三號艦。'\
|
|
22
|
+
'無畏號於1941年開始建造,1943年下水服役,開始參與太平洋戰爭。戰後無畏號退役封存,'\
|
|
23
|
+
'在韓戰後開始進行SCB-27C改建,又在期間重編為攻擊航母,於1954年在大西洋艦隊重新服役。'\
|
|
24
|
+
'稍後無畏號又進行SCB-125現代化改建,增設斜角飛行甲板。1962年無畏號重編為反潛航母,'\
|
|
25
|
+
'舷號改為CVS-11,繼續留在大西洋及地中海執勤。稍後無畏號參與美國的太空計畫,'\
|
|
26
|
+
'分別擔任水星-宇宙神7號及雙子座3號的救援船。1966年至1969年,無畏號曾三次前往西太平洋,'\
|
|
27
|
+
'參與越戰。無畏號在1974年退役,並一度預備出售拆解;但在民間組織努力下,'\
|
|
28
|
+
'海軍在1981年將無畏號捐贈到紐約作博物館艦。1986年,無畏號獲評為美國國家歷史地標。',
|
|
29
|
+
|
|
30
|
+
ko: '파일은 이용자가 공용 또는 위키백과 한국어판에 업로드하여 라이선스에 따라 사용 가능한 형태로 제공됩니다. '\
|
|
31
|
+
'업로드된 파일은 간단한 조작으로 페이지에 삽입할 수 있습니다. 업로드는 신규 이용자를 제외한 등록 이용자라면 '\
|
|
32
|
+
'가능합니다. 파일을 업로드하기 전에 다음 문단의 업로드를 할 수 없는 파일을 반드시 읽어 보시기 바랍니다. '\
|
|
33
|
+
'공용 이용 방법 및 업로드에 대해서는 Commons:초보자 길라잡이/업로드를 읽어 보시기 바랍니다. 업로드하는 '\
|
|
34
|
+
'페이지는 위키백과:파일 올리기를 참조하십시오. 파일의 라이선스가 삽입되는 위키백과의 문서와는 별도로 '\
|
|
35
|
+
'개별적으로 설정해야 합니다. 파일을 업로드할 때 적절한 라이선스를 선택하고 반드시 표시하십시오.',
|
|
36
|
+
|
|
37
|
+
# Thai
|
|
38
|
+
th: 'ธงไชย แมคอินไตย์ ชื่อเล่น เบิร์ด (เกิด 8 ธันวาคม พ.ศ. 2501) เป็นนักร้อง นักแสดงชาวไทย '\
|
|
39
|
+
'ได้รับขนานนามว่าเป็น "ซูเปอร์สตาร์เมืองไทย" โดยคนไทยรู้จักกันดี เรียกกันว่า : พี่เบิร์ด '\
|
|
40
|
+
'แรกเข้าวงการบันเทิงเป็นนักแสดงสมทบ ต่อมาได้รับบทพระเอก โดยภาพยนตร์ที่สร้างชื่อเสียงให้กับเขาที่สุดเรื่อง '\
|
|
41
|
+
'ด้วยรักคือรัก ส่วนละครที่สร้างชื่อเสียงที่สุดของเขาคือบท "โกโบริ" ในละครคู่กรรม '\
|
|
42
|
+
'ด้านวงการเพลงซึ่งเป็นอาชีพหลักเขาเริ่มต้นจากการประกวดร้องเพลงของสยามกลการ '\
|
|
43
|
+
'ต่อมาเป็นนักร้องในสังกัดบริษัท จีเอ็มเอ็ม แกรมมี่ จำกัด (มหาชน) ซึ่งประสบความสำเร็จสูงสุดของประเทศไทย'\
|
|
44
|
+
'มียอดจำหน่ายอยู่ในระดับแนวหน้าของทวีปเอเชียยอดรวมกว่า 25 ล้านชุด',
|
|
45
|
+
|
|
46
|
+
# Khmer
|
|
47
|
+
km: 'វីគីភីឌា (អង់គ្លេស ៖ Wikipedia) ជាសព្វវចនាធិប្បាយសេរីច្រើនភាសានៅលើអ៊ីនធឺណិត '\
|
|
48
|
+
'ដែលមនុស្សគ្រប់គ្នាអាចអាននិងធ្វើឱ្យមាតិកាទាន់សម័យបន្ថែមទៀត '\
|
|
49
|
+
'ធ្វើឱ្យវីគីភីឌាសព្វវចនាធិប្បាយបានក្លាយទៅជាការកែប្រែ '\
|
|
50
|
+
'ការប្រមូលនិងការអភិរក្សរាប់រយរាប់ពាន់នាក់នៃអ្នកស្ម័គ្រចិត្តនៅជុំវិញពិភពលោក '\
|
|
51
|
+
'តាមរយៈកម្មវិធីដែលគេហៅថាមេឌាវិគី ។ វីគីភីឌាចាប់ផ្តើមនៅថ្ងៃទី ១៥ មករា ឆ្នាំ ២០០១ '\
|
|
52
|
+
'ដោយចាប់ផ្តើមគម្រោងពីឈ្មោះសព្វវចនាធិប្បាយណូ៉ភីឌាដែលសរសេរដោយហ្ស៊ីម្ម៊ី '\
|
|
53
|
+
'វេល្ស និងឡែរ្រី សែងក័រ ។ នៅបច្ចុប្បន្ននេះ វីគីភីឌាមានទាំង់អស់ ២៩៣ ភាសា[៤] ដោយវីគីភីឌាភាសាខ្មែរមាន '\
|
|
54
|
+
'៧៨៩៨ អត្ថបទ ។ មានវីគីភីឌាច្រើនជាង ៥០ ភាសាដែលមានអត្ថបទច្រើនជាង ១០០.០០០ អត្ថបទ ។ '\
|
|
55
|
+
'វីគីភីឌាភាសាអាល្លឺម៉ងត្រូវបានគេចែកចាយនៅក្នុងទ្រង់ទ្រាយឌីវីឌី-រ៉ូម ។',
|
|
56
|
+
|
|
57
|
+
# Lao
|
|
58
|
+
lo: 'ວິກິພີເດຍ (ອັງກິດ: Wikipedia) ເປັນສາລະນຸກົມເນື້ອຫາເສລີຫຼາຍພາສາໃນເວັບໄຊ້ '\
|
|
59
|
+
'ເຊິ່ງໄດ້ຮັບການສະໜັບສະໜຸນຈາກມູນລະນິທິວິກິພີເດຍ ອົງກອນບໍ່ສະແຫວງຫາຜົນກຳໄລ ເນື້ອຫາກວ່າ 35 ລ້ານບົດຄວາມ '\
|
|
60
|
+
'(ສະເພາະວິກິພີເດຍພາສາອັງກິດມີເນື້ອຫາກວ່າ 4.9 ລ້ານບົດຄວາມ) ເກີດຂຶ້ນຈາກການຮ່ວມຂຽນຂອງອາສາສະໝັກທົ່ວໂລກ '\
|
|
61
|
+
'ທຸກຄົນທີ່ສາມາດເຂົ້າເຖິງວິກິພີເດຍສາມາດຮ່ວມແກ້ໄຂເກືອບທຸກບົດຄວາມໄດ້ຢ່າງເສລີ ໂດຍມີຜູ້ຂຽນປະມານ 100,000ຄົນ '\
|
|
62
|
+
'ຈົນເຖິງເດືອນເມສາ ຄ.ສ. 2013 ວິກິພີເດຍມີ 286 ຮຸ່ນພາສາ ແລະ '\
|
|
63
|
+
'ໄດ້ກາຍມາເປັນງານອ້າງອິງທົ່ວໄປທີ່ໃກຍ່ທີ່ສຸດແລະໄດ້ຮັບຄວາມນິຍົມຫຼາຍທີ່ສຸດຢູ່ອິນເຕີເນັດ ຈົນຖືກຈັດເປັນເວັບໄຊ້ ອັນດັບທີ 6 '\
|
|
64
|
+
'ທີ່ມີຜູ້ເຂົ້າເບິ່ງຫຼາຍທີ່ສຸດໃນໂລກ ຕາມການຈັດອັນດັບຂອງອາເລັກຊ້າ ດ້ວຍຈຳນວນຜູ້ອ່ານກວ່າ 365 ລ້ານຄົນ '\
|
|
65
|
+
'ມີການປະເມີນວ່າວິກິພີເດຍມີການຄົ້ນຫາຂໍ້ມູນໃນວິກິພີເດຍກວ່າ 2,700 ລ້ານເທື່ອຕໍ່ເດືອນໃນສະຫະລັດ ອາເມຣິກາ',
|
|
66
|
+
|
|
67
|
+
# Burmese
|
|
68
|
+
my: 'ကိန်းဆိုသည်မှာ ရေတွက်ရန်နှင့် တိုင်းတာရန် အတွက် အသုံးပြုသော သင်္ချာဆိုင်ရာ အရာဝတ္ထုတစ်ခု '\
|
|
69
|
+
'ဖြစ်သည်။ သင်္ချာပညာတွင် ကိန်းဂဏန်းများ၏ အဓိပ္ပာယ်ဖွင့်ဆိုချက်ကို တဖြည်းဖြည်း ချဲ့ကားလာခဲ့သဖြင့် '\
|
|
70
|
+
'နှစ်ပေါင်းများစွာ ကြာသောအခါတွင် သုည၊ အနှုတ်ကိန်းများ (negative numbers)၊ ရာရှင်နယ်ကိန်း '\
|
|
71
|
+
'(rational number) ခေါ် အပိုင်းကိန်းများ၊ အီရာရှင်နယ်ကိန်း (irrational number) ခေါ် '\
|
|
72
|
+
'အပိုင်းကိန်းမဟုတ်သောကိန်းများ နှင့် ကွန်ပလက်စ်ကိန်း (complex number) ခေါ် ကိန်းရှုပ်များ စသည်ဖြင့် '\
|
|
73
|
+
'ပါဝင်လာကြသည်။ သင်္ချာဆိုင်ရာ တွက်ချက်မှုများ (mathematical operations) တွင် ဂဏန်းတစ်ခု '\
|
|
74
|
+
'သို့မဟုတ် တစ်ခုထက်ပိုသော ဂဏန်းများကို အဝင်ကိန်းအဖြစ် လက်ခံကြပြီး ဂဏန်းတစ်ခုကို အထွက်ကိန်း '\
|
|
75
|
+
'အဖြစ် ပြန်ထုတ်ပေးသည်။ ယူနရီ တွက်ချက်မှု (unary operation) ခေါ် တစ်လုံးသွင်းတွက်ချက်မှုတွင် '\
|
|
76
|
+
'ဂဏန်းတစ်ခုကို အဝင်ကိန်း အဖြစ် လက်ခံပြီး ဂဏန်းတစ်ခုကို အထွက်ကိန်း အဖြစ် ထုတ်ပေးသည်။ '
|
|
77
|
+
}.freeze
|
|
78
|
+
|
|
79
|
+
requirement :unicode, Versions.unicode_version, CONFORMANCE_FILES
|
|
80
|
+
requirement :icu, Versions.icu_version
|
|
20
81
|
output_path 'shared/segments/tests'
|
|
21
|
-
ruby_engine :
|
|
82
|
+
ruby_engine :jruby
|
|
22
83
|
|
|
23
84
|
def execute
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
85
|
+
import_conformance_files
|
|
86
|
+
import_dictionary_break_tests
|
|
87
|
+
import_combined_dictionary_break_test
|
|
27
88
|
end
|
|
28
89
|
|
|
29
90
|
private
|
|
30
91
|
|
|
31
|
-
def
|
|
32
|
-
|
|
92
|
+
def import_conformance_files
|
|
93
|
+
CONFORMANCE_FILES.each do |test_file|
|
|
94
|
+
import_conformance_file(test_file)
|
|
95
|
+
end
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
def import_conformance_file(conformance_file)
|
|
99
|
+
source_file = conformance_source_path_for(conformance_file)
|
|
33
100
|
FileUtils.mkdir_p(File.dirname(source_file))
|
|
34
101
|
result = UnicodeFileParser.parse_standard_file(source_file).map(&:first)
|
|
35
|
-
output_file =
|
|
102
|
+
output_file = conformance_output_path_for(conformance_file)
|
|
36
103
|
FileUtils.mkdir_p(File.dirname(output_file))
|
|
37
104
|
File.write(output_file, YAML.dump(result))
|
|
38
105
|
end
|
|
39
106
|
|
|
40
|
-
def
|
|
41
|
-
|
|
107
|
+
def import_dictionary_break_tests
|
|
108
|
+
DICTIONARY_BREAK_SAMPLES.each do |locale, text_sample|
|
|
109
|
+
data = create_dictionary_break_test(locale.to_s, text_sample)
|
|
110
|
+
dump_dictionary_break_test(locale, data)
|
|
111
|
+
end
|
|
42
112
|
end
|
|
43
113
|
|
|
44
|
-
def
|
|
45
|
-
|
|
114
|
+
def import_combined_dictionary_break_test
|
|
115
|
+
text_sample = DICTIONARY_BREAK_SAMPLES.values.join(' ')
|
|
116
|
+
data = create_dictionary_break_test('en', text_sample)
|
|
117
|
+
dump_dictionary_break_test('combined', data)
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
def create_dictionary_break_test(locale, text_sample)
|
|
121
|
+
done = break_iterator.const_get(:DONE)
|
|
122
|
+
iter = break_iterator.get_word_instance(ulocale_class.new(locale))
|
|
123
|
+
iter.set_text(text_sample)
|
|
124
|
+
start = iter.first
|
|
125
|
+
segments = []
|
|
126
|
+
|
|
127
|
+
until (stop = iter.next) == done
|
|
128
|
+
segments << text_sample[start...stop]
|
|
129
|
+
start = stop
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
{
|
|
133
|
+
locale: locale,
|
|
134
|
+
text: text_sample,
|
|
135
|
+
segments: segments
|
|
136
|
+
}
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
def dump_dictionary_break_test(name, data)
|
|
140
|
+
output_file = dictionary_test_output_path_for(name)
|
|
141
|
+
FileUtils.mkdir_p(File.dirname(output_file))
|
|
142
|
+
File.write(output_file, YAML.dump(data))
|
|
143
|
+
end
|
|
144
|
+
|
|
145
|
+
def conformance_source_path_for(conformance_file)
|
|
146
|
+
requirements[:unicode].source_path_for(conformance_file)
|
|
147
|
+
end
|
|
148
|
+
|
|
149
|
+
def conformance_output_path_for(conformance_file)
|
|
150
|
+
file = underscore(File.basename(conformance_file).chomp(File.extname(conformance_file)))
|
|
46
151
|
File.join(params.fetch(:output_path), "#{file}.yml")
|
|
47
152
|
end
|
|
48
153
|
|
|
154
|
+
def dictionary_test_output_path_for(locale)
|
|
155
|
+
File.join(params.fetch(:output_path), 'dictionary_tests', "#{locale}.yml")
|
|
156
|
+
end
|
|
157
|
+
|
|
49
158
|
def underscore(str)
|
|
50
159
|
str.gsub(/(.)([A-Z])/, '\1_\2').downcase
|
|
51
160
|
end
|
|
52
161
|
|
|
162
|
+
def ulocale_class
|
|
163
|
+
@ulocale_class ||= requirements[:icu].get_class('com.ibm.icu.util.ULocale')
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
def break_iterator
|
|
167
|
+
@break_iterator ||= requirements[:icu].get_class('com.ibm.icu.text.BreakIterator')
|
|
168
|
+
end
|
|
169
|
+
|
|
53
170
|
end
|
|
54
171
|
end
|
|
55
172
|
end
|