twitter_cldr 5.2.0 → 5.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. checksums.yaml +4 -4
  2. data/Gemfile +0 -4
  3. data/Rakefile +19 -8
  4. data/lib/twitter_cldr/normalization.rb +18 -5
  5. data/lib/twitter_cldr/resources.rb +3 -1
  6. data/lib/twitter_cldr/resources/import_resolver.rb +11 -3
  7. data/lib/twitter_cldr/resources/loader.rb +22 -1
  8. data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -9
  9. data/lib/twitter_cldr/resources/postal_codes_importer.rb +19 -23
  10. data/lib/twitter_cldr/resources/segment_dictionaries_importer.rb +75 -0
  11. data/lib/twitter_cldr/resources/segment_tests_importer.rb +130 -13
  12. data/lib/twitter_cldr/segmentation.rb +25 -10
  13. data/lib/twitter_cldr/segmentation/brahmic_break_engine.rb +200 -0
  14. data/lib/twitter_cldr/segmentation/break_iterator.rb +22 -22
  15. data/lib/twitter_cldr/segmentation/burmese_break_engine.rb +83 -0
  16. data/lib/twitter_cldr/segmentation/category_table.rb +5 -1
  17. data/lib/twitter_cldr/segmentation/cj_break_engine.rb +163 -0
  18. data/lib/twitter_cldr/segmentation/cursor.rb +1 -1
  19. data/lib/twitter_cldr/segmentation/dictionary.rb +84 -0
  20. data/lib/twitter_cldr/segmentation/dictionary_break_engine.rb +34 -0
  21. data/lib/twitter_cldr/segmentation/khmer_break_engine.rb +83 -0
  22. data/lib/twitter_cldr/segmentation/korean_break_engine.rb +30 -0
  23. data/lib/twitter_cldr/segmentation/lao_break_engine.rb +85 -0
  24. data/lib/twitter_cldr/segmentation/line_iterator.rb +23 -0
  25. data/lib/twitter_cldr/segmentation/possible_word.rb +74 -0
  26. data/lib/twitter_cldr/segmentation/possible_word_list.rb +23 -0
  27. data/lib/twitter_cldr/segmentation/rule_set.rb +3 -12
  28. data/lib/twitter_cldr/segmentation/segment_iterator.rb +40 -0
  29. data/lib/twitter_cldr/segmentation/state_machine.rb +2 -8
  30. data/lib/twitter_cldr/segmentation/thai_break_engine.rb +141 -0
  31. data/lib/twitter_cldr/segmentation/unhandled_break_engine.rb +21 -0
  32. data/lib/twitter_cldr/segmentation/word_iterator.rb +170 -0
  33. data/lib/twitter_cldr/shared.rb +1 -0
  34. data/lib/twitter_cldr/shared/caser.rb +3 -3
  35. data/lib/twitter_cldr/shared/unicode_set.rb +77 -0
  36. data/lib/twitter_cldr/utils/range_set.rb +10 -1
  37. data/lib/twitter_cldr/version.rb +1 -1
  38. data/resources/collation/tailoring/km.yml +82 -0
  39. data/resources/collation/tailoring/lo.yml +4 -0
  40. data/resources/collation/tailoring/my.yml +940 -0
  41. data/resources/collation/tries/km.dump +0 -0
  42. data/resources/collation/tries/lo.dump +0 -0
  43. data/resources/collation/tries/my.dump +0 -0
  44. data/resources/locales/km/calendars.yml +373 -0
  45. data/resources/locales/km/currencies.yml +654 -0
  46. data/resources/locales/km/day_periods.yml +96 -0
  47. data/resources/locales/km/fields.yml +495 -0
  48. data/resources/locales/km/languages.yml +397 -0
  49. data/resources/locales/km/layout.yml +5 -0
  50. data/resources/locales/km/lists.yml +37 -0
  51. data/resources/locales/km/numbers.yml +402 -0
  52. data/resources/locales/km/plural_rules.yml +6 -0
  53. data/resources/locales/km/plurals.yml +12 -0
  54. data/resources/locales/km/rbnf.yml +131 -0
  55. data/resources/locales/km/territories.yml +267 -0
  56. data/resources/locales/km/timezones.yml +1471 -0
  57. data/resources/locales/km/units.yml +721 -0
  58. data/resources/locales/lo/calendars.yml +368 -0
  59. data/resources/locales/lo/currencies.yml +918 -0
  60. data/resources/locales/lo/day_periods.yml +96 -0
  61. data/resources/locales/lo/fields.yml +437 -0
  62. data/resources/locales/lo/languages.yml +529 -0
  63. data/resources/locales/lo/layout.yml +5 -0
  64. data/resources/locales/lo/lists.yml +42 -0
  65. data/resources/locales/lo/numbers.yml +476 -0
  66. data/resources/locales/lo/plural_rules.yml +7 -0
  67. data/resources/locales/lo/plurals.yml +14 -0
  68. data/resources/locales/lo/rbnf.yml +119 -0
  69. data/resources/locales/lo/territories.yml +265 -0
  70. data/resources/locales/lo/timezones.yml +1513 -0
  71. data/resources/locales/lo/units.yml +750 -0
  72. data/resources/locales/my/calendars.yml +374 -0
  73. data/resources/locales/my/currencies.yml +697 -0
  74. data/resources/locales/my/day_periods.yml +96 -0
  75. data/resources/locales/my/fields.yml +459 -0
  76. data/resources/locales/my/languages.yml +420 -0
  77. data/resources/locales/my/layout.yml +5 -0
  78. data/resources/locales/my/lists.yml +43 -0
  79. data/resources/locales/my/numbers.yml +417 -0
  80. data/resources/locales/my/plural_rules.yml +6 -0
  81. data/resources/locales/my/plurals.yml +12 -0
  82. data/resources/locales/my/rbnf.yml +145 -0
  83. data/resources/locales/my/territories.yml +265 -0
  84. data/resources/locales/my/timezones.yml +1479 -0
  85. data/resources/locales/my/units.yml +759 -0
  86. data/resources/locales/th/plurals.yml +1 -1
  87. data/resources/shared/segments/dictionaries/burmesedict.dump +0 -0
  88. data/resources/shared/segments/dictionaries/cjdict.dump +0 -0
  89. data/resources/shared/segments/dictionaries/khmerdict.dump +0 -0
  90. data/resources/shared/segments/dictionaries/laodict.dump +0 -0
  91. data/resources/shared/segments/dictionaries/thaidict.dump +0 -0
  92. data/resources/shared/segments/tests/dictionary_tests/combined.yml +1253 -0
  93. data/resources/shared/segments/tests/dictionary_tests/km.yml +204 -0
  94. data/resources/shared/segments/tests/dictionary_tests/ko.yml +171 -0
  95. data/resources/shared/segments/tests/dictionary_tests/lo.yml +236 -0
  96. data/resources/shared/segments/tests/dictionary_tests/my.yml +249 -0
  97. data/resources/shared/segments/tests/dictionary_tests/th.yml +201 -0
  98. data/resources/shared/segments/tests/dictionary_tests/zh.yml +206 -0
  99. data/resources/shared/segments/tests/line_break_test.yml +68 -68
  100. data/resources/shared/segments/tests/sentence_break_test.yml +52 -52
  101. data/resources/supported_locales.yml +3 -0
  102. data/spec/formatters/numbers/rbnf/locales/km/rbnf_test.yml +706 -0
  103. data/spec/formatters/numbers/rbnf/locales/lo/rbnf_test.yml +706 -0
  104. data/spec/formatters/numbers/rbnf/locales/my/rbnf_test.yml +706 -0
  105. data/spec/segmentation/dictionary_break_spec.rb +42 -0
  106. data/spec/segmentation/rule_set_spec.rb +3 -1
  107. data/spec/timezones/tests/km.yml +12475 -0
  108. data/spec/timezones/tests/lo.yml +12475 -0
  109. data/spec/timezones/tests/my.yml +12475 -0
  110. metadata +87 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b616d55c343da67733837c0f1549329d895ea3758011b4c5dd8c90c3c631f53a
4
- data.tar.gz: 3717867c2412adcc7a95ff1dfbe032e1754ade3cc005b4445d1f8ff644048b06
3
+ metadata.gz: 2783ec225d4f260deb8038237125dbc97b78840e951cbff630f3da502e31f41d
4
+ data.tar.gz: 751cc8931ee11db35a533584a1b9f3d8a946e9505641d33caad6a5a5dbc6e866
5
5
  SHA512:
6
- metadata.gz: f82323e912a622930f192a2ffe8b742ce1378feff847a082e04b7ea0feb5df215faa183861e31a35e69bc966c6222d0182689cdc5264de69263acbc60acce8ff
7
- data.tar.gz: dc11c5d5e3ab6cc0f2cf3a6073686f7167ac6621252a0b6bf4c7f0b1eb53ae7134507aa0d66e5c0c57e7abc9b689a6438032b4982abe766b9c8550b6df0139b8
6
+ metadata.gz: 7cdc1ec2718ac86b0645813fc0bdfe13b1c2bee075ca90e13231a3d566d915e7ef9bcdf6d69f8d0a7cbcdaf6868eae5c27bfb4b32b52efbd7dcf1adabc20c39d
7
+ data.tar.gz: 20b7e24ec990cc00fb77d60d62c75247108748629c5478a0a315d96e04a76c443f8abc2a4df21b7c4093ad7cc46ee81c4276ca15f97c8698073b7ac9bca98c65
data/Gemfile CHANGED
@@ -23,10 +23,6 @@ group :development do
23
23
  gem 'ruby-cldr', github: 'camertron/ruby-cldr', branch: 'mapzones' # 'svenfuchs/ruby-cldr'
24
24
  gem 'i18n'
25
25
  gem 'cldr-plurals', '~> 1.0'
26
-
27
- gem 'rest-client', '~> 1.8'
28
-
29
- gem 'parallel'
30
26
  end
31
27
 
32
28
  group :test do
data/Rakefile CHANGED
@@ -57,16 +57,22 @@ task :update do
57
57
  end
58
58
 
59
59
  task :add_locale, :locale do |_, args|
60
+ locales = [args[:locale]] + args.extras
61
+
60
62
  File.write(
61
63
  TwitterCldr::SUPPORTED_LOCALES_FILE,
62
64
  YAML.dump(
63
- (TwitterCldr::SUPPORTED_LOCALES + [args[:locale]]).map(&:to_sym).uniq.sort
65
+ (TwitterCldr::SUPPORTED_LOCALES + locales).map(&:to_sym).uniq.sort
64
66
  )
65
67
  )
66
68
 
67
69
  klasses = TwitterCldr::Resources.locale_based_importer_classes_for_ruby_engine
68
- instances = klasses.map { |klass| klass.new(locales: [args[:locale]]) }
69
- TwitterCldr::Resources::ImportResolver.new(instances).import
70
+ instances = klasses.map { |klass| klass.new(locales: locales) }
71
+ resolver = TwitterCldr::Resources::ImportResolver.new(
72
+ instances, allow_missing_dependencies: true
73
+ )
74
+
75
+ resolver.import
70
76
  end
71
77
 
72
78
  # add_locale and update_locale do the same thing
@@ -160,6 +166,16 @@ namespace :update do
160
166
  TwitterCldr::Resources::SegmentRulesImporter.new.import
161
167
  end
162
168
 
169
+ desc 'Import segmentation dictionaries'
170
+ task :segment_dictionaries do
171
+ TwitterCldr::Resources::SegmentDictionariesImporter.new.import
172
+ end
173
+
174
+ desc 'Import segment tests'
175
+ task :segment_tests do
176
+ TwitterCldr::Resources::SegmentTestsImporter.new.import
177
+ end
178
+
163
179
  desc 'Import (generate) bidi tests (should be executed using JRuby 1.7 in 1.9 mode)'
164
180
  task :bidi_tests do
165
181
  TwitterCldr::Resources::BidiTestImporter.new.import
@@ -200,11 +216,6 @@ namespace :update do
200
216
  TwitterCldr::Resources::TransformTestsImporter.new.import
201
217
  end
202
218
 
203
- desc 'Import segment tests'
204
- task :segment_tests do
205
- TwitterCldr::Resources::SegmentTestsImporter.new.import
206
- end
207
-
208
219
  desc 'Import hyphenation dictionaries'
209
220
  task :hyphenation_dictionaries do
210
221
  TwitterCldr::Resources::HyphenationImporter.new.import
@@ -14,12 +14,25 @@ module TwitterCldr
14
14
  class << self
15
15
 
16
16
  def normalize(string, options = {})
17
- form = options.fetch(:using, DEFAULT_NORMALIZER).to_s.downcase.to_sym
17
+ validate_form(form = extract_form_from(options))
18
+ Eprun.normalize(string, form)
19
+ end
20
+
21
+ def normalized?(string, options = {})
22
+ validate_form(form = extract_form_from(options))
23
+ Eprun.normalized?(string, form)
24
+ end
25
+
26
+ private
27
+
28
+ def extract_form_from(options)
29
+ options.fetch(:using, DEFAULT_NORMALIZER).to_s.downcase.to_sym
30
+ end
18
31
 
19
- if VALID_NORMALIZERS.include?(form)
20
- Eprun.normalize(string, form)
21
- else
22
- raise ArgumentError.new("#{form.inspect} is not a valid normalizer (valid normalizers are #{VALID_NORMALIZERS.join(', ')})")
32
+ def validate_form(form)
33
+ unless VALID_NORMALIZERS.include?(form)
34
+ raise ArgumentError.new("#{form.inspect} is not a valid normalizer "\
35
+ "(valid normalizers are #{VALID_NORMALIZERS.join(', ')})")
23
36
  end
24
37
  end
25
38
 
@@ -22,12 +22,13 @@ module TwitterCldr
22
22
  autoload :NumberFormatsImporter, 'twitter_cldr/resources/number_formats_importer'
23
23
  autoload :PostalCodesImporter, 'twitter_cldr/resources/postal_codes_importer'
24
24
  autoload :Properties, 'twitter_cldr/resources/properties'
25
+ autoload :SegmentDictionariesImporter, 'twitter_cldr/resources/segment_dictionaries_importer'
25
26
  autoload :SegmentRulesImporter, 'twitter_cldr/resources/segment_rules_importer'
27
+ autoload :SegmentTestsImporter, 'twitter_cldr/resources/segment_tests_importer'
26
28
  autoload :RbnfTestImporter, 'twitter_cldr/resources/rbnf_test_importer'
27
29
  autoload :ReadmeRenderer, 'twitter_cldr/resources/readme_renderer'
28
30
  autoload :RegexpAstGenerator, 'twitter_cldr/resources/regexp_ast_generator'
29
31
  autoload :Requirements, 'twitter_cldr/resources/requirements'
30
- autoload :SegmentTestsImporter, 'twitter_cldr/resources/segment_tests_importer'
31
32
  autoload :TailoringImporter, 'twitter_cldr/resources/tailoring_importer'
32
33
  autoload :TerritoriesImporter, 'twitter_cldr/resources/territories_importer'
33
34
  autoload :TimezonesImporter, 'twitter_cldr/resources/timezones_importer'
@@ -60,6 +61,7 @@ module TwitterCldr
60
61
  NumberFormatsImporter,
61
62
  PostalCodesImporter,
62
63
  RbnfTestImporter,
64
+ SegmentDictionariesImporter,
63
65
  SegmentRulesImporter,
64
66
  SegmentTestsImporter,
65
67
  TailoringImporter,
@@ -5,10 +5,11 @@ module TwitterCldr
5
5
  class ImportResolver
6
6
  include TSort
7
7
 
8
- attr_reader :importers
8
+ attr_reader :importers, :options
9
9
 
10
- def initialize(importers = Resources.importer_classes_for_ruby_engine)
10
+ def initialize(importers = Resources.importer_classes_for_ruby_engine, options = {})
11
11
  @importers = importers
12
+ @options = options
12
13
  end
13
14
 
14
15
  def import
@@ -28,7 +29,12 @@ module TwitterCldr
28
29
 
29
30
  def tsort_each_child(instance, &block)
30
31
  deps_for(instance).map do |dep_class|
31
- yield instances.find { |ins| ins.class == dep_class }
32
+ dep = instances.find { |ins| ins.class == dep_class }
33
+ yield dep if dep
34
+
35
+ unless options[:allow_missing_dependencies]
36
+ raise "Could not find dependency #{dep_class.name}"
37
+ end
32
38
  end
33
39
  end
34
40
 
@@ -39,6 +45,8 @@ module TwitterCldr
39
45
  end
40
46
 
41
47
  def check_unmet_instance_deps(instance)
48
+ return if options[:allow_missing_dependencies]
49
+
42
50
  unmet_deps = unmet_deps_for(instance)
43
51
 
44
52
  unless unmet_deps.empty?
@@ -10,6 +10,8 @@ module TwitterCldr
10
10
 
11
11
  class Loader
12
12
 
13
+ VALID_EXTS = %w(.yml .dump).freeze
14
+
13
15
  def get_resource(*path)
14
16
  resources_cache[resource_file_path(path)]
15
17
  end
@@ -75,7 +77,7 @@ module TwitterCldr
75
77
 
76
78
  def resource_file_path(path)
77
79
  file = File.join(*path.map(&:to_s))
78
- file << '.yml' unless file.end_with?('.yml')
80
+ file << '.yml' unless VALID_EXTS.include?(File.extname(file))
79
81
  file
80
82
  end
81
83
 
@@ -92,6 +94,17 @@ module TwitterCldr
92
94
  end
93
95
 
94
96
  def load_resource(path, merge_custom = true)
97
+ case File.extname(path)
98
+ when '.yml'
99
+ load_yaml_resource(path, merge_custom)
100
+ when '.dump'
101
+ load_marshalled_resource(path, merge_custom)
102
+ else
103
+ load_raw_resource(path, merge_custom)
104
+ end
105
+ end
106
+
107
+ def load_yaml_resource(path, merge_custom = true)
95
108
  base = YAML.load(read_resource_file(path))
96
109
  custom_path = File.join("custom", path)
97
110
 
@@ -102,6 +115,14 @@ module TwitterCldr
102
115
  base
103
116
  end
104
117
 
118
+ def load_marshalled_resource(path, _merge_custom = :unused)
119
+ Marshal.load(read_resource_file(path))
120
+ end
121
+
122
+ def load_raw_resource(path, _merge_custom = :unused)
123
+ read_resource_file(path)
124
+ end
125
+
105
126
  def custom_resource_exists?(custom_path)
106
127
  File.exist?(
107
128
  File.join(TwitterCldr::RESOURCES_DIR, custom_path)
@@ -56,13 +56,6 @@ module TwitterCldr
56
56
  params.fetch(:output_path)
57
57
  end
58
58
 
59
- def move_segments_root_file
60
- old_file_path = File.join(output_path, *%w(shared segments_root.yml))
61
- new_file_path = File.join(output_path, *%w(shared segments segments_root.yml))
62
- FileUtils.mkdir_p(File.dirname(new_file_path))
63
- FileUtils.move(old_file_path, new_file_path)
64
- end
65
-
66
59
  def import_components
67
60
  locales = Set.new
68
61
 
@@ -100,8 +93,6 @@ module TwitterCldr
100
93
  Cldr::Export.export(export_args) do |component, locale, path|
101
94
  deep_symbolize(path)
102
95
  end
103
-
104
- move_segments_root_file
105
96
  end
106
97
 
107
98
  def components_for(locale)
@@ -3,8 +3,8 @@
3
3
  # Copyright 2012 Twitter, Inc
4
4
  # http://www.apache.org/licenses/LICENSE-2.0
5
5
 
6
- require 'rest-client'
7
6
  require 'json'
7
+ require 'open-uri'
8
8
  require 'set'
9
9
  require 'yaml'
10
10
 
@@ -21,38 +21,36 @@ module TwitterCldr
21
21
  private
22
22
 
23
23
  def execute
24
- File.open(File.join(output_path, 'postal_codes.yml'), 'w') do |output|
25
- output.write(YAML.dump(load))
26
- end
24
+ data = YAML.dump(fetch_data)
25
+ File.write(File.join(output_path, 'postal_codes.yml'), data)
26
+ puts
27
27
  end
28
28
 
29
29
  def output_path
30
30
  params.fetch(:output_path)
31
31
  end
32
32
 
33
- def load
33
+ def fetch_data
34
34
  territories = Set.new
35
35
 
36
36
  each_territory.each_with_object({}) do |territory, ret|
37
- next unless regex = get_regex_for(territory)
38
-
39
- ret[territory] = {
40
- regex: Regexp.compile(regex),
41
- ast: TwitterCldr::Utils::RegexpAst.dump(
42
- RegexpAstGenerator.generate(regex)
43
- )
44
- }
37
+ if regex = get_regex_for(territory)
38
+ ret[territory] = {
39
+ regex: Regexp.compile(regex),
40
+ ast: TwitterCldr::Utils::RegexpAst.dump(
41
+ RegexpAstGenerator.generate(regex)
42
+ )
43
+ }
44
+ end
45
45
 
46
46
  territories.add(territory)
47
47
  STDOUT.write("\rImported postal codes for #{territory}, #{territories.size} of #{territory_count} total")
48
48
  end
49
-
50
- puts
51
49
  end
52
50
 
53
51
  def get_regex_for(territory)
54
- result = RestClient.get("#{BASE_URL}#{territory.to_s.upcase}")
55
- data = JSON.parse(result.body)
52
+ result = URI.open("#{BASE_URL}#{territory.to_s.upcase}").read
53
+ data = JSON.parse(result)
56
54
  data['zip']
57
55
  end
58
56
 
@@ -61,12 +59,10 @@ module TwitterCldr
61
59
  end
62
60
 
63
61
  def each_territory
64
- if block_given?
65
- TwitterCldr::Shared::Territories.all.each_pair do |territory, _|
66
- yield territory
67
- end
68
- else
69
- to_enum(__method__)
62
+ return to_enum(__method__) unless block_given?
63
+
64
+ TwitterCldr::Shared::Territories.all.each_pair do |territory, _|
65
+ yield territory
70
66
  end
71
67
  end
72
68
 
@@ -0,0 +1,75 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'fileutils'
7
+ require 'open-uri'
8
+
9
+ module TwitterCldr
10
+ module Resources
11
+ class SegmentDictionariesImporter < Importer
12
+
13
+ URL_TEMPLATE = 'https://raw.githubusercontent.com/unicode-org/icu/%{icu_version}/%{path}'
14
+
15
+ DICTIONARY_FILES = [
16
+ 'icu4c/source/data/brkitr/dictionaries/burmesedict.txt',
17
+ 'icu4c/source/data/brkitr/dictionaries/cjdict.txt',
18
+ 'icu4c/source/data/brkitr/dictionaries/khmerdict.txt',
19
+ 'icu4c/source/data/brkitr/dictionaries/laodict.txt',
20
+ 'icu4c/source/data/brkitr/dictionaries/thaidict.txt'
21
+ ]
22
+
23
+ output_path File.join(*%w(shared segments dictionaries))
24
+ ruby_engine :mri
25
+
26
+ def execute
27
+ FileUtils.mkdir_p(output_path)
28
+
29
+ DICTIONARY_FILES.each do |test_file|
30
+ import_dictionary_file(test_file)
31
+ end
32
+ end
33
+
34
+ private
35
+
36
+ def import_dictionary_file(dictionary_file)
37
+ source_url = url_for(dictionary_file)
38
+ source = open(source_url).read
39
+ lines = source.split("\n")
40
+ trie = TwitterCldr::Utils::Trie.new
41
+ space_regexp = TwitterCldr::Shared::UnicodeRegex.compile('\A[[:Z:][:C:]]+').to_regexp
42
+
43
+ lines.each do |line|
44
+ line.sub!(space_regexp, '')
45
+ next if line.start_with?('#')
46
+
47
+ characters, frequency = line.split("\t")
48
+ frequency = frequency ? frequency.to_i : 0
49
+
50
+ trie.add(characters.unpack('U*'), frequency)
51
+ end
52
+
53
+ output_path = output_path_for(dictionary_file)
54
+ File.write(output_path, Marshal.dump(trie))
55
+ end
56
+
57
+ def url_for(dictionary_file)
58
+ URL_TEMPLATE % {
59
+ icu_version: "release-#{Versions.icu_version.gsub('.', '-')}",
60
+ path: dictionary_file
61
+ }
62
+ end
63
+
64
+ def output_path_for(dictionary_file)
65
+ file = File.basename(dictionary_file).chomp(File.extname(dictionary_file))
66
+ File.join(output_path, "#{file}.dump")
67
+ end
68
+
69
+ def output_path
70
+ params.fetch(:output_path)
71
+ end
72
+
73
+ end
74
+ end
75
+ end
@@ -9,47 +9,164 @@ module TwitterCldr
9
9
  module Resources
10
10
  class SegmentTestsImporter < Importer
11
11
 
12
- TEST_FILES = [
12
+ CONFORMANCE_FILES = [
13
13
  'ucd/auxiliary/WordBreakTest.txt',
14
14
  'ucd/auxiliary/SentenceBreakTest.txt',
15
15
  'ucd/auxiliary/GraphemeBreakTest.txt',
16
16
  'ucd/auxiliary/LineBreakTest.txt'
17
17
  ]
18
18
 
19
- requirement :unicode, Versions.unicode_version, TEST_FILES
19
+ DICTIONARY_BREAK_SAMPLES = {
20
+ # Chinese
21
+ zh: '無畏號航空母艦是一艘隸屬於美國海軍的航空母艦,為艾塞克斯級航空母艦的三號艦。'\
22
+ '無畏號於1941年開始建造,1943年下水服役,開始參與太平洋戰爭。戰後無畏號退役封存,'\
23
+ '在韓戰後開始進行SCB-27C改建,又在期間重編為攻擊航母,於1954年在大西洋艦隊重新服役。'\
24
+ '稍後無畏號又進行SCB-125現代化改建,增設斜角飛行甲板。1962年無畏號重編為反潛航母,'\
25
+ '舷號改為CVS-11,繼續留在大西洋及地中海執勤。稍後無畏號參與美國的太空計畫,'\
26
+ '分別擔任水星-宇宙神7號及雙子座3號的救援船。1966年至1969年,無畏號曾三次前往西太平洋,'\
27
+ '參與越戰。無畏號在1974年退役,並一度預備出售拆解;但在民間組織努力下,'\
28
+ '海軍在1981年將無畏號捐贈到紐約作博物館艦。1986年,無畏號獲評為美國國家歷史地標。',
29
+
30
+ ko: '파일은 이용자가 공용 또는 위키백과 한국어판에 업로드하여 라이선스에 따라 사용 가능한 형태로 제공됩니다. '\
31
+ '업로드된 파일은 간단한 조작으로 페이지에 삽입할 수 있습니다. 업로드는 신규 이용자를 제외한 등록 이용자라면 '\
32
+ '가능합니다. 파일을 업로드하기 전에 다음 문단의 업로드를 할 수 없는 파일을 반드시 읽어 보시기 바랍니다. '\
33
+ '공용 이용 방법 및 업로드에 대해서는 Commons:초보자 길라잡이/업로드를 읽어 보시기 바랍니다. 업로드하는 '\
34
+ '페이지는 위키백과:파일 올리기를 참조하십시오. 파일의 라이선스가 삽입되는 위키백과의 문서와는 별도로 '\
35
+ '개별적으로 설정해야 합니다. 파일을 업로드할 때 적절한 라이선스를 선택하고 반드시 표시하십시오.',
36
+
37
+ # Thai
38
+ th: 'ธงไชย แมคอินไตย์ ชื่อเล่น เบิร์ด (เกิด 8 ธันวาคม พ.ศ. 2501) เป็นนักร้อง นักแสดงชาวไทย '\
39
+ 'ได้รับขนานนามว่าเป็น "ซูเปอร์สตาร์เมืองไทย" โดยคนไทยรู้จักกันดี เรียกกันว่า : พี่เบิร์ด '\
40
+ 'แรกเข้าวงการบันเทิงเป็นนักแสดงสมทบ ต่อมาได้รับบทพระเอก โดยภาพยนตร์ที่สร้างชื่อเสียงให้กับเขาที่สุดเรื่อง '\
41
+ 'ด้วยรักคือรัก ส่วนละครที่สร้างชื่อเสียงที่สุดของเขาคือบท "โกโบริ" ในละครคู่กรรม '\
42
+ 'ด้านวงการเพลงซึ่งเป็นอาชีพหลักเขาเริ่มต้นจากการประกวดร้องเพลงของสยามกลการ '\
43
+ 'ต่อมาเป็นนักร้องในสังกัดบริษัท จีเอ็มเอ็ม แกรมมี่ จำกัด (มหาชน) ซึ่งประสบความสำเร็จสูงสุดของประเทศไทย'\
44
+ 'มียอดจำหน่ายอยู่ในระดับแนวหน้าของทวีปเอเชียยอดรวมกว่า 25 ล้านชุด',
45
+
46
+ # Khmer
47
+ km: 'វីគីភីឌា (អង់គ្លេស ៖ Wikipedia) ជាសព្វវចនាធិប្បាយសេរីច្រើនភាសានៅលើអ៊ីនធឺណិត '\
48
+ 'ដែលមនុស្សគ្រប់គ្នាអាចអាននិងធ្វើឱ្យមាតិកាទាន់សម័យបន្ថែមទៀត '\
49
+ 'ធ្វើឱ្យវីគីភីឌាសព្វវចនាធិប្បាយបានក្លាយទៅជាការកែប្រែ '\
50
+ 'ការប្រមូលនិងការអភិរក្សរាប់រយរាប់ពាន់នាក់នៃអ្នកស្ម័គ្រចិត្តនៅជុំវិញពិភពលោក '\
51
+ 'តាមរយៈកម្មវិធីដែលគេហៅថាមេឌាវិគី ។ វីគីភីឌាចាប់ផ្តើមនៅថ្ងៃទី ១៥ មករា ឆ្នាំ ២០០១ '\
52
+ 'ដោយចាប់ផ្តើមគម្រោងពីឈ្មោះសព្វវចនាធិប្បាយណូ៉ភីឌាដែលសរសេរដោយហ្ស៊ីម្ម៊ី '\
53
+ 'វេល្ស និងឡែរ្រី សែងក័រ ។ នៅបច្ចុប្បន្ននេះ វីគីភីឌាមានទាំង់អស់ ២៩៣ ភាសា[៤] ដោយវីគីភីឌាភាសាខ្មែរមាន '\
54
+ '៧៨៩៨ អត្ថបទ ។ មានវីគីភីឌាច្រើនជាង ៥០ ភាសាដែលមានអត្ថបទច្រើនជាង ១០០.០០០ អត្ថបទ ។ '\
55
+ 'វីគីភីឌាភាសាអាល្លឺម៉ងត្រូវបានគេចែកចាយនៅក្នុងទ្រង់ទ្រាយឌីវីឌី-រ៉ូម ។',
56
+
57
+ # Lao
58
+ lo: 'ວິກິພີເດຍ (ອັງກິດ: Wikipedia) ເປັນສາລະນຸກົມເນື້ອຫາເສລີຫຼາຍພາສາໃນເວັບໄຊ້ '\
59
+ 'ເຊິ່ງໄດ້ຮັບການສະໜັບສະໜຸນຈາກມູນລະນິທິວິກິພີເດຍ ອົງກອນບໍ່ສະແຫວງຫາຜົນກຳໄລ ເນື້ອຫາກວ່າ 35 ລ້ານບົດຄວາມ '\
60
+ '(ສະເພາະວິກິພີເດຍພາສາອັງກິດມີເນື້ອຫາກວ່າ 4.9 ລ້ານບົດຄວາມ) ເກີດຂຶ້ນຈາກການຮ່ວມຂຽນຂອງອາສາສະໝັກທົ່ວໂລກ '\
61
+ 'ທຸກຄົນທີ່ສາມາດເຂົ້າເຖິງວິກິພີເດຍສາມາດຮ່ວມແກ້ໄຂເກືອບທຸກບົດຄວາມໄດ້ຢ່າງເສລີ ໂດຍມີຜູ້ຂຽນປະມານ 100,000ຄົນ '\
62
+ 'ຈົນເຖິງເດືອນເມສາ ຄ.ສ. 2013 ວິກິພີເດຍມີ 286 ຮຸ່ນພາສາ ແລະ '\
63
+ 'ໄດ້ກາຍມາເປັນງານອ້າງອິງທົ່ວໄປທີ່ໃກຍ່ທີ່ສຸດແລະໄດ້ຮັບຄວາມນິຍົມຫຼາຍທີ່ສຸດຢູ່ອິນເຕີເນັດ ຈົນຖືກຈັດເປັນເວັບໄຊ້ ອັນດັບທີ 6 '\
64
+ 'ທີ່ມີຜູ້ເຂົ້າເບິ່ງຫຼາຍທີ່ສຸດໃນໂລກ ຕາມການຈັດອັນດັບຂອງອາເລັກຊ້າ ດ້ວຍຈຳນວນຜູ້ອ່ານກວ່າ 365 ລ້ານຄົນ '\
65
+ 'ມີການປະເມີນວ່າວິກິພີເດຍມີການຄົ້ນຫາຂໍ້ມູນໃນວິກິພີເດຍກວ່າ 2,700 ລ້ານເທື່ອຕໍ່ເດືອນໃນສະຫະລັດ ອາເມຣິກາ',
66
+
67
+ # Burmese
68
+ my: 'ကိန်းဆိုသည်မှာ ရေတွက်ရန်နှင့် တိုင်းတာရန် အတွက် အသုံးပြုသော သင်္ချာဆိုင်ရာ အရာဝတ္ထုတစ်ခု '\
69
+ 'ဖြစ်သည်။ သင်္ချာပညာတွင် ကိန်းဂဏန်းများ၏ အဓိပ္ပာယ်ဖွင့်ဆိုချက်ကို တဖြည်းဖြည်း ချဲ့ကားလာခဲ့သဖြင့် '\
70
+ 'နှစ်ပေါင်းများစွာ ကြာသောအခါတွင် သုည၊ အနှုတ်ကိန်းများ (negative numbers)၊ ရာရှင်နယ်ကိန်း '\
71
+ '(rational number) ခေါ် အပိုင်းကိန်းများ၊ အီရာရှင်နယ်ကိန်း (irrational number) ခေါ် '\
72
+ 'အပိုင်းကိန်းမဟုတ်သောကိန်းများ နှင့် ကွန်ပလက်စ်ကိန်း (complex number) ခေါ် ကိန်းရှုပ်များ စသည်ဖြင့် '\
73
+ 'ပါဝင်လာကြသည်။ သင်္ချာဆိုင်ရာ တွက်ချက်မှုများ (mathematical operations) တွင် ဂဏန်းတစ်ခု '\
74
+ 'သို့မဟုတ် တစ်ခုထက်ပိုသော ဂဏန်းများကို အဝင်ကိန်းအဖြစ် လက်ခံကြပြီး ဂဏန်းတစ်ခုကို အထွက်ကိန်း '\
75
+ 'အဖြစ် ပြန်ထုတ်ပေးသည်။ ယူနရီ တွက်ချက်မှု (unary operation) ခေါ် တစ်လုံးသွင်းတွက်ချက်မှုတွင် '\
76
+ 'ဂဏန်းတစ်ခုကို အဝင်ကိန်း အဖြစ် လက်ခံပြီး ဂဏန်းတစ်ခုကို အထွက်ကိန်း အဖြစ် ထုတ်ပေးသည်။ '
77
+ }.freeze
78
+
79
+ requirement :unicode, Versions.unicode_version, CONFORMANCE_FILES
80
+ requirement :icu, Versions.icu_version
20
81
  output_path 'shared/segments/tests'
21
- ruby_engine :mri
82
+ ruby_engine :jruby
22
83
 
23
84
  def execute
24
- TEST_FILES.each do |test_file|
25
- import_test_file(test_file)
26
- end
85
+ import_conformance_files
86
+ import_dictionary_break_tests
87
+ import_combined_dictionary_break_test
27
88
  end
28
89
 
29
90
  private
30
91
 
31
- def import_test_file(test_file)
32
- source_file = source_path_for(test_file)
92
+ def import_conformance_files
93
+ CONFORMANCE_FILES.each do |test_file|
94
+ import_conformance_file(test_file)
95
+ end
96
+ end
97
+
98
+ def import_conformance_file(conformance_file)
99
+ source_file = conformance_source_path_for(conformance_file)
33
100
  FileUtils.mkdir_p(File.dirname(source_file))
34
101
  result = UnicodeFileParser.parse_standard_file(source_file).map(&:first)
35
- output_file = output_path_for(test_file)
102
+ output_file = conformance_output_path_for(conformance_file)
36
103
  FileUtils.mkdir_p(File.dirname(output_file))
37
104
  File.write(output_file, YAML.dump(result))
38
105
  end
39
106
 
40
- def source_path_for(test_file)
41
- requirements[:unicode].source_path_for(test_file)
107
+ def import_dictionary_break_tests
108
+ DICTIONARY_BREAK_SAMPLES.each do |locale, text_sample|
109
+ data = create_dictionary_break_test(locale.to_s, text_sample)
110
+ dump_dictionary_break_test(locale, data)
111
+ end
42
112
  end
43
113
 
44
- def output_path_for(test_file)
45
- file = underscore(File.basename(test_file).chomp(File.extname(test_file)))
114
+ def import_combined_dictionary_break_test
115
+ text_sample = DICTIONARY_BREAK_SAMPLES.values.join(' ')
116
+ data = create_dictionary_break_test('en', text_sample)
117
+ dump_dictionary_break_test('combined', data)
118
+ end
119
+
120
+ def create_dictionary_break_test(locale, text_sample)
121
+ done = break_iterator.const_get(:DONE)
122
+ iter = break_iterator.get_word_instance(ulocale_class.new(locale))
123
+ iter.set_text(text_sample)
124
+ start = iter.first
125
+ segments = []
126
+
127
+ until (stop = iter.next) == done
128
+ segments << text_sample[start...stop]
129
+ start = stop
130
+ end
131
+
132
+ {
133
+ locale: locale,
134
+ text: text_sample,
135
+ segments: segments
136
+ }
137
+ end
138
+
139
+ def dump_dictionary_break_test(name, data)
140
+ output_file = dictionary_test_output_path_for(name)
141
+ FileUtils.mkdir_p(File.dirname(output_file))
142
+ File.write(output_file, YAML.dump(data))
143
+ end
144
+
145
+ def conformance_source_path_for(conformance_file)
146
+ requirements[:unicode].source_path_for(conformance_file)
147
+ end
148
+
149
+ def conformance_output_path_for(conformance_file)
150
+ file = underscore(File.basename(conformance_file).chomp(File.extname(conformance_file)))
46
151
  File.join(params.fetch(:output_path), "#{file}.yml")
47
152
  end
48
153
 
154
+ def dictionary_test_output_path_for(locale)
155
+ File.join(params.fetch(:output_path), 'dictionary_tests', "#{locale}.yml")
156
+ end
157
+
49
158
  def underscore(str)
50
159
  str.gsub(/(.)([A-Z])/, '\1_\2').downcase
51
160
  end
52
161
 
162
+ def ulocale_class
163
+ @ulocale_class ||= requirements[:icu].get_class('com.ibm.icu.util.ULocale')
164
+ end
165
+
166
+ def break_iterator
167
+ @break_iterator ||= requirements[:icu].get_class('com.ibm.icu.text.BreakIterator')
168
+ end
169
+
53
170
  end
54
171
  end
55
172
  end