twitter_cldr 5.2.0 → 5.3.0

Files changed (110)
  1. checksums.yaml +4 -4
  2. data/Gemfile +0 -4
  3. data/Rakefile +19 -8
  4. data/lib/twitter_cldr/normalization.rb +18 -5
  5. data/lib/twitter_cldr/resources.rb +3 -1
  6. data/lib/twitter_cldr/resources/import_resolver.rb +11 -3
  7. data/lib/twitter_cldr/resources/loader.rb +22 -1
  8. data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -9
  9. data/lib/twitter_cldr/resources/postal_codes_importer.rb +19 -23
  10. data/lib/twitter_cldr/resources/segment_dictionaries_importer.rb +75 -0
  11. data/lib/twitter_cldr/resources/segment_tests_importer.rb +130 -13
  12. data/lib/twitter_cldr/segmentation.rb +25 -10
  13. data/lib/twitter_cldr/segmentation/brahmic_break_engine.rb +200 -0
  14. data/lib/twitter_cldr/segmentation/break_iterator.rb +22 -22
  15. data/lib/twitter_cldr/segmentation/burmese_break_engine.rb +83 -0
  16. data/lib/twitter_cldr/segmentation/category_table.rb +5 -1
  17. data/lib/twitter_cldr/segmentation/cj_break_engine.rb +163 -0
  18. data/lib/twitter_cldr/segmentation/cursor.rb +1 -1
  19. data/lib/twitter_cldr/segmentation/dictionary.rb +84 -0
  20. data/lib/twitter_cldr/segmentation/dictionary_break_engine.rb +34 -0
  21. data/lib/twitter_cldr/segmentation/khmer_break_engine.rb +83 -0
  22. data/lib/twitter_cldr/segmentation/korean_break_engine.rb +30 -0
  23. data/lib/twitter_cldr/segmentation/lao_break_engine.rb +85 -0
  24. data/lib/twitter_cldr/segmentation/line_iterator.rb +23 -0
  25. data/lib/twitter_cldr/segmentation/possible_word.rb +74 -0
  26. data/lib/twitter_cldr/segmentation/possible_word_list.rb +23 -0
  27. data/lib/twitter_cldr/segmentation/rule_set.rb +3 -12
  28. data/lib/twitter_cldr/segmentation/segment_iterator.rb +40 -0
  29. data/lib/twitter_cldr/segmentation/state_machine.rb +2 -8
  30. data/lib/twitter_cldr/segmentation/thai_break_engine.rb +141 -0
  31. data/lib/twitter_cldr/segmentation/unhandled_break_engine.rb +21 -0
  32. data/lib/twitter_cldr/segmentation/word_iterator.rb +170 -0
  33. data/lib/twitter_cldr/shared.rb +1 -0
  34. data/lib/twitter_cldr/shared/caser.rb +3 -3
  35. data/lib/twitter_cldr/shared/unicode_set.rb +77 -0
  36. data/lib/twitter_cldr/utils/range_set.rb +10 -1
  37. data/lib/twitter_cldr/version.rb +1 -1
  38. data/resources/collation/tailoring/km.yml +82 -0
  39. data/resources/collation/tailoring/lo.yml +4 -0
  40. data/resources/collation/tailoring/my.yml +940 -0
  41. data/resources/collation/tries/km.dump +0 -0
  42. data/resources/collation/tries/lo.dump +0 -0
  43. data/resources/collation/tries/my.dump +0 -0
  44. data/resources/locales/km/calendars.yml +373 -0
  45. data/resources/locales/km/currencies.yml +654 -0
  46. data/resources/locales/km/day_periods.yml +96 -0
  47. data/resources/locales/km/fields.yml +495 -0
  48. data/resources/locales/km/languages.yml +397 -0
  49. data/resources/locales/km/layout.yml +5 -0
  50. data/resources/locales/km/lists.yml +37 -0
  51. data/resources/locales/km/numbers.yml +402 -0
  52. data/resources/locales/km/plural_rules.yml +6 -0
  53. data/resources/locales/km/plurals.yml +12 -0
  54. data/resources/locales/km/rbnf.yml +131 -0
  55. data/resources/locales/km/territories.yml +267 -0
  56. data/resources/locales/km/timezones.yml +1471 -0
  57. data/resources/locales/km/units.yml +721 -0
  58. data/resources/locales/lo/calendars.yml +368 -0
  59. data/resources/locales/lo/currencies.yml +918 -0
  60. data/resources/locales/lo/day_periods.yml +96 -0
  61. data/resources/locales/lo/fields.yml +437 -0
  62. data/resources/locales/lo/languages.yml +529 -0
  63. data/resources/locales/lo/layout.yml +5 -0
  64. data/resources/locales/lo/lists.yml +42 -0
  65. data/resources/locales/lo/numbers.yml +476 -0
  66. data/resources/locales/lo/plural_rules.yml +7 -0
  67. data/resources/locales/lo/plurals.yml +14 -0
  68. data/resources/locales/lo/rbnf.yml +119 -0
  69. data/resources/locales/lo/territories.yml +265 -0
  70. data/resources/locales/lo/timezones.yml +1513 -0
  71. data/resources/locales/lo/units.yml +750 -0
  72. data/resources/locales/my/calendars.yml +374 -0
  73. data/resources/locales/my/currencies.yml +697 -0
  74. data/resources/locales/my/day_periods.yml +96 -0
  75. data/resources/locales/my/fields.yml +459 -0
  76. data/resources/locales/my/languages.yml +420 -0
  77. data/resources/locales/my/layout.yml +5 -0
  78. data/resources/locales/my/lists.yml +43 -0
  79. data/resources/locales/my/numbers.yml +417 -0
  80. data/resources/locales/my/plural_rules.yml +6 -0
  81. data/resources/locales/my/plurals.yml +12 -0
  82. data/resources/locales/my/rbnf.yml +145 -0
  83. data/resources/locales/my/territories.yml +265 -0
  84. data/resources/locales/my/timezones.yml +1479 -0
  85. data/resources/locales/my/units.yml +759 -0
  86. data/resources/locales/th/plurals.yml +1 -1
  87. data/resources/shared/segments/dictionaries/burmesedict.dump +0 -0
  88. data/resources/shared/segments/dictionaries/cjdict.dump +0 -0
  89. data/resources/shared/segments/dictionaries/khmerdict.dump +0 -0
  90. data/resources/shared/segments/dictionaries/laodict.dump +0 -0
  91. data/resources/shared/segments/dictionaries/thaidict.dump +0 -0
  92. data/resources/shared/segments/tests/dictionary_tests/combined.yml +1253 -0
  93. data/resources/shared/segments/tests/dictionary_tests/km.yml +204 -0
  94. data/resources/shared/segments/tests/dictionary_tests/ko.yml +171 -0
  95. data/resources/shared/segments/tests/dictionary_tests/lo.yml +236 -0
  96. data/resources/shared/segments/tests/dictionary_tests/my.yml +249 -0
  97. data/resources/shared/segments/tests/dictionary_tests/th.yml +201 -0
  98. data/resources/shared/segments/tests/dictionary_tests/zh.yml +206 -0
  99. data/resources/shared/segments/tests/line_break_test.yml +68 -68
  100. data/resources/shared/segments/tests/sentence_break_test.yml +52 -52
  101. data/resources/supported_locales.yml +3 -0
  102. data/spec/formatters/numbers/rbnf/locales/km/rbnf_test.yml +706 -0
  103. data/spec/formatters/numbers/rbnf/locales/lo/rbnf_test.yml +706 -0
  104. data/spec/formatters/numbers/rbnf/locales/my/rbnf_test.yml +706 -0
  105. data/spec/segmentation/dictionary_break_spec.rb +42 -0
  106. data/spec/segmentation/rule_set_spec.rb +3 -1
  107. data/spec/timezones/tests/km.yml +12475 -0
  108. data/spec/timezones/tests/lo.yml +12475 -0
  109. data/spec/timezones/tests/my.yml +12475 -0
  110. metadata +87 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: b616d55c343da67733837c0f1549329d895ea3758011b4c5dd8c90c3c631f53a
-   data.tar.gz: 3717867c2412adcc7a95ff1dfbe032e1754ade3cc005b4445d1f8ff644048b06
+   metadata.gz: 2783ec225d4f260deb8038237125dbc97b78840e951cbff630f3da502e31f41d
+   data.tar.gz: 751cc8931ee11db35a533584a1b9f3d8a946e9505641d33caad6a5a5dbc6e866
  SHA512:
-   metadata.gz: f82323e912a622930f192a2ffe8b742ce1378feff847a082e04b7ea0feb5df215faa183861e31a35e69bc966c6222d0182689cdc5264de69263acbc60acce8ff
-   data.tar.gz: dc11c5d5e3ab6cc0f2cf3a6073686f7167ac6621252a0b6bf4c7f0b1eb53ae7134507aa0d66e5c0c57e7abc9b689a6438032b4982abe766b9c8550b6df0139b8
+   metadata.gz: 7cdc1ec2718ac86b0645813fc0bdfe13b1c2bee075ca90e13231a3d566d915e7ef9bcdf6d69f8d0a7cbcdaf6868eae5c27bfb4b32b52efbd7dcf1adabc20c39d
+   data.tar.gz: 20b7e24ec990cc00fb77d60d62c75247108748629c5478a0a315d96e04a76c443f8abc2a4df21b7c4093ad7cc46ee81c4276ca15f97c8698073b7ac9bca98c65
data/Gemfile CHANGED
@@ -23,10 +23,6 @@ group :development do
    gem 'ruby-cldr', github: 'camertron/ruby-cldr', branch: 'mapzones' # 'svenfuchs/ruby-cldr'
    gem 'i18n'
    gem 'cldr-plurals', '~> 1.0'
-
-   gem 'rest-client', '~> 1.8'
-
-   gem 'parallel'
  end
 
  group :test do
data/Rakefile CHANGED
@@ -57,16 +57,22 @@ task :update do
  end
 
  task :add_locale, :locale do |_, args|
+   locales = [args[:locale]] + args.extras
+
    File.write(
      TwitterCldr::SUPPORTED_LOCALES_FILE,
      YAML.dump(
-       (TwitterCldr::SUPPORTED_LOCALES + [args[:locale]]).map(&:to_sym).uniq.sort
+       (TwitterCldr::SUPPORTED_LOCALES + locales).map(&:to_sym).uniq.sort
      )
    )
 
    klasses = TwitterCldr::Resources.locale_based_importer_classes_for_ruby_engine
-   instances = klasses.map { |klass| klass.new(locales: [args[:locale]]) }
-   TwitterCldr::Resources::ImportResolver.new(instances).import
+   instances = klasses.map { |klass| klass.new(locales: locales) }
+   resolver = TwitterCldr::Resources::ImportResolver.new(
+     instances, allow_missing_dependencies: true
+   )
+
+   resolver.import
  end
 
  # add_locale and update_locale do the same thing
@@ -160,6 +166,16 @@ namespace :update do
      TwitterCldr::Resources::SegmentRulesImporter.new.import
    end
 
+   desc 'Import segmentation dictionaries'
+   task :segment_dictionaries do
+     TwitterCldr::Resources::SegmentDictionariesImporter.new.import
+   end
+
+   desc 'Import segment tests'
+   task :segment_tests do
+     TwitterCldr::Resources::SegmentTestsImporter.new.import
+   end
+
    desc 'Import (generate) bidi tests (should be executed using JRuby 1.7 in 1.9 mode)'
    task :bidi_tests do
      TwitterCldr::Resources::BidiTestImporter.new.import
@@ -200,11 +216,6 @@ namespace :update do
      TwitterCldr::Resources::TransformTestsImporter.new.import
    end
 
-   desc 'Import segment tests'
-   task :segment_tests do
-     TwitterCldr::Resources::SegmentTestsImporter.new.import
-   end
-
    desc 'Import hyphenation dictionaries'
    task :hyphenation_dictionaries do
      TwitterCldr::Resources::HyphenationImporter.new.import
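
The reworked add_locale task picks up every extra rake argument through args.extras, so several locales can be imported in one run (e.g. `rake add_locale[km,lo,my]` for the three locales added in this release). A small sketch of how rake splits those arguments, using rake's own Rake::TaskArguments; the locale values are illustrative:

    require 'rake'

    # rake add_locale[km,lo,my] hands the task a TaskArguments like this one:
    args    = Rake::TaskArguments.new([:locale], %w[km lo my])
    locales = [args[:locale]] + args.extras

    puts locales.inspect # => ["km", "lo", "my"]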
data/lib/twitter_cldr/normalization.rb CHANGED
@@ -14,12 +14,25 @@ module TwitterCldr
    class << self
 
      def normalize(string, options = {})
-       form = options.fetch(:using, DEFAULT_NORMALIZER).to_s.downcase.to_sym
+       validate_form(form = extract_form_from(options))
+       Eprun.normalize(string, form)
+     end
+
+     def normalized?(string, options = {})
+       validate_form(form = extract_form_from(options))
+       Eprun.normalized?(string, form)
+     end
+
+     private
+
+     def extract_form_from(options)
+       options.fetch(:using, DEFAULT_NORMALIZER).to_s.downcase.to_sym
+     end
 
-       if VALID_NORMALIZERS.include?(form)
-         Eprun.normalize(string, form)
-       else
-         raise ArgumentError.new("#{form.inspect} is not a valid normalizer (valid normalizers are #{VALID_NORMALIZERS.join(', ')})")
+     def validate_form(form)
+       unless VALID_NORMALIZERS.include?(form)
+         raise ArgumentError.new("#{form.inspect} is not a valid normalizer "\
+           "(valid normalizers are #{VALID_NORMALIZERS.join(', ')})")
        end
      end
 
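For reference, normalize keeps its existing behaviour and the new normalized? predicate takes the same :using option; both go through the shared validation above. A quick usage sketch (sample strings are illustrative; form names come from VALID_NORMALIZERS, e.g. :nfc, :nfd, :nfkc, :nfkd):

    require 'twitter_cldr'

    TwitterCldr::Normalization.normalize('café', using: :NFD)    # decomposed string
    TwitterCldr::Normalization.normalized?('café', using: :NFC)  # => true or false

    begin
      TwitterCldr::Normalization.normalize('café', using: :nope)
    rescue ArgumentError => e
      puts e.message # lists the valid normalizers
    end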
data/lib/twitter_cldr/resources.rb CHANGED
@@ -22,12 +22,13 @@ module TwitterCldr
    autoload :NumberFormatsImporter, 'twitter_cldr/resources/number_formats_importer'
    autoload :PostalCodesImporter, 'twitter_cldr/resources/postal_codes_importer'
    autoload :Properties, 'twitter_cldr/resources/properties'
+   autoload :SegmentDictionariesImporter, 'twitter_cldr/resources/segment_dictionaries_importer'
    autoload :SegmentRulesImporter, 'twitter_cldr/resources/segment_rules_importer'
+   autoload :SegmentTestsImporter, 'twitter_cldr/resources/segment_tests_importer'
    autoload :RbnfTestImporter, 'twitter_cldr/resources/rbnf_test_importer'
    autoload :ReadmeRenderer, 'twitter_cldr/resources/readme_renderer'
    autoload :RegexpAstGenerator, 'twitter_cldr/resources/regexp_ast_generator'
    autoload :Requirements, 'twitter_cldr/resources/requirements'
-   autoload :SegmentTestsImporter, 'twitter_cldr/resources/segment_tests_importer'
    autoload :TailoringImporter, 'twitter_cldr/resources/tailoring_importer'
    autoload :TerritoriesImporter, 'twitter_cldr/resources/territories_importer'
    autoload :TimezonesImporter, 'twitter_cldr/resources/timezones_importer'
@@ -60,6 +61,7 @@ module TwitterCldr
      NumberFormatsImporter,
      PostalCodesImporter,
      RbnfTestImporter,
+     SegmentDictionariesImporter,
      SegmentRulesImporter,
      SegmentTestsImporter,
      TailoringImporter,
data/lib/twitter_cldr/resources/import_resolver.rb CHANGED
@@ -5,10 +5,11 @@ module TwitterCldr
    class ImportResolver
      include TSort
 
-     attr_reader :importers
+     attr_reader :importers, :options
 
-     def initialize(importers = Resources.importer_classes_for_ruby_engine)
+     def initialize(importers = Resources.importer_classes_for_ruby_engine, options = {})
        @importers = importers
+       @options = options
      end
 
      def import
@@ -28,7 +29,12 @@ module TwitterCldr
 
      def tsort_each_child(instance, &block)
        deps_for(instance).map do |dep_class|
-         yield instances.find { |ins| ins.class == dep_class }
+         dep = instances.find { |ins| ins.class == dep_class }
+         yield dep if dep
+
+         unless options[:allow_missing_dependencies]
+           raise "Could not find dependency #{dep_class.name}"
+         end
        end
      end
 
@@ -39,6 +45,8 @@ module TwitterCldr
      end
 
      def check_unmet_instance_deps(instance)
+       return if options[:allow_missing_dependencies]
+
        unmet_deps = unmet_deps_for(instance)
 
        unless unmet_deps.empty?
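
With the new options hash, a caller that only wants to run a couple of importers can opt out of the dependency check entirely. A minimal sketch, assuming (as the Rakefile above does) that these importer classes can be instantiated without arguments:

    importers = [
      TwitterCldr::Resources::SegmentDictionariesImporter.new,
      TwitterCldr::Resources::SegmentTestsImporter.new
    ]

    resolver = TwitterCldr::Resources::ImportResolver.new(
      importers, allow_missing_dependencies: true
    )

    resolver.import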
data/lib/twitter_cldr/resources/loader.rb CHANGED
@@ -10,6 +10,8 @@ module TwitterCldr
 
    class Loader
 
+     VALID_EXTS = %w(.yml .dump).freeze
+
      def get_resource(*path)
        resources_cache[resource_file_path(path)]
      end
@@ -75,7 +77,7 @@
 
      def resource_file_path(path)
        file = File.join(*path.map(&:to_s))
-       file << '.yml' unless file.end_with?('.yml')
+       file << '.yml' unless VALID_EXTS.include?(File.extname(file))
        file
      end
 
@@ -92,6 +94,17 @@
      end
 
      def load_resource(path, merge_custom = true)
+       case File.extname(path)
+       when '.yml'
+         load_yaml_resource(path, merge_custom)
+       when '.dump'
+         load_marshalled_resource(path, merge_custom)
+       else
+         load_raw_resource(path, merge_custom)
+       end
+     end
+
+     def load_yaml_resource(path, merge_custom = true)
        base = YAML.load(read_resource_file(path))
        custom_path = File.join("custom", path)
 
@@ -102,6 +115,14 @@
        base
      end
 
+     def load_marshalled_resource(path, _merge_custom = :unused)
+       Marshal.load(read_resource_file(path))
+     end
+
+     def load_raw_resource(path, _merge_custom = :unused)
+       read_resource_file(path)
+     end
+
      def custom_resource_exists?(custom_path)
        File.exist?(
          File.join(TwitterCldr::RESOURCES_DIR, custom_path)
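
Callers keep using get_resource; a path ending in .dump now comes back as a marshalled Ruby object rather than parsed YAML. The round trip the new branch relies on is plain Marshal, as in this standard-library-only sketch (the file name is made up):

    require 'tmpdir'

    Dir.mktmpdir do |dir|
      path = File.join(dir, 'thaidict.dump') # illustrative name

      # what an importer does when it writes a .dump resource
      File.binwrite(path, Marshal.dump({ example: [1, 2, 3] }))

      # what load_marshalled_resource does when the Loader reads it back
      restored = Marshal.load(File.binread(path))
      puts restored.inspect # => {:example=>[1, 2, 3]}
    end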
data/lib/twitter_cldr/resources/locales_resources_importer.rb CHANGED
@@ -56,13 +56,6 @@ module TwitterCldr
        params.fetch(:output_path)
      end
 
-     def move_segments_root_file
-       old_file_path = File.join(output_path, *%w(shared segments_root.yml))
-       new_file_path = File.join(output_path, *%w(shared segments segments_root.yml))
-       FileUtils.mkdir_p(File.dirname(new_file_path))
-       FileUtils.move(old_file_path, new_file_path)
-     end
-
      def import_components
        locales = Set.new
 
@@ -100,8 +93,6 @@ module TwitterCldr
        Cldr::Export.export(export_args) do |component, locale, path|
          deep_symbolize(path)
        end
-
-       move_segments_root_file
      end
 
      def components_for(locale)
data/lib/twitter_cldr/resources/postal_codes_importer.rb CHANGED
@@ -3,8 +3,8 @@
  # Copyright 2012 Twitter, Inc
  # http://www.apache.org/licenses/LICENSE-2.0
 
- require 'rest-client'
  require 'json'
+ require 'open-uri'
  require 'set'
  require 'yaml'
 
@@ -21,38 +21,36 @@ module TwitterCldr
      private
 
      def execute
-       File.open(File.join(output_path, 'postal_codes.yml'), 'w') do |output|
-         output.write(YAML.dump(load))
-       end
+       data = YAML.dump(fetch_data)
+       File.write(File.join(output_path, 'postal_codes.yml'), data)
+       puts
      end
 
      def output_path
        params.fetch(:output_path)
      end
 
-     def load
+     def fetch_data
        territories = Set.new
 
        each_territory.each_with_object({}) do |territory, ret|
-         next unless regex = get_regex_for(territory)
-
-         ret[territory] = {
-           regex: Regexp.compile(regex),
-           ast: TwitterCldr::Utils::RegexpAst.dump(
-             RegexpAstGenerator.generate(regex)
-           )
-         }
+         if regex = get_regex_for(territory)
+           ret[territory] = {
+             regex: Regexp.compile(regex),
+             ast: TwitterCldr::Utils::RegexpAst.dump(
+               RegexpAstGenerator.generate(regex)
+             )
+           }
+         end
 
          territories.add(territory)
         STDOUT.write("\rImported postal codes for #{territory}, #{territories.size} of #{territory_count} total")
        end
-
-       puts
      end
 
      def get_regex_for(territory)
-       result = RestClient.get("#{BASE_URL}#{territory.to_s.upcase}")
-       data = JSON.parse(result.body)
+       result = URI.open("#{BASE_URL}#{territory.to_s.upcase}").read
+       data = JSON.parse(result)
        data['zip']
      end
 
@@ -61,12 +59,10 @@ module TwitterCldr
      end
 
      def each_territory
-       if block_given?
-         TwitterCldr::Shared::Territories.all.each_pair do |territory, _|
-           yield territory
-         end
-       else
-         to_enum(__method__)
+       return to_enum(__method__) unless block_given?
+
+       TwitterCldr::Shared::Territories.all.each_pair do |territory, _|
+         yield territory
        end
      end
 
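The importer now fetches with open-uri from the standard library instead of the rest-client gem. The swapped call in isolation looks roughly like the sketch below; the URL is illustrative, since BASE_URL is defined elsewhere in the file and not shown in this hunk:

    require 'open-uri'
    require 'json'

    body = URI.open('https://example.com/address/data/US').read # illustrative URL
    zip  = JSON.parse(body)['zip']                               # same key the importer reads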
data/lib/twitter_cldr/resources/segment_dictionaries_importer.rb ADDED
@@ -0,0 +1,75 @@
+ # encoding: UTF-8
+
+ # Copyright 2012 Twitter, Inc
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ require 'fileutils'
+ require 'open-uri'
+
+ module TwitterCldr
+   module Resources
+     class SegmentDictionariesImporter < Importer
+
+       URL_TEMPLATE = 'https://raw.githubusercontent.com/unicode-org/icu/%{icu_version}/%{path}'
+
+       DICTIONARY_FILES = [
+         'icu4c/source/data/brkitr/dictionaries/burmesedict.txt',
+         'icu4c/source/data/brkitr/dictionaries/cjdict.txt',
+         'icu4c/source/data/brkitr/dictionaries/khmerdict.txt',
+         'icu4c/source/data/brkitr/dictionaries/laodict.txt',
+         'icu4c/source/data/brkitr/dictionaries/thaidict.txt'
+       ]
+
+       output_path File.join(*%w(shared segments dictionaries))
+       ruby_engine :mri
+
+       def execute
+         FileUtils.mkdir_p(output_path)
+
+         DICTIONARY_FILES.each do |test_file|
+           import_dictionary_file(test_file)
+         end
+       end
+
+       private
+
+       def import_dictionary_file(dictionary_file)
+         source_url = url_for(dictionary_file)
+         source = open(source_url).read
+         lines = source.split("\n")
+         trie = TwitterCldr::Utils::Trie.new
+         space_regexp = TwitterCldr::Shared::UnicodeRegex.compile('\A[[:Z:][:C:]]+').to_regexp
+
+         lines.each do |line|
+           line.sub!(space_regexp, '')
+           next if line.start_with?('#')
+
+           characters, frequency = line.split("\t")
+           frequency = frequency ? frequency.to_i : 0
+
+           trie.add(characters.unpack('U*'), frequency)
+         end
+
+         output_path = output_path_for(dictionary_file)
+         File.write(output_path, Marshal.dump(trie))
+       end
+
+       def url_for(dictionary_file)
+         URL_TEMPLATE % {
+           icu_version: "release-#{Versions.icu_version.gsub('.', '-')}",
+           path: dictionary_file
+         }
+       end
+
+       def output_path_for(dictionary_file)
+         file = File.basename(dictionary_file).chomp(File.extname(dictionary_file))
+         File.join(output_path, "#{file}.dump")
+       end
+
+       def output_path
+         params.fetch(:output_path)
+       end
+
+     end
+   end
+ end
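
Each dictionary ends up on disk as a marshalled TwitterCldr::Utils::Trie. A sketch of reading one back by hand, assuming TwitterCldr::RESOURCES_DIR (the constant the Loader uses above) points at the gem's resources directory:

    require 'twitter_cldr'

    path = File.join(
      TwitterCldr::RESOURCES_DIR, 'shared', 'segments', 'dictionaries', 'thaidict.dump'
    )

    trie = Marshal.load(File.binread(path))
    puts trie.class # => TwitterCldr::Utils::Trie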
data/lib/twitter_cldr/resources/segment_tests_importer.rb CHANGED
@@ -9,47 +9,164 @@ module TwitterCldr
    module Resources
      class SegmentTestsImporter < Importer
 
-       TEST_FILES = [
+       CONFORMANCE_FILES = [
          'ucd/auxiliary/WordBreakTest.txt',
          'ucd/auxiliary/SentenceBreakTest.txt',
          'ucd/auxiliary/GraphemeBreakTest.txt',
          'ucd/auxiliary/LineBreakTest.txt'
        ]
 
-       requirement :unicode, Versions.unicode_version, TEST_FILES
+       DICTIONARY_BREAK_SAMPLES = {
+         # Chinese
+         zh: '無畏號航空母艦是一艘隸屬於美國海軍的航空母艦,為艾塞克斯級航空母艦的三號艦。'\
+           '無畏號於1941年開始建造,1943年下水服役,開始參與太平洋戰爭。戰後無畏號退役封存,'\
+           '在韓戰後開始進行SCB-27C改建,又在期間重編為攻擊航母,於1954年在大西洋艦隊重新服役。'\
+           '稍後無畏號又進行SCB-125現代化改建,增設斜角飛行甲板。1962年無畏號重編為反潛航母,'\
+           '舷號改為CVS-11,繼續留在大西洋及地中海執勤。稍後無畏號參與美國的太空計畫,'\
+           '分別擔任水星-宇宙神7號及雙子座3號的救援船。1966年至1969年,無畏號曾三次前往西太平洋,'\
+           '參與越戰。無畏號在1974年退役,並一度預備出售拆解;但在民間組織努力下,'\
+           '海軍在1981年將無畏號捐贈到紐約作博物館艦。1986年,無畏號獲評為美國國家歷史地標。',
+
+         ko: '파일은 이용자가 공용 또는 위키백과 한국어판에 업로드하여 라이선스에 따라 사용 가능한 형태로 제공됩니다. '\
+           '업로드된 파일은 간단한 조작으로 페이지에 삽입할 수 있습니다. 업로드는 신규 이용자를 제외한 등록 이용자라면 '\
+           '가능합니다. 파일을 업로드하기 전에 다음 문단의 업로드를 할 수 없는 파일을 반드시 읽어 보시기 바랍니다. '\
+           '공용 이용 방법 및 업로드에 대해서는 Commons:초보자 길라잡이/업로드를 읽어 보시기 바랍니다. 업로드하는 '\
+           '페이지는 위키백과:파일 올리기를 참조하십시오. 파일의 라이선스가 삽입되는 위키백과의 문서와는 별도로 '\
+           '개별적으로 설정해야 합니다. 파일을 업로드할 때 적절한 라이선스를 선택하고 반드시 표시하십시오.',
+
+         # Thai
+         th: 'ธงไชย แมคอินไตย์ ชื่อเล่น เบิร์ด (เกิด 8 ธันวาคม พ.ศ. 2501) เป็นนักร้อง นักแสดงชาวไทย '\
+           'ได้รับขนานนามว่าเป็น "ซูเปอร์สตาร์เมืองไทย" โดยคนไทยรู้จักกันดี เรียกกันว่า : พี่เบิร์ด '\
+           'แรกเข้าวงการบันเทิงเป็นนักแสดงสมทบ ต่อมาได้รับบทพระเอก โดยภาพยนตร์ที่สร้างชื่อเสียงให้กับเขาที่สุดเรื่อง '\
+           'ด้วยรักคือรัก ส่วนละครที่สร้างชื่อเสียงที่สุดของเขาคือบท "โกโบริ" ในละครคู่กรรม '\
+           'ด้านวงการเพลงซึ่งเป็นอาชีพหลักเขาเริ่มต้นจากการประกวดร้องเพลงของสยามกลการ '\
+           'ต่อมาเป็นนักร้องในสังกัดบริษัท จีเอ็มเอ็ม แกรมมี่ จำกัด (มหาชน) ซึ่งประสบความสำเร็จสูงสุดของประเทศไทย'\
+           'มียอดจำหน่ายอยู่ในระดับแนวหน้าของทวีปเอเชียยอดรวมกว่า 25 ล้านชุด',
+
+         # Khmer
+         km: 'វីគីភីឌា (អង់គ្លេស ៖ Wikipedia) ជាសព្វវចនាធិប្បាយសេរីច្រើនភាសានៅលើអ៊ីនធឺណិត '\
+           'ដែលមនុស្សគ្រប់គ្នាអាចអាននិងធ្វើឱ្យមាតិកាទាន់សម័យបន្ថែមទៀត '\
+           'ធ្វើឱ្យវីគីភីឌាសព្វវចនាធិប្បាយបានក្លាយទៅជាការកែប្រែ '\
+           'ការប្រមូលនិងការអភិរក្សរាប់រយរាប់ពាន់នាក់នៃអ្នកស្ម័គ្រចិត្តនៅជុំវិញពិភពលោក '\
+           'តាមរយៈកម្មវិធីដែលគេហៅថាមេឌាវិគី ។ វីគីភីឌាចាប់ផ្តើមនៅថ្ងៃទី ១៥ មករា ឆ្នាំ ២០០១ '\
+           'ដោយចាប់ផ្តើមគម្រោងពីឈ្មោះសព្វវចនាធិប្បាយណូ៉ភីឌាដែលសរសេរដោយហ្ស៊ីម្ម៊ី '\
+           'វេល្ស និងឡែរ្រី សែងក័រ ។ នៅបច្ចុប្បន្ននេះ វីគីភីឌាមានទាំង់អស់ ២៩៣ ភាសា[៤] ដោយវីគីភីឌាភាសាខ្មែរមាន '\
+           '៧៨៩៨ អត្ថបទ ។ មានវីគីភីឌាច្រើនជាង ៥០ ភាសាដែលមានអត្ថបទច្រើនជាង ១០០.០០០ អត្ថបទ ។ '\
+           'វីគីភីឌាភាសាអាល្លឺម៉ងត្រូវបានគេចែកចាយនៅក្នុងទ្រង់ទ្រាយឌីវីឌី-រ៉ូម ។',
+
+         # Lao
+         lo: 'ວິກິພີເດຍ (ອັງກິດ: Wikipedia) ເປັນສາລະນຸກົມເນື້ອຫາເສລີຫຼາຍພາສາໃນເວັບໄຊ້ '\
+           'ເຊິ່ງໄດ້ຮັບການສະໜັບສະໜຸນຈາກມູນລະນິທິວິກິພີເດຍ ອົງກອນບໍ່ສະແຫວງຫາຜົນກຳໄລ ເນື້ອຫາກວ່າ 35 ລ້ານບົດຄວາມ '\
+           '(ສະເພາະວິກິພີເດຍພາສາອັງກິດມີເນື້ອຫາກວ່າ 4.9 ລ້ານບົດຄວາມ) ເກີດຂຶ້ນຈາກການຮ່ວມຂຽນຂອງອາສາສະໝັກທົ່ວໂລກ '\
+           'ທຸກຄົນທີ່ສາມາດເຂົ້າເຖິງວິກິພີເດຍສາມາດຮ່ວມແກ້ໄຂເກືອບທຸກບົດຄວາມໄດ້ຢ່າງເສລີ ໂດຍມີຜູ້ຂຽນປະມານ 100,000ຄົນ '\
+           'ຈົນເຖິງເດືອນເມສາ ຄ.ສ. 2013 ວິກິພີເດຍມີ 286 ຮຸ່ນພາສາ ແລະ '\
+           'ໄດ້ກາຍມາເປັນງານອ້າງອິງທົ່ວໄປທີ່ໃກຍ່ທີ່ສຸດແລະໄດ້ຮັບຄວາມນິຍົມຫຼາຍທີ່ສຸດຢູ່ອິນເຕີເນັດ ຈົນຖືກຈັດເປັນເວັບໄຊ້ ອັນດັບທີ 6 '\
+           'ທີ່ມີຜູ້ເຂົ້າເບິ່ງຫຼາຍທີ່ສຸດໃນໂລກ ຕາມການຈັດອັນດັບຂອງອາເລັກຊ້າ ດ້ວຍຈຳນວນຜູ້ອ່ານກວ່າ 365 ລ້ານຄົນ '\
+           'ມີການປະເມີນວ່າວິກິພີເດຍມີການຄົ້ນຫາຂໍ້ມູນໃນວິກິພີເດຍກວ່າ 2,700 ລ້ານເທື່ອຕໍ່ເດືອນໃນສະຫະລັດ ອາເມຣິກາ',
+
+         # Burmese
+         my: 'ကိန်းဆိုသည်မှာ ရေတွက်ရန်နှင့် တိုင်းတာရန် အတွက် အသုံးပြုသော သင်္ချာဆိုင်ရာ အရာဝတ္ထုတစ်ခု '\
+           'ဖြစ်သည်။ သင်္ချာပညာတွင် ကိန်းဂဏန်းများ၏ အဓိပ္ပာယ်ဖွင့်ဆိုချက်ကို တဖြည်းဖြည်း ချဲ့ကားလာခဲ့သဖြင့် '\
+           'နှစ်ပေါင်းများစွာ ကြာသောအခါတွင် သုည၊ အနှုတ်ကိန်းများ (negative numbers)၊ ရာရှင်နယ်ကိန်း '\
+           '(rational number) ခေါ် အပိုင်းကိန်းများ၊ အီရာရှင်နယ်ကိန်း (irrational number) ခေါ် '\
+           'အပိုင်းကိန်းမဟုတ်သောကိန်းများ နှင့် ကွန်ပလက်စ်ကိန်း (complex number) ခေါ် ကိန်းရှုပ်များ စသည်ဖြင့် '\
+           'ပါဝင်လာကြသည်။ သင်္ချာဆိုင်ရာ တွက်ချက်မှုများ (mathematical operations) တွင် ဂဏန်းတစ်ခု '\
+           'သို့မဟုတ် တစ်ခုထက်ပိုသော ဂဏန်းများကို အဝင်ကိန်းအဖြစ် လက်ခံကြပြီး ဂဏန်းတစ်ခုကို အထွက်ကိန်း '\
+           'အဖြစ် ပြန်ထုတ်ပေးသည်။ ယူနရီ တွက်ချက်မှု (unary operation) ခေါ် တစ်လုံးသွင်းတွက်ချက်မှုတွင် '\
+           'ဂဏန်းတစ်ခုကို အဝင်ကိန်း အဖြစ် လက်ခံပြီး ဂဏန်းတစ်ခုကို အထွက်ကိန်း အဖြစ် ထုတ်ပေးသည်။ '
+       }.freeze
+
+       requirement :unicode, Versions.unicode_version, CONFORMANCE_FILES
+       requirement :icu, Versions.icu_version
        output_path 'shared/segments/tests'
-       ruby_engine :mri
+       ruby_engine :jruby
 
        def execute
-         TEST_FILES.each do |test_file|
-           import_test_file(test_file)
-         end
+         import_conformance_files
+         import_dictionary_break_tests
+         import_combined_dictionary_break_test
        end
 
        private
 
-       def import_test_file(test_file)
-         source_file = source_path_for(test_file)
+       def import_conformance_files
+         CONFORMANCE_FILES.each do |test_file|
+           import_conformance_file(test_file)
+         end
+       end
+
+       def import_conformance_file(conformance_file)
+         source_file = conformance_source_path_for(conformance_file)
          FileUtils.mkdir_p(File.dirname(source_file))
          result = UnicodeFileParser.parse_standard_file(source_file).map(&:first)
-         output_file = output_path_for(test_file)
+         output_file = conformance_output_path_for(conformance_file)
          FileUtils.mkdir_p(File.dirname(output_file))
          File.write(output_file, YAML.dump(result))
        end
 
-       def source_path_for(test_file)
-         requirements[:unicode].source_path_for(test_file)
+       def import_dictionary_break_tests
+         DICTIONARY_BREAK_SAMPLES.each do |locale, text_sample|
+           data = create_dictionary_break_test(locale.to_s, text_sample)
+           dump_dictionary_break_test(locale, data)
+         end
        end
 
-       def output_path_for(test_file)
-         file = underscore(File.basename(test_file).chomp(File.extname(test_file)))
+       def import_combined_dictionary_break_test
+         text_sample = DICTIONARY_BREAK_SAMPLES.values.join(' ')
+         data = create_dictionary_break_test('en', text_sample)
+         dump_dictionary_break_test('combined', data)
+       end
+
+       def create_dictionary_break_test(locale, text_sample)
+         done = break_iterator.const_get(:DONE)
+         iter = break_iterator.get_word_instance(ulocale_class.new(locale))
+         iter.set_text(text_sample)
+         start = iter.first
+         segments = []
+
+         until (stop = iter.next) == done
+           segments << text_sample[start...stop]
+           start = stop
+         end
+
+         {
+           locale: locale,
+           text: text_sample,
+           segments: segments
+         }
+       end
+
+       def dump_dictionary_break_test(name, data)
+         output_file = dictionary_test_output_path_for(name)
+         FileUtils.mkdir_p(File.dirname(output_file))
+         File.write(output_file, YAML.dump(data))
+       end
+
+       def conformance_source_path_for(conformance_file)
+         requirements[:unicode].source_path_for(conformance_file)
+       end
+
+       def conformance_output_path_for(conformance_file)
+         file = underscore(File.basename(conformance_file).chomp(File.extname(conformance_file)))
          File.join(params.fetch(:output_path), "#{file}.yml")
        end
 
+       def dictionary_test_output_path_for(locale)
+         File.join(params.fetch(:output_path), 'dictionary_tests', "#{locale}.yml")
+       end
+
        def underscore(str)
          str.gsub(/(.)([A-Z])/, '\1_\2').downcase
        end
 
+       def ulocale_class
+         @ulocale_class ||= requirements[:icu].get_class('com.ibm.icu.util.ULocale')
+       end
+
+       def break_iterator
+         @break_iterator ||= requirements[:icu].get_class('com.ibm.icu.text.BreakIterator')
+       end
+
      end
    end
  end
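
Each fixture written by dump_dictionary_break_test holds :locale, :text and :segments, and the segments are contiguous slices of the sample text, so joining them must reproduce the text exactly. A sketch of that sanity check (the fixture path assumes TwitterCldr::RESOURCES_DIR points at the gem's resources directory; the gem's own assertions live in data/spec/segmentation/dictionary_break_spec.rb and may differ):

    require 'yaml'
    require 'twitter_cldr'

    fixture = File.join(
      TwitterCldr::RESOURCES_DIR, 'shared', 'segments', 'tests', 'dictionary_tests', 'th.yml'
    )

    doc  = File.read(fixture)
    # the fixture uses symbol keys, so fall back to the non-safe loader where needed
    test = YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(doc) : YAML.load(doc)

    raise 'segments do not reassemble the text' unless test[:segments].join == test[:text]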