twitter_cldr 5.1.0 → 5.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. checksums.yaml +4 -4
  2. data/Rakefile +5 -5
  3. data/lib/twitter_cldr.rb +1 -0
  4. data/lib/twitter_cldr/resources.rb +2 -8
  5. data/lib/twitter_cldr/resources/loader.rb +6 -4
  6. data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -1
  7. data/lib/twitter_cldr/resources/segment_rules_importer.rb +202 -0
  8. data/lib/twitter_cldr/resources/segment_tests_importer.rb +3 -1
  9. data/lib/twitter_cldr/segmentation.rb +10 -8
  10. data/lib/twitter_cldr/segmentation/break_iterator.rb +15 -11
  11. data/lib/twitter_cldr/segmentation/category_table.rb +56 -0
  12. data/lib/twitter_cldr/segmentation/cursor.rb +10 -5
  13. data/lib/twitter_cldr/segmentation/metadata.rb +20 -0
  14. data/lib/twitter_cldr/segmentation/null_suppressions.rb +18 -0
  15. data/lib/twitter_cldr/segmentation/rule_set.rb +23 -79
  16. data/lib/twitter_cldr/segmentation/state_machine.rb +125 -0
  17. data/lib/twitter_cldr/segmentation/state_table.rb +46 -0
  18. data/lib/twitter_cldr/segmentation/status_table.rb +30 -0
  19. data/lib/twitter_cldr/segmentation/suppressions.rb +79 -0
  20. data/lib/twitter_cldr/shared/caser.rb +1 -1
  21. data/lib/twitter_cldr/shared/locale.rb +6 -2
  22. data/lib/twitter_cldr/version.rb +1 -1
  23. data/resources/shared/segments/rules/el/sentence.yml +723 -0
  24. data/resources/shared/segments/rules/en-US-POSIX/word.yml +527 -0
  25. data/resources/shared/segments/rules/ja/line.yml +964 -0
  26. data/resources/shared/segments/rules/ja/word.yml +527 -0
  27. data/resources/shared/segments/rules/root/grapheme.yml +463 -0
  28. data/resources/shared/segments/rules/root/line.yml +964 -0
  29. data/resources/shared/segments/rules/root/sentence.yml +723 -0
  30. data/resources/shared/segments/rules/root/word.yml +527 -0
  31. data/resources/shared/segments/rules/zh-Hant/line.yml +964 -0
  32. data/resources/shared/segments/rules/zh/line.yml +964 -0
  33. data/resources/shared/segments/suppressions/de/sentence.yml +5 -0
  34. data/resources/shared/segments/suppressions/en/sentence.yml +5 -0
  35. data/resources/shared/segments/suppressions/es/sentence.yml +5 -0
  36. data/resources/shared/segments/suppressions/fr/sentence.yml +5 -0
  37. data/resources/shared/segments/suppressions/it/sentence.yml +5 -0
  38. data/resources/shared/segments/suppressions/pt/sentence.yml +5 -0
  39. data/resources/shared/segments/suppressions/ru/sentence.yml +5 -0
  40. data/resources/shared/segments/tests/grapheme_break_test.yml +603 -0
  41. data/resources/shared/segments/tests/line_break_test.yml +7348 -0
  42. data/resources/uli/segments/de.yml +5 -230
  43. data/resources/uli/segments/en.yml +3 -154
  44. data/resources/uli/segments/es.yml +5 -145
  45. data/resources/uli/segments/fr.yml +5 -68
  46. data/resources/uli/segments/it.yml +3 -48
  47. data/resources/uli/segments/pt.yml +5 -173
  48. data/resources/uli/segments/ru.yml +3 -10
  49. data/spec/segmentation/rule_set_spec.rb +54 -27
  50. metadata +29 -9
  51. data/lib/twitter_cldr/resources/uli.rb +0 -12
  52. data/lib/twitter_cldr/resources/uli/segment_exceptions_importer.rb +0 -59
  53. data/lib/twitter_cldr/segmentation/parser.rb +0 -71
  54. data/lib/twitter_cldr/segmentation/rule.rb +0 -79
  55. data/lib/twitter_cldr/segmentation/rule_set_builder.rb +0 -142
  56. data/resources/shared/segments/segments_root.yml +0 -869
  57. data/spec/segmentation/parser_spec.rb +0 -104
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0de7de286380d82bd1bb15f2153876905a48270c43c914f19a873d7a6314abe0
4
- data.tar.gz: 100d3add1abf37b423dec0cacca0b899baed1e683abf8d1c43a7c8ce8303a3d0
3
+ metadata.gz: b616d55c343da67733837c0f1549329d895ea3758011b4c5dd8c90c3c631f53a
4
+ data.tar.gz: 3717867c2412adcc7a95ff1dfbe032e1754ade3cc005b4445d1f8ff644048b06
5
5
  SHA512:
6
- metadata.gz: d5a310d97f73576229627ffa3f46eb8d1872b2dd3032e9ec9e7e54a502db896195b5a48119fb991a5647500eec9a222ceb8774e6f293e223313ad23ea25f238b
7
- data.tar.gz: abaef5aa0122312d5aa7aa69eca068ef5f4ef270e145d168fd295cc4d7db1aaad77a1fde0e3bf31fa909d6c741d8d447a1b75588b9c297daafa26a053c9d244c
6
+ metadata.gz: f82323e912a622930f192a2ffe8b742ce1378feff847a082e04b7ea0feb5df215faa183861e31a35e69bc966c6222d0182689cdc5264de69263acbc60acce8ff
7
+ data.tar.gz: dc11c5d5e3ab6cc0f2cf3a6073686f7167ac6621252a0b6bf4c7f0b1eb53ae7134507aa0d66e5c0c57e7abc9b689a6438032b4982abe766b9c8550b6df0139b8
data/Rakefile CHANGED
@@ -155,6 +155,11 @@ namespace :update do
155
155
  TwitterCldr::Resources::CollationTestsImporter.new.import
156
156
  end
157
157
 
158
+ desc 'Import text segmentation rules'
159
+ task :segment_rules do
160
+ TwitterCldr::Resources::SegmentRulesImporter.new.import
161
+ end
162
+
158
163
  desc 'Import (generate) bidi tests (should be executed using JRuby 1.7 in 1.9 mode)'
159
164
  task :bidi_tests do
160
165
  TwitterCldr::Resources::BidiTestImporter.new.import
@@ -195,11 +200,6 @@ namespace :update do
195
200
  TwitterCldr::Resources::TransformTestsImporter.new.import
196
201
  end
197
202
 
198
- desc 'Import segment exceptions'
199
- task :segment_exceptions do
200
- TwitterCldr::Resources::Uli::SegmentExceptionsImporter.new.import
201
- end
202
-
203
203
  desc 'Import segment tests'
204
204
  task :segment_tests do
205
205
  TwitterCldr::Resources::SegmentTestsImporter.new.import
data/lib/twitter_cldr.rb CHANGED
@@ -59,6 +59,7 @@ module TwitterCldr
59
59
  def_delegator :resources, :resource_exists?
60
60
  def_delegator :resources, :locale_resource_exists?
61
61
  def_delegator :resources, :absolute_resource_path
62
+ def_delegator :resources, :resource_file_path
62
63
 
63
64
  class << self
64
65
 
data/lib/twitter_cldr/resources.rb CHANGED
@@ -22,6 +22,7 @@ module TwitterCldr
22
22
  autoload :NumberFormatsImporter, 'twitter_cldr/resources/number_formats_importer'
23
23
  autoload :PostalCodesImporter, 'twitter_cldr/resources/postal_codes_importer'
24
24
  autoload :Properties, 'twitter_cldr/resources/properties'
25
+ autoload :SegmentRulesImporter, 'twitter_cldr/resources/segment_rules_importer'
25
26
  autoload :RbnfTestImporter, 'twitter_cldr/resources/rbnf_test_importer'
26
27
  autoload :ReadmeRenderer, 'twitter_cldr/resources/readme_renderer'
27
28
  autoload :RegexpAstGenerator, 'twitter_cldr/resources/regexp_ast_generator'
@@ -37,7 +38,6 @@ module TwitterCldr
37
38
  autoload :UnicodeDataImporter, 'twitter_cldr/resources/unicode_data_importer'
38
39
  autoload :UnicodeFileParser, 'twitter_cldr/resources/unicode_file_parser'
39
40
  autoload :UnicodePropertyAliasesImporter, 'twitter_cldr/resources/unicode_property_aliases_importer'
40
- autoload :Uli, 'twitter_cldr/resources/uli'
41
41
  autoload :ValidityDataImporter, 'twitter_cldr/resources/validity_data_importer'
42
42
 
43
43
  class << self
@@ -60,6 +60,7 @@ module TwitterCldr
60
60
  NumberFormatsImporter,
61
61
  PostalCodesImporter,
62
62
  RbnfTestImporter,
63
+ SegmentRulesImporter,
63
64
  SegmentTestsImporter,
64
65
  TailoringImporter,
65
66
  TerritoriesImporter,
@@ -74,12 +75,6 @@ module TwitterCldr
74
75
  ]
75
76
  end
76
77
 
77
- def uli_importer_classes
78
- @uli_importer_classes ||= [
79
- Uli::SegmentExceptionsImporter
80
- ]
81
- end
82
-
83
78
  def property_importer_classes
84
79
  @property_importer_classes ||= [
85
80
  Properties::AgePropertyImporter,
@@ -107,7 +102,6 @@ module TwitterCldr
107
102
  def importer_classes
108
103
  @importer_classes ||=
109
104
  standard_importer_classes +
110
- uli_importer_classes +
111
105
  property_importer_classes
112
106
  end
113
107
 
data/lib/twitter_cldr/resources/loader.rb CHANGED
@@ -73,6 +73,12 @@ module TwitterCldr
73
73
  nil
74
74
  end
75
75
 
76
# Joins the given path segments into a single file path and makes sure the
# result ends with the '.yml' extension.
#
# @param path [Array<#to_s>, #to_s] path segments (strings or symbols); a
#   single segment may be passed without wrapping it in an array
# @return [String] the joined path, with '.yml' appended unless it is
#   already present
def resource_file_path(path)
  # Array() lets callers pass one bare segment now that this method is public.
  file = File.join(*Array(path).map(&:to_s))
  file.end_with?('.yml') ? file : "#{file}.yml"
end
81
+
76
82
  private
77
83
 
78
84
  def locale_resource_path(locale, resource_name)
@@ -85,10 +91,6 @@ module TwitterCldr
85
91
  end
86
92
  end
87
93
 
88
- def resource_file_path(path)
89
- "#{File.join(*path.map(&:to_s))}.yml"
90
- end
91
-
92
94
  def load_resource(path, merge_custom = true)
93
95
  base = YAML.load(read_resource_file(path))
94
96
  custom_path = File.join("custom", path)
data/lib/twitter_cldr/resources/locales_resources_importer.rb CHANGED
@@ -37,7 +37,6 @@ module TwitterCldr
37
37
  currency_digits_and_rounding
38
38
  rbnf_root
39
39
  numbering_systems
40
- segments_root
41
40
  territories_containment
42
41
  likely_subtags
43
42
  metazones
data/lib/twitter_cldr/resources/segment_rules_importer.rb ADDED
@@ -0,0 +1,202 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'base64'
7
+ require 'fileutils'
8
+ require 'nokogiri'
9
+ require 'yaml'
10
+
11
module TwitterCldr
  module Resources

    # Imports text segmentation data. For every CLDR segments XML file it
    # writes, per boundary type, a YAML file of encoded break-iterator tables
    # (under "rules/<locale>/") and a YAML file of serialized suppression
    # tries (under "suppressions/<locale>/"). Files are only written when
    # there is data to write.
    #
    # The table data is read through ICU's Java classes, which is presumably
    # why the importer declares the :jruby engine.
    class SegmentRulesImporter < Importer

      # @TODO: moar boundary types
      # Maps the output file name (key) to the ICU boundary name (value).
      BOUNDARY_TYPES = {
        'word'     => 'word',
        'sentence' => 'sentence',
        'grapheme' => 'grapheme',
        'line'     => 'line'  # loose, normal, strict
      }.freeze

      # Maps the boundary type to the "type" attribute of the matching
      # <segmentation> element in the CLDR XML.
      TYPES_TO_ATTRS = {
        'word'     => 'WordBreak',
        'sentence' => 'SentenceBreak',
        'grapheme' => 'GraphemeClusterBreak',
        'line'     => 'LineBreak'
      }.freeze

      Locale = TwitterCldr::Shared::Locale

      StateTable    = TwitterCldr::Segmentation::StateTable
      StatusTable   = TwitterCldr::Segmentation::StatusTable
      CategoryTable = TwitterCldr::Segmentation::CategoryTable

      requirement :icu, Versions.icu_version
      requirement :cldr, Versions.cldr_version
      output_path File.join('shared', 'segments')
      ruby_engine :jruby

      # Runs the import for every locale and every boundary type.
      def execute
        each_locale do |locale, doc|
          BOUNDARY_TYPES.each do |kind, icu_kind|
            seg = doc.xpath(
              "//ldml/segmentations/segmentation[@type=\"#{TYPES_TO_ATTRS[kind]}\"]"
            )

            write_yaml('rules', locale, kind, rule_data_for(icu_kind, locale, seg))
            write_yaml('suppressions', locale, kind, suppressions_for(icu_kind, locale, seg))
          end
        end
      end

      private

      # Writes data as YAML to <output_path>/<category>/<locale>/<kind>.yml,
      # creating the directory if needed. Does nothing when data is empty.
      def write_yaml(category, locale, kind, data)
        return if data.empty?

        output_dir = File.join(output_path, category, locale)
        FileUtils.mkdir_p(output_dir)
        File.write(File.join(output_dir, "#{kind}.yml"), YAML.dump(data))
      end

      # Yields [locale, parsed XML document] for each CLDR segments file.
      # The locale is the file's base name with underscores turned into
      # dashes. Returns an enumerator when no block is given.
      def each_locale
        return to_enum(__method__) unless block_given?

        pattern = File.join(requirements[:cldr].common_path, 'segments', '*.xml')

        # Dir.glob makes no ordering promise on older rubies; sort so the
        # import processes locales in a stable order.
        Dir.glob(pattern).sort.each do |file|
          locale = File.basename(file).chomp('.xml').tr('_', '-')
          yield locale, Nokogiri::XML(File.read(file))
        end
      end

      # Returns the encoded table data for the given boundary kind and
      # locale, or an empty hash when the <segmentation> node set declares
      # neither variables nor rules.
      def rule_data_for(kind, locale, doc)
        vars = doc.xpath('variables/variable')
        rules = doc.xpath('segmentRules/rule')
        return {} if vars.empty? && rules.empty?

        encode_rbbi_data(rbbi_data_for(kind, locale))
      end

      # Returns the serialized suppression tries for the <segmentation> node
      # set, or an empty hash when it lists no suppressions.
      def suppressions_for(kind, locale, doc)
        suppressions = doc.xpath('suppressions/suppression').map(&:text)
        return {} if suppressions.empty?

        encode_suppressions(suppressions)
      end

      # Converts an ICU RBBI data object into a hash of encoded tables.
      def encode_rbbi_data(data)
        {
          metadata: metadata_from(data.fHeader),
          forward_table: StateTable.new(data.fFTable.fTable.to_a, data.fFTable.fFlags).dump16,
          backward_table: StateTable.new(data.fRTable.fTable.to_a, data.fRTable.fFlags).dump16,
          status_table: StatusTable.new(data.fStatusTable.to_a).dump,
          category_table: encode_trie(data.fTrie),  # this really isn't a trie
        }
      end

      def metadata_from(header)
        { category_count: header.fCatCount }
      end

      # Builds one trie keyed by each suppression's codepoints and one keyed
      # by the reversed suppression's codepoints, and returns both
      # marshalled.
      def encode_suppressions(suppressions)
        forwards_trie = TwitterCldr::Utils::Trie.new
        backwards_trie = TwitterCldr::Utils::Trie.new

        suppressions.each do |suppression|
          forwards_trie.add(suppression.codepoints, true)
          backwards_trie.add(suppression.reverse.codepoints, true)
        end

        {
          forwards_trie: Marshal.dump(forwards_trie),
          backwards_trie: Marshal.dump(backwards_trie)
        }
      end

      # Flattens the ICU trie into [start, end, value] ranges and returns
      # them as an encoded CategoryTable string.
      def encode_trie(trie)
        arr = [].tap do |results|
          iter = trie.iterator

          while iter.hasNext
            range = iter.next
            results << range_to_a(range)

            # this should be the last entry, but for some reason ICU returns
            # one more out-of-order range past the Unicode max
            break if range.endCodePoint == 0x10FFFF
          end
        end

        # @TODO: Distinguish between the 16- and 32-bit flavors
        CategoryTable.new(arr).dump16.strip
      end

      def range_to_a(range)
        [range.startCodePoint, range.endCodePoint, range.value]
      end

      # Looks up the break-iterator data file name for the given kind in the
      # locale's resource bundle and loads the corresponding RBBI data.
      def rbbi_data_for(kind, locale)
        bundle = bundle_for(ulocale_class.new(locale))
        brkf_name = bundle.getStringWithFallback("boundaries/#{kind}")
        buffer = icu_binary.getData("#{brkiter_name}/#{brkf_name}")
        rbbi_data_wrapper.get(buffer)
      end

      # Returns the resource bundle for the given ULocale. Bundles are cached
      # per locale (keyed by the locale's string form); a single shared memo
      # would hand every locale the bundle of whichever locale came first.
      def bundle_for(locale)
        @bundles ||= {}
        @bundles[locale.to_s] ||= resource_bundle.getBundleInstance(
          brkiter_base_name, locale, locale_root
        )
      end

      def brkiter_name
        @brkiter_name ||= icu_data.const_get(:ICU_BRKITR_NAME)
      end

      def brkiter_base_name
        @brkiter_base_name ||= icu_data.const_get(:ICU_BRKITR_BASE_NAME)
      end

      def locale_root
        @locale_root ||= resource_bundle.const_get(:OpenType).const_get(:LOCALE_ROOT)
      end

      def rbbi_data_wrapper
        @rbbi_data_wrapper ||= requirements[:icu].get_class('com.ibm.icu.impl.RBBIDataWrapper')
      end

      def icu_binary
        @icu_binary ||= requirements[:icu].get_class('com.ibm.icu.impl.ICUBinary')
      end

      def icu_data
        @icu_data ||= requirements[:icu].get_class('com.ibm.icu.impl.ICUData')
      end

      def resource_bundle
        @bundle_class ||= requirements[:icu].get_class('com.ibm.icu.impl.ICUResourceBundle')
      end

      def ulocale_class
        @ulocale_class ||= requirements[:icu].get_class('com.ibm.icu.util.ULocale')
      end

      def output_path
        params[:output_path]
      end

    end
  end
end
data/lib/twitter_cldr/resources/segment_tests_importer.rb CHANGED
@@ -11,7 +11,9 @@ module TwitterCldr
11
11
 
12
12
  TEST_FILES = [
13
13
  'ucd/auxiliary/WordBreakTest.txt',
14
- 'ucd/auxiliary/SentenceBreakTest.txt'
14
+ 'ucd/auxiliary/SentenceBreakTest.txt',
15
+ 'ucd/auxiliary/GraphemeBreakTest.txt',
16
+ 'ucd/auxiliary/LineBreakTest.txt'
15
17
  ]
16
18
 
17
19
  requirement :unicode, Versions.unicode_version, TEST_FILES
data/lib/twitter_cldr/segmentation.rb CHANGED
@@ -5,13 +5,15 @@
5
5
 
6
6
  module TwitterCldr
7
7
  module Segmentation
8
- autoload :BreakIterator, 'twitter_cldr/segmentation/break_iterator'
9
- autoload :BreakRule, 'twitter_cldr/segmentation/rule'
10
- autoload :Cursor, 'twitter_cldr/segmentation/cursor'
11
- autoload :NoBreakRule, 'twitter_cldr/segmentation/rule'
12
- autoload :Parser, 'twitter_cldr/segmentation/parser'
13
- autoload :Rule, 'twitter_cldr/segmentation/rule'
14
- autoload :RuleSet, 'twitter_cldr/segmentation/rule_set'
15
- autoload :RuleSetBuilder, 'twitter_cldr/segmentation/rule_set_builder'
8
+ autoload :BreakIterator, 'twitter_cldr/segmentation/break_iterator'
9
+ autoload :CategoryTable, 'twitter_cldr/segmentation/category_table'
10
+ autoload :Cursor, 'twitter_cldr/segmentation/cursor'
11
+ autoload :Metadata, 'twitter_cldr/segmentation/metadata'
12
+ autoload :NullSuppressions, 'twitter_cldr/segmentation/null_suppressions'
13
+ autoload :RuleSet, 'twitter_cldr/segmentation/rule_set'
14
+ autoload :StateMachine, 'twitter_cldr/segmentation/state_machine'
15
+ autoload :StateTable, 'twitter_cldr/segmentation/state_table'
16
+ autoload :StatusTable, 'twitter_cldr/segmentation/status_table'
17
+ autoload :Suppressions, 'twitter_cldr/segmentation/suppressions'
16
18
  end
17
19
  end
data/lib/twitter_cldr/segmentation/break_iterator.rb CHANGED
@@ -25,29 +25,33 @@ module TwitterCldr
25
25
  end
26
26
 
27
27
  def each_grapheme_cluster(str, &block)
28
- raise NotImplementedError,
29
- "Grapheme segmentation is not currently supported."
28
+ rule_set = rule_set_for('grapheme')
29
+ each_boundary(rule_set, str, &block)
30
30
  end
31
31
 
32
32
  def each_line(str, &block)
33
- raise NotImplementedError,
34
- "Line segmentation is not currently supported."
33
+ rule_set = rule_set_for('line')
34
+ each_boundary(rule_set, str, &block)
35
35
  end
36
36
 
37
37
  private
38
38
 
39
39
  def each_boundary(rule_set, str)
40
- if block_given?
41
- rule_set.each_boundary(str).each_cons(2) do |start, stop|
42
- yield str[start...stop], start, stop
43
- end
44
- else
45
- to_enum(__method__, rule_set, str)
40
+ return to_enum(__method__, rule_set, str) unless block_given?
41
+
42
+ rule_set.each_boundary(str).each_cons(2) do |start, stop|
43
+ yield str[start...stop], start, stop
46
44
  end
47
45
  end
48
46
 
49
47
  def rule_set_for(boundary_type)
50
- RuleSet.load(locale, boundary_type, options)
48
+ rule_set_cache[boundary_type] ||= RuleSet.create(
49
+ locale, boundary_type, options
50
+ )
51
+ end
52
+
53
+ def rule_set_cache
54
+ @rule_set_cache ||= {}
51
55
  end
52
56
  end
53
57
  end
data/lib/twitter_cldr/segmentation/category_table.rb ADDED
@@ -0,0 +1,56 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'base64'
7
+
8
module TwitterCldr
  module Segmentation
    # Looks up the category value for a codepoint in a list of
    # [first_codepoint, last_codepoint, category] entries.
    #
    # Lookups use a binary search, so the entries are expected to be sorted
    # by codepoint and non-overlapping.
    class CategoryTable
      # Two 32-bit big-endian unsigned ints (range start and end) followed by
      # one 16-bit big-endian unsigned int (category).
      PACK_FMT_16 = 'NNn'.freeze

      # Number of bytes PACK_FMT_16 produces per entry (4 + 4 + 2).
      ENTRY_SIZE_16 = 10

      class << self
        # Builds a table from the Base64 string produced by #dump16.
        #
        # @param data [String] Base64-encoded packed entries
        # @return [CategoryTable]
        def load16(data)
          bytes = Base64.decode64(data)

          new(
            (0...bytes.bytesize).step(ENTRY_SIZE_16).map do |offset|
              bytes.byteslice(offset, ENTRY_SIZE_16).unpack(PACK_FMT_16)
            end
          )
        end
      end

      attr_reader :values

      # @param values [Array<Array(Integer, Integer, Integer)>] entries of the
      #   form [first_codepoint, last_codepoint, category]
      def initialize(values)
        @values = values
      end

      # Returns the category for the given codepoint.
      #
      # @param codepoint [Integer]
      # @return [Integer, nil] the category, or nil when no entry covers the
      #   codepoint
      def get(codepoint)
        entry = find(codepoint)
        entry && entry[2]
      end

      # Packs every entry with PACK_FMT_16 and returns the bytes encoded as
      # Base64 (as produced by Base64.encode64, i.e. including line breaks).
      #
      # @return [String]
      def dump16
        packed = values.each_with_object(''.b) do |entry, result|
          result << entry.pack(PACK_FMT_16)
        end

        Base64.encode64(packed)
      end

      private

      # Binary search in find-any mode: the block answers a negative number
      # to continue left, a positive number to continue right, and zero when
      # the entry's range contains the codepoint.
      def find(codepoint)
        values.bsearch do |entry|
          next -1 if codepoint < entry[0]
          next 1 if codepoint > entry[1]
          0
        end
      end
    end
  end
end