twitter_cldr 5.1.0 → 5.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/Rakefile +5 -5
  3. data/lib/twitter_cldr.rb +1 -0
  4. data/lib/twitter_cldr/resources.rb +2 -8
  5. data/lib/twitter_cldr/resources/loader.rb +6 -4
  6. data/lib/twitter_cldr/resources/locales_resources_importer.rb +0 -1
  7. data/lib/twitter_cldr/resources/segment_rules_importer.rb +202 -0
  8. data/lib/twitter_cldr/resources/segment_tests_importer.rb +3 -1
  9. data/lib/twitter_cldr/segmentation.rb +10 -8
  10. data/lib/twitter_cldr/segmentation/break_iterator.rb +15 -11
  11. data/lib/twitter_cldr/segmentation/category_table.rb +56 -0
  12. data/lib/twitter_cldr/segmentation/cursor.rb +10 -5
  13. data/lib/twitter_cldr/segmentation/metadata.rb +20 -0
  14. data/lib/twitter_cldr/segmentation/null_suppressions.rb +18 -0
  15. data/lib/twitter_cldr/segmentation/rule_set.rb +23 -79
  16. data/lib/twitter_cldr/segmentation/state_machine.rb +125 -0
  17. data/lib/twitter_cldr/segmentation/state_table.rb +46 -0
  18. data/lib/twitter_cldr/segmentation/status_table.rb +30 -0
  19. data/lib/twitter_cldr/segmentation/suppressions.rb +79 -0
  20. data/lib/twitter_cldr/shared/caser.rb +1 -1
  21. data/lib/twitter_cldr/shared/locale.rb +6 -2
  22. data/lib/twitter_cldr/version.rb +1 -1
  23. data/resources/shared/segments/rules/el/sentence.yml +723 -0
  24. data/resources/shared/segments/rules/en-US-POSIX/word.yml +527 -0
  25. data/resources/shared/segments/rules/ja/line.yml +964 -0
  26. data/resources/shared/segments/rules/ja/word.yml +527 -0
  27. data/resources/shared/segments/rules/root/grapheme.yml +463 -0
  28. data/resources/shared/segments/rules/root/line.yml +964 -0
  29. data/resources/shared/segments/rules/root/sentence.yml +723 -0
  30. data/resources/shared/segments/rules/root/word.yml +527 -0
  31. data/resources/shared/segments/rules/zh-Hant/line.yml +964 -0
  32. data/resources/shared/segments/rules/zh/line.yml +964 -0
  33. data/resources/shared/segments/suppressions/de/sentence.yml +5 -0
  34. data/resources/shared/segments/suppressions/en/sentence.yml +5 -0
  35. data/resources/shared/segments/suppressions/es/sentence.yml +5 -0
  36. data/resources/shared/segments/suppressions/fr/sentence.yml +5 -0
  37. data/resources/shared/segments/suppressions/it/sentence.yml +5 -0
  38. data/resources/shared/segments/suppressions/pt/sentence.yml +5 -0
  39. data/resources/shared/segments/suppressions/ru/sentence.yml +5 -0
  40. data/resources/shared/segments/tests/grapheme_break_test.yml +603 -0
  41. data/resources/shared/segments/tests/line_break_test.yml +7348 -0
  42. data/resources/uli/segments/de.yml +5 -230
  43. data/resources/uli/segments/en.yml +3 -154
  44. data/resources/uli/segments/es.yml +5 -145
  45. data/resources/uli/segments/fr.yml +5 -68
  46. data/resources/uli/segments/it.yml +3 -48
  47. data/resources/uli/segments/pt.yml +5 -173
  48. data/resources/uli/segments/ru.yml +3 -10
  49. data/spec/segmentation/rule_set_spec.rb +54 -27
  50. metadata +29 -9
  51. data/lib/twitter_cldr/resources/uli.rb +0 -12
  52. data/lib/twitter_cldr/resources/uli/segment_exceptions_importer.rb +0 -59
  53. data/lib/twitter_cldr/segmentation/parser.rb +0 -71
  54. data/lib/twitter_cldr/segmentation/rule.rb +0 -79
  55. data/lib/twitter_cldr/segmentation/rule_set_builder.rb +0 -142
  56. data/resources/shared/segments/segments_root.yml +0 -869
  57. data/spec/segmentation/parser_spec.rb +0 -104
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0de7de286380d82bd1bb15f2153876905a48270c43c914f19a873d7a6314abe0
4
- data.tar.gz: 100d3add1abf37b423dec0cacca0b899baed1e683abf8d1c43a7c8ce8303a3d0
3
+ metadata.gz: b616d55c343da67733837c0f1549329d895ea3758011b4c5dd8c90c3c631f53a
4
+ data.tar.gz: 3717867c2412adcc7a95ff1dfbe032e1754ade3cc005b4445d1f8ff644048b06
5
5
  SHA512:
6
- metadata.gz: d5a310d97f73576229627ffa3f46eb8d1872b2dd3032e9ec9e7e54a502db896195b5a48119fb991a5647500eec9a222ceb8774e6f293e223313ad23ea25f238b
7
- data.tar.gz: abaef5aa0122312d5aa7aa69eca068ef5f4ef270e145d168fd295cc4d7db1aaad77a1fde0e3bf31fa909d6c741d8d447a1b75588b9c297daafa26a053c9d244c
6
+ metadata.gz: f82323e912a622930f192a2ffe8b742ce1378feff847a082e04b7ea0feb5df215faa183861e31a35e69bc966c6222d0182689cdc5264de69263acbc60acce8ff
7
+ data.tar.gz: dc11c5d5e3ab6cc0f2cf3a6073686f7167ac6621252a0b6bf4c7f0b1eb53ae7134507aa0d66e5c0c57e7abc9b689a6438032b4982abe766b9c8550b6df0139b8
data/Rakefile CHANGED
@@ -155,6 +155,11 @@ namespace :update do
155
155
  TwitterCldr::Resources::CollationTestsImporter.new.import
156
156
  end
157
157
 
158
+ desc 'Import text segmentation rules'
159
+ task :segment_rules do
160
+ TwitterCldr::Resources::SegmentRulesImporter.new.import
161
+ end
162
+
158
163
  desc 'Import (generate) bidi tests (should be executed using JRuby 1.7 in 1.9 mode)'
159
164
  task :bidi_tests do
160
165
  TwitterCldr::Resources::BidiTestImporter.new.import
@@ -195,11 +200,6 @@ namespace :update do
195
200
  TwitterCldr::Resources::TransformTestsImporter.new.import
196
201
  end
197
202
 
198
- desc 'Import segment exceptions'
199
- task :segment_exceptions do
200
- TwitterCldr::Resources::Uli::SegmentExceptionsImporter.new.import
201
- end
202
-
203
203
  desc 'Import segment tests'
204
204
  task :segment_tests do
205
205
  TwitterCldr::Resources::SegmentTestsImporter.new.import
data/lib/twitter_cldr.rb CHANGED
@@ -59,6 +59,7 @@ module TwitterCldr
59
59
  def_delegator :resources, :resource_exists?
60
60
  def_delegator :resources, :locale_resource_exists?
61
61
  def_delegator :resources, :absolute_resource_path
62
+ def_delegator :resources, :resource_file_path
62
63
 
63
64
  class << self
64
65
 
@@ -22,6 +22,7 @@ module TwitterCldr
22
22
  autoload :NumberFormatsImporter, 'twitter_cldr/resources/number_formats_importer'
23
23
  autoload :PostalCodesImporter, 'twitter_cldr/resources/postal_codes_importer'
24
24
  autoload :Properties, 'twitter_cldr/resources/properties'
25
+ autoload :SegmentRulesImporter, 'twitter_cldr/resources/segment_rules_importer'
25
26
  autoload :RbnfTestImporter, 'twitter_cldr/resources/rbnf_test_importer'
26
27
  autoload :ReadmeRenderer, 'twitter_cldr/resources/readme_renderer'
27
28
  autoload :RegexpAstGenerator, 'twitter_cldr/resources/regexp_ast_generator'
@@ -37,7 +38,6 @@ module TwitterCldr
37
38
  autoload :UnicodeDataImporter, 'twitter_cldr/resources/unicode_data_importer'
38
39
  autoload :UnicodeFileParser, 'twitter_cldr/resources/unicode_file_parser'
39
40
  autoload :UnicodePropertyAliasesImporter, 'twitter_cldr/resources/unicode_property_aliases_importer'
40
- autoload :Uli, 'twitter_cldr/resources/uli'
41
41
  autoload :ValidityDataImporter, 'twitter_cldr/resources/validity_data_importer'
42
42
 
43
43
  class << self
@@ -60,6 +60,7 @@ module TwitterCldr
60
60
  NumberFormatsImporter,
61
61
  PostalCodesImporter,
62
62
  RbnfTestImporter,
63
+ SegmentRulesImporter,
63
64
  SegmentTestsImporter,
64
65
  TailoringImporter,
65
66
  TerritoriesImporter,
@@ -74,12 +75,6 @@ module TwitterCldr
74
75
  ]
75
76
  end
76
77
 
77
- def uli_importer_classes
78
- @uli_importer_classes ||= [
79
- Uli::SegmentExceptionsImporter
80
- ]
81
- end
82
-
83
78
  def property_importer_classes
84
79
  @property_importer_classes ||= [
85
80
  Properties::AgePropertyImporter,
@@ -107,7 +102,6 @@ module TwitterCldr
107
102
  def importer_classes
108
103
  @importer_classes ||=
109
104
  standard_importer_classes +
110
- uli_importer_classes +
111
105
  property_importer_classes
112
106
  end
113
107
 
@@ -73,6 +73,12 @@ module TwitterCldr
73
73
  nil
74
74
  end
75
75
 
76
+ def resource_file_path(path)
77
+ file = File.join(*path.map(&:to_s))
78
+ file << '.yml' unless file.end_with?('.yml')
79
+ file
80
+ end
81
+
76
82
  private
77
83
 
78
84
  def locale_resource_path(locale, resource_name)
@@ -85,10 +91,6 @@ module TwitterCldr
85
91
  end
86
92
  end
87
93
 
88
- def resource_file_path(path)
89
- "#{File.join(*path.map(&:to_s))}.yml"
90
- end
91
-
92
94
  def load_resource(path, merge_custom = true)
93
95
  base = YAML.load(read_resource_file(path))
94
96
  custom_path = File.join("custom", path)
@@ -37,7 +37,6 @@ module TwitterCldr
37
37
  currency_digits_and_rounding
38
38
  rbnf_root
39
39
  numbering_systems
40
- segments_root
41
40
  territories_containment
42
41
  likely_subtags
43
42
  metazones
@@ -0,0 +1,202 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'base64'
7
+ require 'fileutils'
8
+ require 'nokogiri'
9
+ require 'yaml'
10
+
11
+ module TwitterCldr
12
+ module Resources
13
+
14
+ class SegmentRulesImporter < Importer
15
+
16
+ # @TODO: moar boundary types
17
+ BOUNDARY_TYPES = {
18
+ 'word' => 'word',
19
+ 'sentence' => 'sentence',
20
+ 'grapheme' => 'grapheme',
21
+ 'line' => 'line' # loose, normal, strict
22
+ }.freeze
23
+
24
+ TYPES_TO_ATTRS = {
25
+ 'word' => 'WordBreak',
26
+ 'sentence' => 'SentenceBreak',
27
+ 'grapheme' => 'GraphemeClusterBreak',
28
+ 'line' => 'LineBreak'
29
+ }.freeze
30
+
31
+ Locale = TwitterCldr::Shared::Locale
32
+
33
+ StateTable = TwitterCldr::Segmentation::StateTable
34
+ StatusTable = TwitterCldr::Segmentation::StatusTable
35
+ CategoryTable = TwitterCldr::Segmentation::CategoryTable
36
+
37
+ requirement :icu, Versions.icu_version
38
+ requirement :cldr, Versions.cldr_version
39
+ output_path File.join('shared', 'segments')
40
+ ruby_engine :jruby
41
+
42
+ def execute
43
+ each_locale do |locale, doc|
44
+ BOUNDARY_TYPES.each do |kind, icu_kind|
45
+ seg = doc.xpath(
46
+ "//ldml/segmentations/segmentation[@type=\"#{TYPES_TO_ATTRS[kind]}\"]"
47
+ )
48
+
49
+ rule_data = rule_data_for(icu_kind, locale, seg)
50
+
51
+ unless rule_data.empty?
52
+ output_dir = File.join(output_path, 'rules', locale)
53
+ output_file = File.join(output_dir, "#{kind}.yml")
54
+ FileUtils.mkdir_p(output_dir)
55
+ File.write(output_file, YAML.dump(rule_data))
56
+ end
57
+
58
+ suppressions = suppressions_for(icu_kind, locale, seg)
59
+
60
+ unless suppressions.empty?
61
+ output_dir = File.join(output_path, 'suppressions', locale)
62
+ output_file = File.join(output_dir, "#{kind}.yml")
63
+ FileUtils.mkdir_p(output_dir)
64
+ File.write(output_file, YAML.dump(suppressions))
65
+ end
66
+ end
67
+ end
68
+ end
69
+
70
+ private
71
+
72
+ def each_locale
73
+ return to_enum(__method__) unless block_given?
74
+
75
+ pattern = File.join(requirements[:cldr].common_path, 'segments', '*.xml')
76
+
77
+ Dir.glob(pattern).each do |file, ret|
78
+ locale = File.basename(file).chomp('.xml').tr('_', '-')
79
+ yield locale, Nokogiri::XML(File.read(file))
80
+ end
81
+ end
82
+
83
+ def rule_data_for(kind, locale, doc)
84
+ vars = doc.xpath('variables/variable')
85
+ rules = doc.xpath('segmentRules/rule')
86
+ result = {}
87
+
88
+ unless vars.empty? && rules.empty?
89
+ result.merge!(encode_rbbi_data(rbbi_data_for(kind, locale)))
90
+ end
91
+
92
+ result
93
+ end
94
+
95
+ def suppressions_for(kind, locale, doc)
96
+ suppressions = doc.xpath('suppressions/suppression').map(&:text)
97
+ return {} if suppressions.empty?
98
+
99
+ encode_suppressions(suppressions)
100
+ end
101
+
102
+ def encode_rbbi_data(data)
103
+ {
104
+ metadata: metadata_from(data.fHeader),
105
+ forward_table: StateTable.new(data.fFTable.fTable.to_a, data.fFTable.fFlags).dump16,
106
+ backward_table: StateTable.new(data.fRTable.fTable.to_a, data.fRTable.fFlags).dump16,
107
+ status_table: StatusTable.new(data.fStatusTable.to_a).dump,
108
+ category_table: encode_trie(data.fTrie), # this really isn't a trie
109
+ }
110
+ end
111
+
112
+ def metadata_from(header)
113
+ { category_count: header.fCatCount }
114
+ end
115
+
116
+ def encode_suppressions(suppressions)
117
+ forwards_trie = TwitterCldr::Utils::Trie.new
118
+ backwards_trie = TwitterCldr::Utils::Trie.new
119
+
120
+ suppressions.each do |suppression|
121
+ forwards_trie.add(suppression.codepoints, true)
122
+ backwards_trie.add(suppression.reverse.codepoints, true)
123
+ end
124
+
125
+ {
126
+ forwards_trie: Marshal.dump(forwards_trie),
127
+ backwards_trie: Marshal.dump(backwards_trie)
128
+ }
129
+ end
130
+
131
+ def encode_trie(trie)
132
+ arr = [].tap do |results|
133
+ iter = trie.iterator
134
+
135
+ while iter.hasNext
136
+ range = iter.next
137
+ results << range_to_a(range)
138
+
139
+ # this should be the last entry, but for some reason ICU returns
140
+ # one more out-of-order range past the Unicode max
141
+ break if range.endCodePoint == 0x10FFFF
142
+ end
143
+ end
144
+
145
+ # @TODO: Distinguish between the 16- and 32-bit flavors
146
+ CategoryTable.new(arr).dump16.strip
147
+ end
148
+
149
+ def range_to_a(range)
150
+ [range.startCodePoint, range.endCodePoint, range.value]
151
+ end
152
+
153
+ def rbbi_data_for(kind, locale)
154
+ bundle = bundle_for(ulocale_class.new(locale))
155
+ brkf_name = bundle.getStringWithFallback("boundaries/#{kind}")
156
+ buffer = icu_binary.getData("#{brkiter_name}/#{brkf_name}")
157
+ rbbi_data_wrapper.get(buffer)
158
+ end
159
+
160
+ def bundle_for(locale)
161
+ @bundle ||= resource_bundle.getBundleInstance(brkiter_base_name, locale, locale_root)
162
+ end
163
+
164
+ def brkiter_name
165
+ @brkiter_name ||= icu_data.const_get(:ICU_BRKITR_NAME)
166
+ end
167
+
168
+ def brkiter_base_name
169
+ @brkiter_base_name ||= icu_data.const_get(:ICU_BRKITR_BASE_NAME)
170
+ end
171
+
172
+ def locale_root
173
+ @locale_root ||= resource_bundle.const_get(:OpenType).const_get(:LOCALE_ROOT)
174
+ end
175
+
176
+ def rbbi_data_wrapper
177
+ @rbbi_data_wrapper ||= requirements[:icu].get_class('com.ibm.icu.impl.RBBIDataWrapper')
178
+ end
179
+
180
+ def icu_binary
181
+ @icu_binary ||= requirements[:icu].get_class('com.ibm.icu.impl.ICUBinary')
182
+ end
183
+
184
+ def icu_data
185
+ @icu_data ||= requirements[:icu].get_class('com.ibm.icu.impl.ICUData')
186
+ end
187
+
188
+ def resource_bundle
189
+ @bundle_class ||= requirements[:icu].get_class('com.ibm.icu.impl.ICUResourceBundle')
190
+ end
191
+
192
+ def ulocale_class
193
+ @ulocale_class ||= requirements[:icu].get_class('com.ibm.icu.util.ULocale')
194
+ end
195
+
196
+ def output_path
197
+ params[:output_path]
198
+ end
199
+
200
+ end
201
+ end
202
+ end
@@ -11,7 +11,9 @@ module TwitterCldr
11
11
 
12
12
  TEST_FILES = [
13
13
  'ucd/auxiliary/WordBreakTest.txt',
14
- 'ucd/auxiliary/SentenceBreakTest.txt'
14
+ 'ucd/auxiliary/SentenceBreakTest.txt',
15
+ 'ucd/auxiliary/GraphemeBreakTest.txt',
16
+ 'ucd/auxiliary/LineBreakTest.txt'
15
17
  ]
16
18
 
17
19
  requirement :unicode, Versions.unicode_version, TEST_FILES
@@ -5,13 +5,15 @@
5
5
 
6
6
  module TwitterCldr
7
7
  module Segmentation
8
- autoload :BreakIterator, 'twitter_cldr/segmentation/break_iterator'
9
- autoload :BreakRule, 'twitter_cldr/segmentation/rule'
10
- autoload :Cursor, 'twitter_cldr/segmentation/cursor'
11
- autoload :NoBreakRule, 'twitter_cldr/segmentation/rule'
12
- autoload :Parser, 'twitter_cldr/segmentation/parser'
13
- autoload :Rule, 'twitter_cldr/segmentation/rule'
14
- autoload :RuleSet, 'twitter_cldr/segmentation/rule_set'
15
- autoload :RuleSetBuilder, 'twitter_cldr/segmentation/rule_set_builder'
8
+ autoload :BreakIterator, 'twitter_cldr/segmentation/break_iterator'
9
+ autoload :CategoryTable, 'twitter_cldr/segmentation/category_table'
10
+ autoload :Cursor, 'twitter_cldr/segmentation/cursor'
11
+ autoload :Metadata, 'twitter_cldr/segmentation/metadata'
12
+ autoload :NullSuppressions, 'twitter_cldr/segmentation/null_suppressions'
13
+ autoload :RuleSet, 'twitter_cldr/segmentation/rule_set'
14
+ autoload :StateMachine, 'twitter_cldr/segmentation/state_machine'
15
+ autoload :StateTable, 'twitter_cldr/segmentation/state_table'
16
+ autoload :StatusTable, 'twitter_cldr/segmentation/status_table'
17
+ autoload :Suppressions, 'twitter_cldr/segmentation/suppressions'
16
18
  end
17
19
  end
@@ -25,29 +25,33 @@ module TwitterCldr
25
25
  end
26
26
 
27
27
  def each_grapheme_cluster(str, &block)
28
- raise NotImplementedError,
29
- "Grapheme segmentation is not currently supported."
28
+ rule_set = rule_set_for('grapheme')
29
+ each_boundary(rule_set, str, &block)
30
30
  end
31
31
 
32
32
  def each_line(str, &block)
33
- raise NotImplementedError,
34
- "Line segmentation is not currently supported."
33
+ rule_set = rule_set_for('line')
34
+ each_boundary(rule_set, str, &block)
35
35
  end
36
36
 
37
37
  private
38
38
 
39
39
  def each_boundary(rule_set, str)
40
- if block_given?
41
- rule_set.each_boundary(str).each_cons(2) do |start, stop|
42
- yield str[start...stop], start, stop
43
- end
44
- else
45
- to_enum(__method__, rule_set, str)
40
+ return to_enum(__method__, rule_set, str) unless block_given?
41
+
42
+ rule_set.each_boundary(str).each_cons(2) do |start, stop|
43
+ yield str[start...stop], start, stop
46
44
  end
47
45
  end
48
46
 
49
47
  def rule_set_for(boundary_type)
50
- RuleSet.load(locale, boundary_type, options)
48
+ rule_set_cache[boundary_type] ||= RuleSet.create(
49
+ locale, boundary_type, options
50
+ )
51
+ end
52
+
53
+ def rule_set_cache
54
+ @rule_set_cache ||= {}
51
55
  end
52
56
  end
53
57
  end
@@ -0,0 +1,56 @@
1
+ # encoding: UTF-8
2
+
3
+ # Copyright 2012 Twitter, Inc
4
+ # http://www.apache.org/licenses/LICENSE-2.0
5
+
6
+ require 'base64'
7
+
8
+ module TwitterCldr
9
+ module Segmentation
10
+ class CategoryTable
11
+ PACK_FMT_16 = 'NNn'.freeze
12
+
13
+ class << self
14
+ def load16(data)
15
+ data = Base64.decode64(data)
16
+
17
+ new(
18
+ (0...data.size).step(10).map do |i|
19
+ data[i...(i + 10)].unpack(PACK_FMT_16)
20
+ end
21
+ )
22
+ end
23
+ end
24
+
25
+ attr_reader :values
26
+
27
+ def initialize(values)
28
+ @values = values
29
+ end
30
+
31
+ def get(codepoint)
32
+ find(codepoint)[2]
33
+ end
34
+
35
+ def dump16
36
+ data = ''.b.tap do |result|
37
+ values.each do |vals|
38
+ result << vals.pack(PACK_FMT_16)
39
+ end
40
+ end
41
+
42
+ Base64.encode64(data)
43
+ end
44
+
45
+ private
46
+
47
+ def find(codepoint)
48
+ values.bsearch do |entry|
49
+ next -1 if codepoint < entry[0]
50
+ next 1 if codepoint > entry[1]
51
+ 0
52
+ end
53
+ end
54
+ end
55
+ end
56
+ end