icu 0.9.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.travis.yml +11 -0
  4. data/Gemfile +3 -0
  5. data/LICENSE +20 -0
  6. data/README.md +69 -0
  7. data/Rakefile +38 -0
  8. data/benchmark/normalization.rb +106 -0
  9. data/benchmark/normalization_phrases.txt +1031 -0
  10. data/benchmark/normalization_result.txt +45 -0
  11. data/benchmark/normalization_wikip.txt +2838 -0
  12. data/ext/icu/extconf.rb +242 -0
  13. data/ext/icu/icu.c +18 -0
  14. data/ext/icu/icu.h +78 -0
  15. data/ext/icu/icu_charset_detector.c +192 -0
  16. data/ext/icu/icu_collator.c +138 -0
  17. data/ext/icu/icu_locale.c +852 -0
  18. data/ext/icu/icu_normalizer.c +122 -0
  19. data/ext/icu/icu_number_format.c +0 -0
  20. data/ext/icu/icu_spoof_checker.c +194 -0
  21. data/ext/icu/icu_transliterator.c +159 -0
  22. data/ext/icu/internal_encoding.c +38 -0
  23. data/ext/icu/internal_ustring.c +304 -0
  24. data/ext/icu/internal_utils.c +50 -0
  25. data/ext/icu/rb_errors.c +14 -0
  26. data/icu.gemspec +22 -0
  27. data/lib/icu.rb +6 -18
  28. data/lib/icu/charset_detector.rb +5 -0
  29. data/lib/icu/collator.rb +24 -0
  30. data/lib/icu/locale.rb +19 -0
  31. data/lib/icu/transliterator.rb +8 -0
  32. data/lib/icu/version.rb +3 -0
  33. data/spec/charset_detector_spec.rb +47 -0
  34. data/spec/collator_spec.rb +73 -0
  35. data/spec/locale_spec.rb +312 -0
  36. data/spec/normalizer_spec.rb +35 -0
  37. data/spec/spec_helper.rb +8 -0
  38. data/spec/spoof_checker_spec.rb +56 -0
  39. data/spec/transliterator_spec.rb +41 -0
  40. metadata +132 -55
  41. data/COPYING +0 -674
  42. data/COPYING.LESSER +0 -165
  43. data/README +0 -81
  44. data/ext/extconf.rb +0 -31
  45. data/ext/icu.c +0 -128
  46. data/ext/icu.h +0 -34
  47. data/ext/icu_locale.c +0 -330
  48. data/ext/icu_locale_country.c +0 -99
  49. data/ext/icu_locale_language.c +0 -99
  50. data/ext/icu_numeric.c +0 -161
  51. data/ext/icu_time.c +0 -391
  52. data/test/test_locale.rb +0 -73
  53. data/test/test_numeric.rb +0 -78
  54. data/test/test_time.rb +0 -75
@@ -0,0 +1,5 @@
module ICU
  # Ruby-level additions to ICU::CharsetDetector.
  class CharsetDetector
    # Predicate-style name for the existing +input_filter+ reader.
    # +input_filter+ is not defined in this file; it must already exist on
    # the class by the time this file is loaded.
    alias_method :input_filter_enabled?, :input_filter
  end
end
@@ -0,0 +1,24 @@
module ICU
  # Ruby-level convenience helpers for ICU::Collator.
  #
  # Every helper here is built on +#compare+, which is not defined in this
  # file. Its result is treated as an Integer whose sign gives the ordering
  # (negative, zero, positive).
  class Collator
    # Sorts +strings+ with a throwaway collator built for +locale+.
    #
    # @param locale  passed straight to +Collator.new+
    # @param strings [Array] the strings to order
    # @return [Array] a new, sorted array
    def self.sort(locale, strings)
      new(locale).sort(strings)
    end

    # Returns a new array holding +strings+ ordered by +#compare+.
    #
    # @param strings [Array]
    # @return [Array]
    def sort(strings)
      strings.sort { |a, b| compare(a, b) }
    end

    # @return [Boolean] true when +str_a+ collates after +str_b+
    def greater?(str_a, str_b)
      compare(str_a, str_b) > 0
    end

    # @return [Boolean] true when +str_a+ collates after, or equal to, +str_b+
    def greater_or_equal?(str_a, str_b)
      compare(str_a, str_b) >= 0
    end

    # With two arguments: true when both strings collate as equal.
    #
    # With one argument: the inherited object-identity check. Ruby documents
    # that +equal?+ should not be overridden because it is used for identity
    # tests; keeping the one-argument form working means code that calls
    # +collator.equal?(other)+ no longer raises ArgumentError.
    #
    # @param str_a first string (or the object to identity-compare against)
    # @param rest  optionally, the second string
    # @return [Boolean]
    # @raise [ArgumentError] when more than two arguments are given
    def equal?(str_a, *rest)
      return super(str_a) if rest.empty?

      if rest.size > 1
        raise ArgumentError,
              "wrong number of arguments (given #{rest.size + 1}, expected 1..2)"
      end

      compare(str_a, rest.first).zero?
    end
  end
end
@@ -0,0 +1,19 @@
module ICU
  # Ruby-level additions to ICU::Locale: equality, hashing, string
  # conversion and a bulk keyword helper.
  class Locale
    # Readers for @id and @enc. Neither variable is assigned in this file.
    attr_reader :id, :enc

    # Two locales are equal when +other+ is an instance of this object's
    # class (or a subclass) and both expose the same +id+.
    #
    # @param other [Object]
    # @return [Boolean]
    def ==(other)
      other.is_a?(self.class) && other.id == id
    end

    alias === ==
    # Hash, Array#uniq and friends use eql?/hash, so they must agree with ==.
    alias eql? ==
    alias to_s id

    # Hash code derived only from +id+, so any two locales that are == also
    # produce the same hash.
    #
    # @return [Integer]
    def hash
      id.hash
    end

    # Applies every keyword/value pair in turn through +#with_keyword+
    # (not defined in this file), feeding each result into the next call.
    #
    # @param keywords [Hash, Enumerable] pairs of keyword and value
    # @return the result of the last +with_keyword+ call, or +self+ when
    #   +keywords+ is empty
    def with_keywords(keywords)
      keywords.reduce(self) do |locale, (keyword, value)|
        locale.with_keyword(keyword, value)
      end
    end
  end
end
@@ -0,0 +1,8 @@
module ICU
  # Ruby-level additions to ICU::Transliterator.
  class Transliterator
    class << self
      # One-shot helper: builds a transliterator from +id+, +rules+ and
      # +direction+, runs it over +str+ and returns whatever the instance
      # method +#transliterate+ returns. The constructor and the instance
      # method are defined outside this file.
      def transliterate(id, str, rules = nil, direction = nil)
        instance = new(id, rules, direction)
        instance.transliterate(str)
      end
    end
  end
end
@@ -0,0 +1,3 @@
module ICU
  # Gem version string. Frozen so it cannot be mutated in place at runtime.
  VERSION = "0.10.0".freeze
end
@@ -0,0 +1,47 @@
# encoding: UTF-8

require 'spec_helper'

# Specs for ICU::CharsetDetector. Every example calls methods on the implicit
# subject (an instance), so the groups are labelled with `#`.
describe ICU::CharsetDetector do
  describe '#detect' do
    it "should recognize UTF-8" do
      m = subject.detect("æåø")
      expect(m.name).to eq "UTF-8"
      expect(m.language).to be_kind_of(String)
    end

    it "should support null bytes" do
      # Create a utf-16 string and then force it to binary (ascii) to mimic data from net/http
      string = "foo".encode("UTF-16").force_encoding("binary")
      m = subject.detect(string)
      expect(m.name).to eq "UTF-16BE"
      expect(m.language).to be_kind_of(String)
    end
  end

  describe '#detect_all' do
    it "should detect several matching encodings" do
      expect(subject.detect_all("foo bar")).to be_instance_of(Array)
    end
  end

  describe "input filter" do
    it "should disable / enable the input filter" do
      subject.input_filter = false
      expect(subject.input_filter_enabled?).to be_falsey
      expect(subject.input_filter).to be_falsey
      subject.input_filter = true
      expect(subject.input_filter_enabled?).to be_truthy
      expect(subject.input_filter).to be_truthy
    end
  end

  describe '#detectable_charsets' do
    it "returns an array of detectable charsets" do
      cs = subject.detectable_charsets
      expect(cs).to be_kind_of(Array)
      expect(cs).not_to be_empty
      expect(cs.first).to be_kind_of(String)
    end
  end
end
@@ -0,0 +1,73 @@
require "spec_helper"

# Specs for ICU::Collator. Groups labelled `#name` exercise instance methods
# on a collator built for the "nb" locale; `.sort` exercises the class method.
describe ICU::Collator do
  let(:loc) { "nb" }
  subject { ICU::Collator.new(loc) }

  describe '#sort' do
    it "should sort an array of strings" do
      expect(subject.sort(%w[å ø æ])).to eq %w[æ ø å]
    end
  end

  describe '#compare' do
    it "should compare two strings" do
      expect(subject.compare("blåbærsyltetøy", "blah")).to eq 1
      expect(subject.compare("blåbærsyltetøy".encode("UTF-16"), "blah")).to eq 1
      expect(subject.compare("blah", "blah")).to eq 0
      expect(subject.compare("blah".encode("UTF-16"), "blah".encode("UTF-32"))).to eq 0
      # Parenthesised so Ruby does not warn about an ambiguous first argument.
      expect(subject.compare("ba", "bl")).to eq(-1)
    end
  end

  describe '#locale' do
    subject { ICU::Collator.new("en_US_CALIFORNIA") }

    it 'returns the valid locale of the collator' do
      expect(subject.locale).to eq "en_US"
    end

    it 'returns the actual locale of the collator' do
      expect(subject.locale(:actual)).to eq "root"
    end
  end

  describe '#greater?' do
    it "returns true when the former is greater" do
      expect(subject.greater?("z", "a")).to be_truthy
      expect(subject.greater?("a", "z")).to be_falsey
    end
  end

  describe '#greater_or_equal?' do
    it "returns true when the former is greater or equal" do
      expect(subject.greater_or_equal?("z", "a")).to be_truthy
      expect(subject.greater_or_equal?("z", "z")).to be_truthy
      expect(subject.greater_or_equal?("a", "z")).to be_falsey
    end
  end

  describe '#equal?' do
    it "returns true when the former is equal" do
      expect(subject.equal?("a", "a")).to be_truthy
      expect(subject.equal?("a", "b")).to be_falsey
    end
  end

  describe '#rules' do
    it "should return rules" do
      expect(subject.rules).not_to be_empty
      # ö sorts before Ö
      expect(subject.rules.include?('ö<<<Ö')).to be_truthy
    end
  end

  describe '.sort' do
    subject { ICU::Collator }

    it "sorts the array of strings" do
      expect(subject.sort(loc, %w[å ø æ])).to eq %w[æ ø å]
    end
  end
end
@@ -0,0 +1,312 @@
require 'spec_helper'

# Specs for ICU::Locale. Groups labelled `.name` exercise class-level methods,
# groups labelled `#name` exercise methods on a locale instance.
describe ICU::Locale do
  describe '.available' do
    subject { ICU::Locale.available }

    it { is_expected.to be_an Array }
    it { is_expected.not_to be_empty }
    it { expect(subject.first).to be_a ICU::Locale }
  end

  describe '.iso_countries' do
    subject { ICU::Locale.iso_countries }

    it { is_expected.to be_an Array }
    it { is_expected.not_to be_empty }
    it { expect(subject.first).to be_a String }
  end

  describe '.default' do
    subject { ICU::Locale.default }

    # A random available locale that is not the current default.
    let(:locale) do
      locales = ICU::Locale.available
      locales.delete(ICU::Locale.default)
      locales.sample
    end

    # The examples below reassign the process-wide default locale; put the
    # original back afterwards so later examples do not inherit a random one.
    around do |example|
      original = ICU::Locale.default.to_s
      begin
        example.run
      ensure
        ICU::Locale.default = original
      end
    end

    it { is_expected.to be_a ICU::Locale }

    it 'can be assigned using Locale' do
      ICU::Locale.default = locale
      expect(subject).to eq locale
    end

    it 'can be assigned using string' do
      string = locale.to_s

      ICU::Locale.default = string
      expect(subject.to_s).to eq string
      expect(subject).to eq ICU::Locale.new(string)
    end

    it 'can be assigned using symbol' do
      symbol = locale.to_s.to_sym

      ICU::Locale.default = symbol
      expect(ICU::Locale.default).to eq ICU::Locale.new(symbol)
    end
  end

  describe '.new' do
    it "raises when locale can't be encoded by ASCII" do
      expect { ICU::Locale.new("中文") }.to raise_error(Encoding::UndefinedConversionError)
    end
  end

  describe '#==' do
    let(:locale) { "en" }
    it 'returns true when the @id is exact the same and class matches' do
      expect(ICU::Locale.new(locale) == ICU::Locale.new("en")).to be_truthy
    end

    it 'returns false when the class is different' do
      # A look-alike that exposes the same id but is not an ICU::Locale, so
      # only the class check can make the comparison false.
      impostor = Struct.new(:id).new(locale)
      expect(ICU::Locale.new(locale) == impostor).to be_falsey
    end
  end

  context 'with ICU locale ID' do
    describe '.for_language_tag' do
      it 'converts a language tag to a locale' do
        expect(ICU::Locale.for_language_tag('en-us')).to eq ICU::Locale.new('en_US')
        expect(ICU::Locale.for_language_tag('nan-Hant-tw')).to eq ICU::Locale.new('nan_Hant_TW')
      end
    end

    describe '#language_tag' do
      it 'returns a language tag for a locale' do
        expect(ICU::Locale.new('en_US').language_tag).to eq 'en-US'
        expect(ICU::Locale.new('zh_TW').language_tag).to eq 'zh-TW'
        expect(ICU::Locale.new('zh_Hans_CH_PINYIN').language_tag).to eq 'zh-Hans-CH-u-co-pinyin'
      end
    end

    describe '.for_lcid' do
      it 'converts an LCID to a locale' do
        expect(ICU::Locale.for_lcid(1033)).to eq ICU::Locale.new('en_US')
        expect(ICU::Locale.for_lcid(1036)).to eq ICU::Locale.new('fr_FR')
      end
    end

    describe '#lcid' do
      it 'returns an LCID for a locale' do
        expect(ICU::Locale.new('en_US').lcid).to eq 1033
        expect(ICU::Locale.new('es_US').lcid).to eq 21514
      end
    end

    describe '#display_country' do
      it 'returns the country' do
        expect(ICU::Locale.new('de_DE').display_country('en')).to eq 'Germany'
        expect(ICU::Locale.new('en_US').display_country('fr')).to eq 'États-Unis'
      end
    end

    describe '#display_language' do
      it 'returns the language' do
        expect(ICU::Locale.new('fr_FR').display_language('de')).to eq 'Französisch'
        expect(ICU::Locale.new('zh_CH').display_language('en')).to eq 'Chinese'
      end
    end

    describe '#display_name' do
      it 'returns the name' do
        expect(ICU::Locale.new('en_US').display_name('de')).to eq 'Englisch (Vereinigte Staaten)'
        expect(ICU::Locale.new('zh_CH').display_name('fr')).to eq 'chinois (Suisse)'
      end
    end

    describe '#display_script' do
      it 'returns the script' do
        expect(ICU::Locale.new('ja_Hira_JP').display_script('en')).to eq 'Hiragana'
        expect(ICU::Locale.new('ja_Hira_JP').display_script('ru')).to eq 'хирагана'
      end
    end

    describe '#display_variant' do
      it 'returns the variant' do
        expect(ICU::Locale.new('be_BY_TARASK').display_variant('de')).to eq 'Taraskievica-Orthographie'
        expect(ICU::Locale.new('zh_CH_POSIX').display_variant('en')).to eq 'Computer'
      end
    end

    context 'with default locale' do
      let(:default) { ICU::Locale.default }
      subject { ICU::Locale.new('de_DE') }

      it 'returns the country' do
        expect(subject.display_country).to eq subject.display_country(default)
      end

      it 'returns the language' do
        expect(subject.display_language).to eq subject.display_language(default)
      end

      it 'returns the name' do
        expect(subject.display_name).to eq subject.display_name(default)
      end

      it 'returns the script' do
        expect(subject.display_script).to eq subject.display_script(default)
      end

      it 'returns the variant' do
        expect(subject.display_variant).to eq subject.display_variant(default)
      end
    end
  end

  context 'formatting' do
    subject { ICU::Locale.new('de-de.utf8@collation = phonebook') }

    describe '#name' do
      it 'is formatted' do
        expect(subject.name).to eq 'de_DE.utf8@collation=phonebook'
      end
    end

    describe '#base_name' do
      it 'is formatted without keywords' do
        expect(subject.base_name).to eq 'de_DE.utf8'
      end
    end

    describe '#canonical_name' do
      it 'is formatted for ICU' do
        expect(subject.canonical_name).to eq 'de_DE@collation=phonebook'
      end
    end
  end

  describe '#parent' do
    it 'truncates a properly formatted locale, returning the "parent"' do
      expect(ICU::Locale.new('es-mx').parent).to eq ''
      expect(ICU::Locale.new('es_MX').parent).to eq 'es'
      expect(ICU::Locale.new('zh_Hans_CH_PINYIN').parent).to eq 'zh_Hans_CH'
    end
  end

  describe '#iso_country' do
    it 'returns the ISO 3166 alpha-3 country code' do
      expect(ICU::Locale.new('en_US').iso_country).to eq 'USA'
      expect(ICU::Locale.new('zh_CN').iso_country).to eq 'CHN'
    end
  end

  describe '#iso_language' do
    it 'returns the ISO 639 three-letter language code' do
      expect(ICU::Locale.new('en_US').iso_language).to eq 'eng'
      expect(ICU::Locale.new('zh_CN').iso_language).to eq 'zho'
    end
  end

  describe '#keywords' do
    context 'when improperly formatted' do
      let(:locale) { ICU::Locale.new('de_DE@euro') }

      it 'raises an error' do
        expect { locale.keywords }.to raise_error(ICU::Error)
      end
    end

    context 'when properly formatted' do
      let(:locale) { ICU::Locale.new('de_DE@currency=EUR') }

      it 'returns the list of keywords' do
        expect(locale.keywords).to eq ['currency']
      end
    end
  end

  describe '#keyword' do
    it 'can be read' do
      expect(ICU::Locale.new('en_US@calendar=chinese').keyword('calendar')).to eq 'chinese'
      expect(ICU::Locale.new('en_US@calendar=chinese').keyword(:calendar)).to eq 'chinese'
      expect(ICU::Locale.new('en_US@some=thing').keyword('missing')).to eq ''
    end
  end

  describe '#with_keyword' do
    it 'can be added' do
      expect(ICU::Locale.new('de_DE').with_keyword('currency', 'EUR')).to eq ICU::Locale.new('de_DE@currency=EUR')
      expect(ICU::Locale.new('de_DE').with_keyword(:currency, :EUR)).to eq ICU::Locale.new('de_DE@currency=EUR')
    end

    it 'can be removed' do
      expect(ICU::Locale.new('en_US@some=thing').with_keyword(:some, nil)).to eq ICU::Locale.new('en_US')
      expect(ICU::Locale.new('en_US@some=thing').with_keyword(:some, '')).to eq ICU::Locale.new('en_US')
    end
  end

  describe '#with_keywords' do
    it 'can be added using hash' do
      expect(ICU::Locale.new('fr').with_keywords(:a => :b, :c => :d)).to eq ICU::Locale.new('fr@a=b;c=d')
    end
  end

  describe '#character_orientation' do
    it 'returns the character orientation' do
      expect(ICU::Locale.new('ar').character_orientation).to eq :rtl
      expect(ICU::Locale.new('en').character_orientation).to eq :ltr
      expect(ICU::Locale.new('fa').character_orientation).to eq :rtl
    end
  end

  describe '#line_orientation' do
    it 'returns the line orientation' do
      expect(ICU::Locale.new('ar').line_orientation).to eq :ttb
      expect(ICU::Locale.new('en').line_orientation).to eq :ttb
      expect(ICU::Locale.new('fa').line_orientation).to eq :ttb
    end
  end

  context 'subtags' do
    subject { ICU::Locale.new('zh-hans-ch-pinyin') }

    describe '#country' do
      it 'returns the country code' do
        expect(subject.country).to eq 'CH'
      end
    end

    describe '#language' do
      it 'returns the language code' do
        expect(subject.language).to eq 'zh'
      end
    end

    describe '#script' do
      it 'returns the script code' do
        expect(subject.script).to eq 'Hans'
      end
    end

    describe '#variant' do
      it 'returns the variant code' do
        expect(subject.variant).to eq 'PINYIN'
      end
    end

    describe '#with_likely_subtags' do
      it 'adds likely subtags' do
        expect(ICU::Locale.new('en').with_likely_subtags).to eq ICU::Locale.new('en_Latn_US')
        expect(ICU::Locale.new('sr').with_likely_subtags).to eq ICU::Locale.new('sr_Cyrl_RS')
        expect(ICU::Locale.new('zh_TW').with_likely_subtags).to eq ICU::Locale.new('zh_Hant_TW')
      end
    end

    describe '#with_minimized_subtags' do
      it 'removes likely subtags' do
        expect(ICU::Locale.new('en_US').with_minimized_subtags).to eq ICU::Locale.new('en')
        expect(ICU::Locale.new('sr_RS').with_minimized_subtags).to eq ICU::Locale.new('sr')
        expect(ICU::Locale.new('zh_Hant_TW').with_minimized_subtags).to eq ICU::Locale.new('zh_TW')
      end
    end
  end
end