icu 0.9.1 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54)
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.travis.yml +11 -0
  4. data/Gemfile +3 -0
  5. data/LICENSE +20 -0
  6. data/README.md +69 -0
  7. data/Rakefile +38 -0
  8. data/benchmark/normalization.rb +106 -0
  9. data/benchmark/normalization_phrases.txt +1031 -0
  10. data/benchmark/normalization_result.txt +45 -0
  11. data/benchmark/normalization_wikip.txt +2838 -0
  12. data/ext/icu/extconf.rb +242 -0
  13. data/ext/icu/icu.c +18 -0
  14. data/ext/icu/icu.h +78 -0
  15. data/ext/icu/icu_charset_detector.c +192 -0
  16. data/ext/icu/icu_collator.c +138 -0
  17. data/ext/icu/icu_locale.c +852 -0
  18. data/ext/icu/icu_normalizer.c +122 -0
  19. data/ext/icu/icu_number_format.c +0 -0
  20. data/ext/icu/icu_spoof_checker.c +194 -0
  21. data/ext/icu/icu_transliterator.c +159 -0
  22. data/ext/icu/internal_encoding.c +38 -0
  23. data/ext/icu/internal_ustring.c +304 -0
  24. data/ext/icu/internal_utils.c +50 -0
  25. data/ext/icu/rb_errors.c +14 -0
  26. data/icu.gemspec +22 -0
  27. data/lib/icu.rb +6 -18
  28. data/lib/icu/charset_detector.rb +5 -0
  29. data/lib/icu/collator.rb +24 -0
  30. data/lib/icu/locale.rb +19 -0
  31. data/lib/icu/transliterator.rb +8 -0
  32. data/lib/icu/version.rb +3 -0
  33. data/spec/charset_detector_spec.rb +47 -0
  34. data/spec/collator_spec.rb +73 -0
  35. data/spec/locale_spec.rb +312 -0
  36. data/spec/normalizer_spec.rb +35 -0
  37. data/spec/spec_helper.rb +8 -0
  38. data/spec/spoof_checker_spec.rb +56 -0
  39. data/spec/transliterator_spec.rb +41 -0
  40. metadata +132 -55
  41. data/COPYING +0 -674
  42. data/COPYING.LESSER +0 -165
  43. data/README +0 -81
  44. data/ext/extconf.rb +0 -31
  45. data/ext/icu.c +0 -128
  46. data/ext/icu.h +0 -34
  47. data/ext/icu_locale.c +0 -330
  48. data/ext/icu_locale_country.c +0 -99
  49. data/ext/icu_locale_language.c +0 -99
  50. data/ext/icu_numeric.c +0 -161
  51. data/ext/icu_time.c +0 -391
  52. data/test/test_locale.rb +0 -73
  53. data/test/test_numeric.rb +0 -78
  54. data/test/test_time.rb +0 -75
@@ -0,0 +1,5 @@
1
+ module ICU
2
+ class CharsetDetector
3
+ alias input_filter_enabled? input_filter
4
+ end
5
+ end
@@ -0,0 +1,24 @@
1
+ module ICU
2
+ class Collator
3
+ def self.sort(locale, strings)
4
+ self.new(locale)
5
+ .sort(strings)
6
+ end
7
+
8
+ def sort(strings)
9
+ strings.sort { |a, b| compare(a, b) }
10
+ end
11
+
12
+ def greater?(str_a, str_b)
13
+ compare(str_a, str_b) > 0
14
+ end
15
+
16
+ def greater_or_equal?(str_a, str_b)
17
+ compare(str_a, str_b) >= 0
18
+ end
19
+
20
+ def equal?(str_a, str_b)
21
+ compare(str_a, str_b).zero?
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,19 @@
1
+ module ICU
2
+ class Locale
3
+ attr_reader :id, :enc
4
+
5
+ def ==(other)
6
+ other.is_a?(self.class) && other.id == self.id
7
+ end
8
+
9
+ alias === ==
10
+ alias to_s id
11
+
12
+ def with_keywords(keywords)
13
+ keywords.reduce(self) do |locale, (keyword, value)|
14
+ # p locale, keyword, value
15
+ locale.with_keyword(keyword, value)
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,8 @@
1
+ module ICU
2
+ class Transliterator
3
+ def self.transliterate(id, str, rules = nil, direction = nil)
4
+ self.new(id, rules, direction)
5
+ .transliterate(str)
6
+ end
7
+ end
8
+ end
@@ -0,0 +1,3 @@
1
+ module ICU
2
+ VERSION = "0.10.0"
3
+ end
@@ -0,0 +1,47 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'spec_helper'
4
+
5
+ describe ICU::CharsetDetector do
6
+ describe '.detect' do
7
+ it "should recognize UTF-8" do
8
+ m = subject.detect("æåø")
9
+ expect(m.name).to eq "UTF-8"
10
+ expect(m.language).to be_kind_of(String)
11
+ end
12
+
13
+ it "should support null bytes" do
14
+ # Create a utf-16 string and then force it to binary (ascii) to mimic data from net/http
15
+ string = "foo".encode("UTF-16").force_encoding("binary")
16
+ m = subject.detect(string)
17
+ expect(m.name).to eq "UTF-16BE"
18
+ expect(m.language).to be_kind_of(String)
19
+ end
20
+ end
21
+
22
+ describe '.detect_all' do
23
+ it "should detect several matching encodings" do
24
+ expect(subject.detect_all("foo bar")).to be_instance_of(Array)
25
+ end
26
+ end
27
+
28
+ describe "input filter" do
29
+ it "should disable / enable the input filter" do
30
+ subject.input_filter = false
31
+ expect(subject.input_filter_enabled?).to be_falsey
32
+ expect(subject.input_filter).to be_falsey
33
+ subject.input_filter = true
34
+ expect(subject.input_filter_enabled?).to be_truthy
35
+ expect(subject.input_filter).to be_truthy
36
+ end
37
+ end
38
+
39
+ describe '.detectable_charsets' do
40
+ it "returns an array of detectable charsets" do
41
+ cs = subject.detectable_charsets
42
+ expect(cs).to be_kind_of(Array)
43
+ expect(cs).not_to be_empty
44
+ expect(cs.first).to be_kind_of(String)
45
+ end
46
+ end
47
+ end
@@ -0,0 +1,73 @@
1
+ require "spec_helper"
2
+
3
+ describe ICU::Collator do
4
+ let(:loc) { "nb" }
5
+ subject { ICU::Collator.new(loc) }
6
+
7
+ describe '.sort' do
8
+ it "should sort an array of strings" do
9
+ expect(subject.sort(%w[å ø æ])).to eq %w[æ ø å]
10
+ end
11
+ end
12
+
13
+ describe '.compare' do
14
+ it "should compare two strings" do
15
+ expect(subject.compare("blåbærsyltetøy", "blah")).to eq 1
16
+ expect(subject.compare("blåbærsyltetøy".encode("UTF-16"), "blah")).to eq 1
17
+ expect(subject.compare("blåbærsyltetøy", "blah")).to eq 1
18
+ expect(subject.compare("blah", "blah")).to eq 0
19
+ expect(subject.compare("blah".encode("UTF-16"), "blah".encode("UTF-32"))).to eq 0
20
+ expect(subject.compare("ba", "bl")).to eq -1
21
+ end
22
+ end
23
+
24
+ describe '.locale' do
25
+ subject { ICU::Collator.new("en_US_CALIFORNIA") }
26
+
27
+ it 'returns the valid locale of the collator' do
28
+ expect(subject.locale).to eq "en_US"
29
+ end
30
+
31
+ it 'returns the actual locale of the collator' do
32
+ expect(subject.locale(:actual)).to eq "root"
33
+ end
34
+ end
35
+
36
+ describe '.greater?' do
37
+ it "returns true when the former is greater" do
38
+ expect(subject.greater?("z", "a")).to be_truthy
39
+ expect(subject.greater?("a", "z")).to be_falsey
40
+ end
41
+ end
42
+
43
+ describe '.greater_or_equal?' do
44
+ it "returns true when the former is greater or equal" do
45
+ expect(subject.greater_or_equal?("z", "a")).to be_truthy
46
+ expect(subject.greater_or_equal?("z", "z")).to be_truthy
47
+ expect(subject.greater_or_equal?("a", "z")).to be_falsey
48
+ end
49
+ end
50
+
51
+ describe '.equal?' do
52
+ it "returns true when the former is equal" do
53
+ expect(subject.equal?("a", "a")).to be_truthy
54
+ expect(subject.equal?("a", "b")).to be_falsey
55
+ end
56
+ end
57
+
58
+ describe '.rules' do
59
+ it "should return rules" do
60
+ expect(subject.rules).not_to be_empty
61
+ # ö sorts before Ö
62
+ expect(subject.rules.include?('ö<<<Ö')).to be_truthy
63
+ end
64
+ end
65
+
66
+ describe '#sort' do
67
+ subject { ICU::Collator }
68
+
69
+ it "sorts the array of strings" do
70
+ expect(subject.sort(loc, %w[å ø æ])).to eq %w[æ ø å]
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,312 @@
1
+ require 'spec_helper'
2
+
3
+ describe ICU::Locale do
4
+ describe '#avalable' do
5
+ subject { ICU::Locale.available }
6
+
7
+ it { is_expected.to be_an Array }
8
+ it { is_expected.not_to be_empty }
9
+ it { expect(subject.first).to be_a ICU::Locale }
10
+ end
11
+
12
+ describe '#iso_countries' do
13
+ subject { ICU::Locale.iso_countries }
14
+
15
+ it { is_expected.to be_an Array }
16
+ it { is_expected.not_to be_empty }
17
+ it { expect(subject.first).to be_a String }
18
+ end
19
+
20
+ describe '#default' do
21
+ subject { ICU::Locale.default }
22
+
23
+ let(:locale) do
24
+ locales = ICU::Locale.available
25
+ locales.delete(ICU::Locale.default)
26
+ locales.respond_to?(:sample) ? locales.sample : locales.choice
27
+ end
28
+
29
+ it { is_expected.to be_a ICU::Locale }
30
+
31
+ it 'can be assigned using Locale' do
32
+ default_locale = locale
33
+ expect(default_locale).to eq locale
34
+ end
35
+
36
+ it 'can be assigned using string' do
37
+ string = locale.to_s
38
+
39
+ ICU::Locale.default = string
40
+ expect(subject.to_s).to eq string
41
+ expect(subject).to eq ICU::Locale.new(string)
42
+ end
43
+
44
+ it 'can be assigned using symbol' do
45
+ symbol = locale.to_s.to_sym
46
+
47
+ ICU::Locale.default = symbol
48
+ expect(ICU::Locale.default).to eq ICU::Locale.new(symbol)
49
+ end
50
+ end
51
+
52
+ describe '.new' do
53
+ it "raises when locale can't be encoded by ASCII" do
54
+ expect { ICU::Locale.new("中文") }.to raise_error(Encoding::UndefinedConversionError)
55
+ end
56
+ end
57
+
58
+ describe '.==' do
59
+ let(:locale) { "en" }
60
+ it 'returns true when the @id is exact the same and class matches' do
61
+ expect(ICU::Locale.new(locale) == ICU::Locale.new("en")).to be_truthy
62
+ end
63
+
64
+ it 'returns false when the class is different' do
65
+ class TmpLocale
66
+ def id
67
+ locale
68
+ end
69
+ end
70
+ expect(ICU::Locale.new(locale) == TmpLocale.new).to be_falsey
71
+ end
72
+ end
73
+
74
+ context 'with ICU locale ID' do
75
+ describe '#for_language_tag' do
76
+ it 'converts a language tag to a locale' do
77
+ expect(ICU::Locale.for_language_tag('en-us')).to eq ICU::Locale.new('en_US')
78
+ expect(ICU::Locale.for_language_tag('nan-Hant-tw')).to eq ICU::Locale.new('nan_Hant_TW')
79
+ end
80
+ end
81
+
82
+ describe '.language_tag' do
83
+ it 'returns a language tag for a locale' do
84
+ expect(ICU::Locale.new('en_US').language_tag).to eq 'en-US'
85
+ expect(ICU::Locale.new('zh_TW').language_tag).to eq 'zh-TW'
86
+ expect(ICU::Locale.new('zh_Hans_CH_PINYIN').language_tag).to eq 'zh-Hans-CH-u-co-pinyin'
87
+ end
88
+ end
89
+
90
+ describe '#for_lcid' do
91
+ it 'converts an LCID to a locale' do
92
+ expect(ICU::Locale.for_lcid(1033)).to eq ICU::Locale.new('en_US')
93
+ expect(ICU::Locale.for_lcid(1036)).to eq ICU::Locale.new('fr_FR')
94
+ end
95
+ end
96
+
97
+ describe '.lcid' do
98
+ it 'returns an LCID for a locale' do
99
+ expect(ICU::Locale.new('en_US').lcid).to eq 1033
100
+ expect(ICU::Locale.new('es_US').lcid).to eq 21514
101
+ end
102
+ end
103
+
104
+ describe '.display_country' do
105
+ it 'returns the country' do
106
+ expect(ICU::Locale.new('de_DE').display_country('en')).to eq 'Germany'
107
+ expect(ICU::Locale.new('en_US').display_country('fr')).to eq 'États-Unis'
108
+ end
109
+ end
110
+
111
+ describe '.display_language' do
112
+ it 'returns the language' do
113
+ expect(ICU::Locale.new('fr_FR').display_language('de')).to eq 'Französisch'
114
+ expect(ICU::Locale.new('zh_CH').display_language('en')).to eq 'Chinese'
115
+ end
116
+ end
117
+
118
+ describe '.display_name' do
119
+ it 'returns the name' do
120
+ expect(ICU::Locale.new('en_US').display_name('de')).to eq 'Englisch (Vereinigte Staaten)'
121
+ expect(ICU::Locale.new('zh_CH').display_name('fr')).to eq 'chinois (Suisse)'
122
+ end
123
+ end
124
+
125
+ describe '.display_script' do
126
+ it 'returns the script' do
127
+ expect(ICU::Locale.new('ja_Hira_JP').display_script('en')).to eq 'Hiragana'
128
+ expect(ICU::Locale.new('ja_Hira_JP').display_script('ru')).to eq 'хирагана'
129
+ end
130
+ end
131
+
132
+ describe '.display_variant' do
133
+ it 'returns the variant' do
134
+ expect(ICU::Locale.new('be_BY_TARASK').display_variant('de')).to eq 'Taraskievica-Orthographie'
135
+ expect(ICU::Locale.new('zh_CH_POSIX').display_variant('en')).to eq 'Computer'
136
+ end
137
+ end
138
+
139
+ context 'with default locale' do
140
+ let(:default) { ICU::Locale.default }
141
+ subject { ICU::Locale.new('de_DE') }
142
+
143
+ it 'returns the country' do
144
+ expect(subject.display_country).to eq subject.display_country(default)
145
+ end
146
+
147
+ it 'returns the language' do
148
+ expect(subject.display_language).to eq subject.display_language(default)
149
+ end
150
+
151
+ it 'returns the name' do
152
+ expect(subject.display_name).to eq subject.display_name(default)
153
+ end
154
+
155
+ it 'returns the script' do
156
+ expect(subject.display_script).to eq subject.display_script(default)
157
+ end
158
+
159
+ it 'returns the variant' do
160
+ expect(subject.display_variant).to eq subject.display_variant(default)
161
+ end
162
+ end
163
+ end
164
+
165
+ context 'formatting' do
166
+ subject { ICU::Locale.new('de-de.utf8@collation = phonebook') }
167
+
168
+ describe '.name' do
169
+ it 'is formatted' do
170
+ expect(subject.name).to eq 'de_DE.utf8@collation=phonebook'
171
+ end
172
+ end
173
+
174
+ describe '.base_name' do
175
+ it 'is formatted without keywords' do
176
+ expect(subject.base_name).to eq 'de_DE.utf8'
177
+ end
178
+ end
179
+
180
+ describe '.canonical_name' do
181
+ it 'is formatted for ICU' do
182
+ expect(subject.canonical_name).to eq 'de_DE@collation=phonebook'
183
+ end
184
+ end
185
+ end
186
+
187
+ describe '.parent' do
188
+ it 'truncates a properly formatted locale, returning the "parent"' do
189
+ expect(ICU::Locale.new('es-mx').parent).to eq ''
190
+ expect(ICU::Locale.new('es_MX').parent).to eq 'es'
191
+ expect(ICU::Locale.new('zh_Hans_CH_PINYIN').parent).to eq 'zh_Hans_CH'
192
+ end
193
+ end
194
+
195
+ describe '.iso_country' do
196
+ it 'returns the ISO 3166 alpha-3 country code' do
197
+ expect(ICU::Locale.new('en_US').iso_country).to eq 'USA'
198
+ expect(ICU::Locale.new('zh_CN').iso_country).to eq 'CHN'
199
+ end
200
+ end
201
+
202
+ describe '.iso_language' do
203
+ it 'returns the ISO 639 three-letter language code' do
204
+ expect(ICU::Locale.new('en_US').iso_language).to eq 'eng'
205
+ expect(ICU::Locale.new('zh_CN').iso_language).to eq 'zho'
206
+ end
207
+ end
208
+
209
+ describe '.keywords' do
210
+ context 'when improperly formatted' do
211
+ let(:locale) { ICU::Locale.new('de_DE@euro') }
212
+
213
+ it 'raises an error' do
214
+ expect { locale.keywords }.to raise_error(ICU::Error)
215
+ end
216
+ end
217
+
218
+ context 'when properly formatted' do
219
+ let(:locale) { ICU::Locale.new('de_DE@currency=EUR') }
220
+
221
+ it 'returns the list of keywords' do
222
+ expect(locale.keywords).to eq ['currency']
223
+ end
224
+ end
225
+ end
226
+
227
+ describe '.keyword' do
228
+ it 'can be read' do
229
+ expect(ICU::Locale.new('en_US@calendar=chinese').keyword('calendar')).to eq 'chinese'
230
+ expect(ICU::Locale.new('en_US@calendar=chinese').keyword(:calendar)).to eq 'chinese'
231
+ expect(ICU::Locale.new('en_US@some=thing').keyword('missing')).to eq ''
232
+ end
233
+ end
234
+
235
+ describe '.with_keyword' do
236
+ it 'can be added' do
237
+ expect(ICU::Locale.new('de_DE').with_keyword('currency', 'EUR')).to eq ICU::Locale.new('de_DE@currency=EUR')
238
+ expect(ICU::Locale.new('de_DE').with_keyword(:currency, :EUR)).to eq ICU::Locale.new('de_DE@currency=EUR')
239
+ end
240
+
241
+ it 'can be removed' do
242
+ expect(ICU::Locale.new('en_US@some=thing').with_keyword(:some, nil)).to eq ICU::Locale.new('en_US')
243
+ expect(ICU::Locale.new('en_US@some=thing').with_keyword(:some, '')).to eq ICU::Locale.new('en_US')
244
+ end
245
+ end
246
+
247
+ describe '.with_keywords' do
248
+ it 'can be added using hash' do
249
+ expect(ICU::Locale.new('fr').with_keywords(:a => :b, :c => :d)).to eq ICU::Locale.new('fr@a=b;c=d')
250
+ end
251
+ end
252
+
253
+ describe '.character_orientation' do
254
+ it 'returns the character orientation' do
255
+ expect(ICU::Locale.new('ar').character_orientation).to eq :rtl
256
+ expect(ICU::Locale.new('en').character_orientation).to eq :ltr
257
+ expect(ICU::Locale.new('fa').character_orientation).to eq :rtl
258
+ end
259
+ end
260
+
261
+ describe '.line_orientation' do
262
+ it 'returns the line orientation' do
263
+ expect(ICU::Locale.new('ar').line_orientation).to eq :ttb
264
+ expect(ICU::Locale.new('en').line_orientation).to eq :ttb
265
+ expect(ICU::Locale.new('fa').line_orientation).to eq :ttb
266
+ end
267
+ end
268
+
269
+ context 'subtags' do
270
+ subject { ICU::Locale.new('zh-hans-ch-pinyin') }
271
+
272
+ describe '.country' do
273
+ it 'returns the country code' do
274
+ expect(subject.country).to eq 'CH'
275
+ end
276
+ end
277
+
278
+ describe '.language' do
279
+ it 'returns the language code' do
280
+ expect(subject.language).to eq 'zh'
281
+ end
282
+ end
283
+
284
+ describe '.script' do
285
+ it 'returns the script code' do
286
+ expect(subject.script).to eq 'Hans'
287
+ end
288
+ end
289
+
290
+ describe '.variant' do
291
+ it 'returns the variant code' do
292
+ expect(subject.variant).to eq 'PINYIN'
293
+ end
294
+ end
295
+
296
+ describe '.with_likely_subtags' do
297
+ it 'adds likely subtags' do
298
+ expect(ICU::Locale.new('en').with_likely_subtags).to eq ICU::Locale.new('en_Latn_US')
299
+ expect(ICU::Locale.new('sr').with_likely_subtags).to eq ICU::Locale.new('sr_Cyrl_RS')
300
+ expect(ICU::Locale.new('zh_TW').with_likely_subtags).to eq ICU::Locale.new('zh_Hant_TW')
301
+ end
302
+ end
303
+
304
+ describe '.with_minimized_subtags' do
305
+ it 'removes likely subtags' do
306
+ expect(ICU::Locale.new('en_US').with_minimized_subtags).to eq ICU::Locale.new('en')
307
+ expect(ICU::Locale.new('sr_RS').with_minimized_subtags).to eq ICU::Locale.new('sr')
308
+ expect(ICU::Locale.new('zh_Hant_TW').with_minimized_subtags).to eq ICU::Locale.new('zh_TW')
309
+ end
310
+ end
311
+ end
312
+ end