icu 0.9.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.travis.yml +11 -0
  4. data/Gemfile +3 -0
  5. data/LICENSE +20 -0
  6. data/README.md +69 -0
  7. data/Rakefile +38 -0
  8. data/benchmark/normalization.rb +106 -0
  9. data/benchmark/normalization_phrases.txt +1031 -0
  10. data/benchmark/normalization_result.txt +45 -0
  11. data/benchmark/normalization_wikip.txt +2838 -0
  12. data/ext/icu/extconf.rb +242 -0
  13. data/ext/icu/icu.c +18 -0
  14. data/ext/icu/icu.h +78 -0
  15. data/ext/icu/icu_charset_detector.c +192 -0
  16. data/ext/icu/icu_collator.c +138 -0
  17. data/ext/icu/icu_locale.c +852 -0
  18. data/ext/icu/icu_normalizer.c +122 -0
  19. data/ext/icu/icu_number_format.c +0 -0
  20. data/ext/icu/icu_spoof_checker.c +194 -0
  21. data/ext/icu/icu_transliterator.c +159 -0
  22. data/ext/icu/internal_encoding.c +38 -0
  23. data/ext/icu/internal_ustring.c +304 -0
  24. data/ext/icu/internal_utils.c +50 -0
  25. data/ext/icu/rb_errors.c +14 -0
  26. data/icu.gemspec +22 -0
  27. data/lib/icu.rb +6 -18
  28. data/lib/icu/charset_detector.rb +5 -0
  29. data/lib/icu/collator.rb +24 -0
  30. data/lib/icu/locale.rb +19 -0
  31. data/lib/icu/transliterator.rb +8 -0
  32. data/lib/icu/version.rb +3 -0
  33. data/spec/charset_detector_spec.rb +47 -0
  34. data/spec/collator_spec.rb +73 -0
  35. data/spec/locale_spec.rb +312 -0
  36. data/spec/normalizer_spec.rb +35 -0
  37. data/spec/spec_helper.rb +8 -0
  38. data/spec/spoof_checker_spec.rb +56 -0
  39. data/spec/transliterator_spec.rb +41 -0
  40. metadata +132 -55
  41. data/COPYING +0 -674
  42. data/COPYING.LESSER +0 -165
  43. data/README +0 -81
  44. data/ext/extconf.rb +0 -31
  45. data/ext/icu.c +0 -128
  46. data/ext/icu.h +0 -34
  47. data/ext/icu_locale.c +0 -330
  48. data/ext/icu_locale_country.c +0 -99
  49. data/ext/icu_locale_language.c +0 -99
  50. data/ext/icu_numeric.c +0 -161
  51. data/ext/icu_time.c +0 -391
  52. data/test/test_locale.rb +0 -73
  53. data/test/test_numeric.rb +0 -78
  54. data/test/test_time.rb +0 -75
@@ -0,0 +1,38 @@
1
+ #include "icu.h"
2
+
3
+ static rb_encoding* ascii_enc;
4
+ static rb_encoding* utf8_enc;
5
+ static ID ID_to_s;
6
+
7
/*
 * Tells whether a Ruby encoding index is one this extension handles through
 * its UTF-8 code path.
 *
 * Returns non-zero when enc_idx is the index of ASCII-8BIT, US-ASCII or
 * UTF-8, and zero otherwise.
 */
int icu_is_rb_enc_idx_as_utf_8(int enc_idx)
{
    if (enc_idx == rb_ascii8bit_encindex()) {
        return 1;
    }
    if (enc_idx == rb_usascii_encindex()) {
        return 1;
    }
    return enc_idx == rb_utf8_encindex();
}
13
+
14
+ int icu_is_rb_str_as_utf_8(VALUE str)
15
+ {
16
+ return icu_is_rb_enc_idx_as_utf_8(ENCODING_GET(str));
17
+ }
18
+
19
+ int icu_rb_str_enc_idx(VALUE str)
20
+ {
21
+ return ENCODING_GET(str);
22
+ }
23
+
24
+ VALUE rb_str_enc_to_ascii_as_utf8(VALUE str)
25
+ {
26
+ str = rb_funcall(str, ID_to_s, 0);
27
+ // sanitize, will raise Encoding::UndefinedConversionError if anything wrong
28
+ str = rb_str_encode(str, rb_enc_from_encoding(ascii_enc), 0, Qnil);
29
+ rb_enc_associate(str, utf8_enc);
30
+ return str;
31
+ }
32
+
33
+ void init_internal_encoding(void)
34
+ {
35
+ ascii_enc = rb_ascii8bit_encoding();
36
+ utf8_enc = rb_utf8_encoding();
37
+ ID_to_s = rb_intern("to_s");
38
+ }
@@ -0,0 +1,304 @@
1
+ #include "icu.h"
2
+ #include "unicode/ucnv.h"
3
+
4
+ // #define ICU_USTRING_DEBUG 1
5
+
6
+ #define GET_STRING(_data) icu_ustring_data* _data; \
7
+ TypedData_Get_Struct(self, icu_ustring_data, &icu_ustring_type, _data)
8
+ #define GET_STRING_VAL(_val, _data) icu_ustring_data* _data; \
9
+ TypedData_Get_Struct(_val, icu_ustring_data, &icu_ustring_type, _data)
10
+
11
+ VALUE rb_cICU_UString;
12
+
13
+ /* Data types */
14
+ typedef struct {
15
+ int32_t len;
16
+ int32_t capa;
17
+ int rb_enc_idx;
18
+ UConverter* converter;
19
+ UChar* ptr;
20
+ } icu_ustring_data;
21
+
22
+ static void icu_ustring_free(void* _this)
23
+ {
24
+ icu_ustring_data* this = _this;
25
+ ucnv_close(this->converter);
26
+ if (this->ptr != NULL) {
27
+ ruby_xfree(this->ptr);
28
+ }
29
+ }
30
+
31
+ static size_t icu_ustring_memsize(const void* data)
32
+ {
33
+ icu_ustring_data* this = (icu_ustring_data*)data;
34
+ size_t size = sizeof(UChar) * this->capa;
35
+ return size + sizeof(icu_ustring_data);
36
+ }
37
+
38
+ static const rb_data_type_t icu_ustring_type = {
39
+ "icu/ustring",
40
+ {NULL, icu_ustring_free, icu_ustring_memsize,},
41
+ 0, 0,
42
+ RUBY_TYPED_FREE_IMMEDIATELY,
43
+ };
44
+
45
+ // avoid name conflicts
46
+ #include "ruby/encoding.h"
47
+
48
+ /* Always allocate the internal string in a C function where you modify it
49
+ * as Ruby GC scans the C stacks and registers to find out GC root
50
+ */
51
+ VALUE icu_ustring_alloc(VALUE self)
52
+ {
53
+ icu_ustring_data* this;
54
+ return TypedData_Make_Struct(self, icu_ustring_data, &icu_ustring_type, this);
55
+ }
56
+
57
+ void icu_ustring_resize(VALUE self, int32_t capa)
58
+ {
59
+ GET_STRING(this);
60
+ REALLOC_N(this->ptr, UChar, capa);
61
+ this->capa = capa;
62
+ }
63
+
64
+ // some ICU gives a pointer to buffer, used for releasing ptr since we don't manage it
65
+ void icu_ustring_clear_ptr(VALUE self)
66
+ {
67
+ GET_STRING(this);
68
+ this->ptr = NULL;
69
+ }
70
+
71
+ /*
72
+ Initialize the internal object. Used to convert string to ICU string.
73
+ See also:
74
+ - icu_ustring_init_with_capa_enc
75
+ - icu_ustring_from_uchar_str
76
+ */
77
+ VALUE icu_ustring_from_rb_str(VALUE rb_str)
78
+ {
79
+ StringValue(rb_str);
80
+ VALUE u_str = icu_ustring_alloc(rb_cICU_UString);
81
+ GET_STRING_VAL(u_str, this);
82
+ UErrorCode status = U_ZERO_ERROR;
83
+
84
+ this->rb_enc_idx = icu_rb_str_enc_idx(rb_str);
85
+ // take UTF-8 code path
86
+ if (icu_is_rb_enc_idx_as_utf_8(this->rb_enc_idx)) {
87
+ this->converter = NULL;
88
+ } else {
89
+ this->converter = ucnv_open(ICU_RB_STRING_ENC_NAME_IDX(this->rb_enc_idx), &status);
90
+ if (U_FAILURE(status)) {
91
+ icu_rb_raise_icu_error(status);
92
+ }
93
+ }
94
+
95
+ this->capa = RSTRING_LENINT(rb_str) + RUBY_C_STRING_TERMINATOR_SIZE;
96
+ this->ptr = ALLOC_N(UChar, this->capa);
97
+
98
+ #ifdef ICU_USTRING_DEBUG
99
+ printf("icu_ustring_from_rb_str: %p %p %p %p %ld\n", u_str, this->ptr, rb_str, StringValuePtr(rb_str), RSTRING_LENINT(rb_str));
100
+ #endif
101
+
102
+ status = U_ZERO_ERROR;
103
+ int retried = FALSE;
104
+ int32_t len;
105
+ do {
106
+ if (this->converter == NULL) {
107
+ u_strFromUTF8(this->ptr, this->capa, &len,
108
+ RSTRING_PTR(rb_str), RSTRING_LENINT(rb_str),
109
+ &status);
110
+ } else {
111
+ len = ucnv_toUChars(this->converter, this->ptr, this->capa,
112
+ RSTRING_PTR(rb_str), RSTRING_LENINT(rb_str),
113
+ &status);
114
+ }
115
+ if (!retried && status == U_BUFFER_OVERFLOW_ERROR) {
116
+ retried = TRUE;
117
+ this->capa = len + RUBY_C_STRING_TERMINATOR_SIZE;
118
+ REALLOC_N(this->ptr, UChar, this->capa);
119
+ status = U_ZERO_ERROR;
120
+ } else if (U_FAILURE(status)) {
121
+ icu_rb_raise_icu_error(status);
122
+ } else { // retried == true && U_SUCCESS(status)
123
+ break;
124
+ }
125
+ } while (retried);
126
+ this->len = len;
127
+
128
+ return u_str;
129
+ }
130
+
131
+ /*
132
+ Initialize the internal object. Used to convert string to Ruby string.
133
+ If the pointer points to a memory space managed by ICU, releasing it before the EOL of our object.
134
+ See also:
135
+ - icu_ustring_init_with_capa_enc
136
+ - icu_ustring_from_uchar_str
137
+ - icu_ustring_clear_ptr
138
+ */
139
+ VALUE icu_ustring_from_uchar_str(const UChar* str, int32_t len)
140
+ {
141
+ if (len <= 0) {
142
+ len = u_strlen(str);
143
+ }
144
+ VALUE u_str = icu_ustring_init_with_capa_enc(len + RUBY_C_STRING_TERMINATOR_SIZE, ICU_RUBY_ENCODING_INDEX);
145
+ GET_STRING_VAL(u_str, this);
146
+ this->ptr = (UChar*)str;
147
+ this->len = len;
148
+ return u_str;
149
+ }
150
+
151
+ /*
152
+ Initialize the internal object. Usually used as a buffer.
153
+ See also:
154
+ - icu_ustring_from_rb_str
155
+ - icu_ustring_from_uchar_str
156
+ */
157
+ VALUE icu_ustring_init_with_capa_enc(int32_t capa, int enc_idx)
158
+ {
159
+ VALUE buf = icu_ustring_alloc(rb_cICU_UString);
160
+ GET_STRING_VAL(buf, this);
161
+ icu_ustring_set_enc(buf, enc_idx);
162
+ this->capa = capa;
163
+ this->ptr = ALLOC_N(UChar, capa);
164
+ return buf;
165
+ }
166
+
167
+ void icu_ustring_set_enc(VALUE self, int enc_idx)
168
+ {
169
+ GET_STRING(this);
170
+ if (this->converter != NULL) {
171
+ ucnv_close(this->converter);
172
+ }
173
+
174
+ this->rb_enc_idx = enc_idx;
175
+ // take UTF-8 code path
176
+ if (icu_is_rb_enc_idx_as_utf_8(enc_idx)) {
177
+ this->converter = NULL;
178
+ } else {
179
+ UErrorCode status = U_ZERO_ERROR;
180
+ this->converter = ucnv_open(ICU_RB_STRING_ENC_NAME_IDX(this->rb_enc_idx), &status);
181
+ if (U_FAILURE(status)) {
182
+ icu_rb_raise_icu_error(status);
183
+ }
184
+ }
185
+ }
186
+
187
+ VALUE icu_ustring_to_rb_enc_str_with_len(VALUE self, int32_t len)
188
+ {
189
+ GET_STRING(this);
190
+
191
+ #ifdef ICU_USTRING_DEBUG
192
+ printf("icu_ustring_to_rb_enc_str_with_len: %p %ld\n", self, len);
193
+ #endif
194
+
195
+ this->len = len;
196
+ return icu_ustring_to_rb_enc_str(self);
197
+ }
198
+
199
+ VALUE icu_ustring_to_rb_enc_str(VALUE self)
200
+ {
201
+ GET_STRING(this);
202
+ if (this->len < 0) {
203
+ this->len = 0;
204
+ }
205
+
206
+ #ifdef ICU_USTRING_DEBUG
207
+ printf("icu_ustring_to_rb_enc_str: %p %d %d\n", self, this->len, this->capa);
208
+ #endif
209
+
210
+ int32_t dest_len;
211
+ int32_t dest_capa = this->len + RUBY_C_STRING_TERMINATOR_SIZE;
212
+ char* dest = ALLOC_N(char, dest_capa);
213
+ int retried = FALSE;
214
+ UErrorCode status = U_ZERO_ERROR;
215
+ do {
216
+ if (this->converter == NULL) {
217
+ u_strToUTF8(dest, dest_capa, &dest_len, this->ptr, this->len, &status);
218
+ } else {
219
+ dest_len = ucnv_fromUChars(this->converter, dest, dest_capa,
220
+ this->ptr, this->len,
221
+ &status);
222
+ }
223
+ if (!retried && status == U_BUFFER_OVERFLOW_ERROR) {
224
+ retried = TRUE;
225
+ dest_capa = dest_len + RUBY_C_STRING_TERMINATOR_SIZE;
226
+ REALLOC_N(dest, char, dest_capa);
227
+ status = U_ZERO_ERROR;
228
+ } else if (U_FAILURE(status)) {
229
+ ruby_xfree(dest);
230
+ icu_rb_raise_icu_error(status);
231
+ } else { // retried == true && U_SUCCESS(status)
232
+ break;
233
+ }
234
+ } while (retried);
235
+
236
+ #ifdef ICU_USTRING_DEBUG
237
+ printf("icu_ustring_to_rb_enc_str (before creating rb str): %p %d %d %d\n", (void *)self, this->len, this->capa, this->rb_enc_idx);
238
+ printf("icu_ustring_to_rb_enc_str (pointers): %p %p %d\n", (void *)self, dest, dest_len);
239
+ #endif
240
+
241
+ VALUE rb_str = rb_enc_str_new(dest, dest_len, rb_enc_from_index(this->rb_enc_idx));
242
+ ruby_xfree(dest);
243
+ OBJ_TAINT(rb_str);
244
+ return rb_str;
245
+ }
246
+
247
+
248
+ inline UChar* icu_ustring_ptr_internal(const icu_ustring_data *this)
249
+ {
250
+ return this->ptr;
251
+ }
252
+
253
+ inline UChar* icu_ustring_ptr(VALUE self)
254
+ {
255
+ GET_STRING(this);
256
+ return icu_ustring_ptr_internal(this);
257
+ }
258
+
259
+ inline int32_t icu_ustring_len_internal(const icu_ustring_data *this)
260
+ {
261
+ return this->len;
262
+ }
263
+
264
+ inline int32_t icu_ustring_len(VALUE self)
265
+ {
266
+ GET_STRING(this);
267
+ return icu_ustring_len_internal(this);
268
+ }
269
+
270
+ inline int32_t icu_ustring_capa_internal(const icu_ustring_data *this)
271
+ {
272
+ return this->capa;
273
+ }
274
+
275
+ inline int32_t icu_ustring_capa(VALUE self)
276
+ {
277
+ GET_STRING(this);
278
+ return icu_ustring_capa_internal(this);
279
+ }
280
+
281
+ inline VALUE char_buffer_to_rb_str(const char* buffer)
282
+ {
283
+ VALUE str = rb_str_new_cstr(buffer);
284
+ return rb_str_export_to_enc(str, rb_enc_from_index(ICU_RUBY_ENCODING_INDEX));
285
+ }
286
+
287
+ inline char* char_buffer_new(int32_t buf_size)
288
+ {
289
+ char* buffer = ALLOC_N(char, buf_size);
290
+ return buffer;
291
+ }
292
+
293
+ inline void char_buffer_resize(const char* buffer, int32_t buf_size)
294
+ {
295
+ REALLOC_N(buffer, char, buf_size);
296
+ }
297
+
298
/* Releases a buffer with ruby_xfree (constness is cast away for the call). */
inline void char_buffer_free(const char* buffer)
{
    void* raw = (void*)buffer;
    ruby_xfree(raw);
}
302
+
303
+ #undef GET_STRING
304
+ #undef GET_STRING_VAL
@@ -0,0 +1,50 @@
1
+ #include "icu.h"
2
+
3
+ VALUE icu_enum_to_rb_ary(UEnumeration* icu_enum, UErrorCode status, long pre_allocated)
4
+ {
5
+ if (U_FAILURE(status)) {
6
+ uenum_close(icu_enum);
7
+ icu_rb_raise_icu_error(status);
8
+ }
9
+ VALUE result = rb_ary_new2(pre_allocated);
10
+ const UChar* ptr = NULL;
11
+ int32_t len = 0;
12
+ status = U_ZERO_ERROR;
13
+ while ((ptr = uenum_unext(icu_enum, &len, &status)) != NULL) {
14
+ if (U_FAILURE(status)) {
15
+ uenum_close(icu_enum);
16
+ icu_rb_raise_icu_error(status);
17
+ }
18
+ VALUE s = icu_ustring_from_uchar_str(ptr, len);
19
+ rb_ary_push(result, icu_ustring_to_rb_enc_str(s));
20
+ icu_ustring_clear_ptr(s);
21
+ status = U_ZERO_ERROR;
22
+ }
23
+ uenum_close(icu_enum);
24
+ return result;
25
+ }
26
+
27
+ extern inline void icu_rb_raise_icu_error(UErrorCode status)
28
+ {
29
+ rb_raise(rb_eICU_Error, "ICU Error Code: %d, %s.", status, u_errorName(status));
30
+ }
31
+
32
+
33
+ extern inline void
34
+ icu_rb_raise_icu_invalid_parameter(const char* parameter,
35
+ const char* error_message)
36
+ {
37
+ rb_raise(rb_eICU_InvalidParameterError,
38
+ "ICU Invalid parameter: %s, %s.",
39
+ parameter,
40
+ error_message);
41
+ }
42
+
43
+
44
+ extern inline void icu_rb_raise_icu_parse_error(const UParseError* error)
45
+ {
46
+ rb_raise(rb_eICU_InvalidParameterError,
47
+ "ICU Parse Error: Line %d, offset %d.",
48
+ error->line,
49
+ error->offset);
50
+ }
@@ -0,0 +1,14 @@
1
+ #include "icu.h"
2
+
3
+ VALUE rb_eICU_Error;
4
+ VALUE rb_eICU_InvalidParameterError;
5
+
6
+ void init_rb_errors(void)
7
+ {
8
+ rb_eICU_Error = rb_define_class_under(rb_mICU, "Error", rb_eStandardError);
9
+ rb_define_alias(rb_eICU_Error, "error", "message");
10
+ rb_eICU_InvalidParameterError = rb_define_class_under(rb_mICU, "InvalidParameterError", rb_eStandardError);
11
+ rb_define_alias(rb_eICU_InvalidParameterError, "error", "message");
12
+ }
13
+
14
+ /* vim: set expandtab sws=4 sw=4: */
@@ -0,0 +1,22 @@
1
# Gem specification for the `icu` gem (Ruby bindings to ICU).
require File.expand_path('../lib/icu/version', __FILE__)

Gem::Specification.new do |spec|
  # Identity
  spec.name     = "icu"
  spec.version  = ICU::VERSION
  spec.authors  = ["Erick Guan"]
  spec.email    = ["fantasticfears@gmail.com"]
  spec.license  = "MIT"
  spec.homepage = "https://github.com/fantasticfears/icu4r"
  spec.summary  = %q{A Unicode processing functions ruby gem, binding to ICU}
  spec.required_ruby_version = '>= 2.2.7'

  # Packaged files are taken from the git index.
  spec.extensions    = ["ext/icu/extconf.rb"]
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Development-only dependencies
  spec.add_development_dependency 'rake-compiler', ">= 0.7.5"
  spec.add_development_dependency 'rspec'
  spec.add_development_dependency 'pry'
  spec.add_development_dependency 'mini_portile2', '~> 2.2.0'
end
data/lib/icu.rb CHANGED
@@ -1,18 +1,6 @@
1
- # Copyright (c) 2007 Joshua Llorach
2
- #
3
- # This file is part of ICU Ruby extension.
4
- #
5
- # ICU is free software: you can redistribute it and/or modify
6
- # it under the terms of the GNU Lesser General Public License as published by
7
- # the Free Software Foundation, either version 3 of the License, or
8
- # (at your option) any later version.
9
- #
10
- # ICU is distributed in the hope that it will be useful,
11
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
- # GNU Lesser General Public License for more details.
14
- #
15
- # You should have received a copy of the GNU Lesser General Public License
16
- # along with ICU. If not, see <http://www.gnu.org/licenses/>.
17
-
18
- require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'icu.so'))
1
+ require 'icu/icu'
2
+ require 'icu/version'
3
+ require 'icu/collator'
4
+ require 'icu/transliterator'
5
+ require 'icu/charset_detector'
6
+ require 'icu/locale'