icu 0.9.1 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.travis.yml +11 -0
  4. data/Gemfile +3 -0
  5. data/LICENSE +20 -0
  6. data/README.md +69 -0
  7. data/Rakefile +38 -0
  8. data/benchmark/normalization.rb +106 -0
  9. data/benchmark/normalization_phrases.txt +1031 -0
  10. data/benchmark/normalization_result.txt +45 -0
  11. data/benchmark/normalization_wikip.txt +2838 -0
  12. data/ext/icu/extconf.rb +242 -0
  13. data/ext/icu/icu.c +18 -0
  14. data/ext/icu/icu.h +78 -0
  15. data/ext/icu/icu_charset_detector.c +192 -0
  16. data/ext/icu/icu_collator.c +138 -0
  17. data/ext/icu/icu_locale.c +852 -0
  18. data/ext/icu/icu_normalizer.c +122 -0
  19. data/ext/icu/icu_number_format.c +0 -0
  20. data/ext/icu/icu_spoof_checker.c +194 -0
  21. data/ext/icu/icu_transliterator.c +159 -0
  22. data/ext/icu/internal_encoding.c +38 -0
  23. data/ext/icu/internal_ustring.c +304 -0
  24. data/ext/icu/internal_utils.c +50 -0
  25. data/ext/icu/rb_errors.c +14 -0
  26. data/icu.gemspec +22 -0
  27. data/lib/icu.rb +6 -18
  28. data/lib/icu/charset_detector.rb +5 -0
  29. data/lib/icu/collator.rb +24 -0
  30. data/lib/icu/locale.rb +19 -0
  31. data/lib/icu/transliterator.rb +8 -0
  32. data/lib/icu/version.rb +3 -0
  33. data/spec/charset_detector_spec.rb +47 -0
  34. data/spec/collator_spec.rb +73 -0
  35. data/spec/locale_spec.rb +312 -0
  36. data/spec/normalizer_spec.rb +35 -0
  37. data/spec/spec_helper.rb +8 -0
  38. data/spec/spoof_checker_spec.rb +56 -0
  39. data/spec/transliterator_spec.rb +41 -0
  40. metadata +132 -55
  41. data/COPYING +0 -674
  42. data/COPYING.LESSER +0 -165
  43. data/README +0 -81
  44. data/ext/extconf.rb +0 -31
  45. data/ext/icu.c +0 -128
  46. data/ext/icu.h +0 -34
  47. data/ext/icu_locale.c +0 -330
  48. data/ext/icu_locale_country.c +0 -99
  49. data/ext/icu_locale_language.c +0 -99
  50. data/ext/icu_numeric.c +0 -161
  51. data/ext/icu_time.c +0 -391
  52. data/test/test_locale.rb +0 -73
  53. data/test/test_numeric.rb +0 -78
  54. data/test/test_time.rb +0 -75
@@ -0,0 +1,38 @@
1
+ #include "icu.h"
2
+
3
+ static rb_encoding* ascii_enc;
4
+ static rb_encoding* utf8_enc;
5
+ static ID ID_to_s;
6
+ /* Returns nonzero when enc_idx is the ASCII-8BIT, US-ASCII or UTF-8 encoding index, zero otherwise. */
7
+ int icu_is_rb_enc_idx_as_utf_8(int enc_idx)
8
+ {
9
+ return enc_idx == rb_ascii8bit_encindex() ||
10
+ enc_idx == rb_usascii_encindex() ||
11
+ enc_idx == rb_utf8_encindex();
12
+ }
13
+ /* Applies icu_is_rb_enc_idx_as_utf_8 to the encoding index that ENCODING_GET reads from str. */
14
+ int icu_is_rb_str_as_utf_8(VALUE str)
15
+ {
16
+ return icu_is_rb_enc_idx_as_utf_8(ENCODING_GET(str));
17
+ }
18
+ /* Returns the encoding index of str as reported by ENCODING_GET. */
19
+ int icu_rb_str_enc_idx(VALUE str)
20
+ {
21
+ return ENCODING_GET(str);
22
+ }
23
+ /* Calls to_s on str, transcodes the result to ascii_enc, tags it with utf8_enc and returns it. */
24
+ VALUE rb_str_enc_to_ascii_as_utf8(VALUE str)
25
+ {
26
+ str = rb_funcall(str, ID_to_s, 0);
27
+ // Sanitize: the transcoding below raises Encoding::UndefinedConversionError when a character cannot be converted.
28
+ str = rb_str_encode(str, rb_enc_from_encoding(ascii_enc), 0, Qnil); // NOTE(review): ascii_enc is set from rb_ascii8bit_encoding(), not US-ASCII - confirm intended
29
+ rb_enc_associate(str, utf8_enc);
30
+ return str;
31
+ }
32
+ /* Caches the encoding handles and the to_s method ID used by rb_str_enc_to_ascii_as_utf8. */
33
+ void init_internal_encoding(void)
34
+ {
35
+ ascii_enc = rb_ascii8bit_encoding(); // ASCII-8BIT, despite the variable name
36
+ utf8_enc = rb_utf8_encoding();
37
+ ID_to_s = rb_intern("to_s");
38
+ }
@@ -0,0 +1,304 @@
1
+ #include "icu.h"
2
+ #include "unicode/ucnv.h"
3
+
4
+ // #define ICU_USTRING_DEBUG 1
5
+ /* Declare a local icu_ustring_data pointer named _data and load it from `self` (GET_STRING) or from _val (GET_STRING_VAL). */
6
+ #define GET_STRING(_data) icu_ustring_data* _data; \
7
+ TypedData_Get_Struct(self, icu_ustring_data, &icu_ustring_type, _data)
8
+ #define GET_STRING_VAL(_val, _data) icu_ustring_data* _data; \
9
+ TypedData_Get_Struct(_val, icu_ustring_data, &icu_ustring_type, _data)
10
+
11
+ VALUE rb_cICU_UString;
12
+
13
+ /* Data types: state behind each wrapped UChar string. */
14
+ typedef struct {
15
+ int32_t len; // number of UChars in use; passed as the source length when converting back to a Ruby string
16
+ int32_t capa; // buffer capacity in UChars (the count given to ALLOC_N / REALLOC_N)
17
+ int rb_enc_idx; // Ruby encoding index used when a Ruby string is produced
18
+ UConverter* converter; // NULL when the u_strFromUTF8 / u_strToUTF8 path is used
19
+ UChar* ptr; // UChar buffer; may be borrowed, see icu_ustring_clear_ptr
20
+ } icu_ustring_data;
21
+
22
+ static void icu_ustring_free(void* _this)
23
+ {
24
+ icu_ustring_data* this = _this;
25
+ ucnv_close(this->converter);
26
+ if (this->ptr != NULL) {
27
+ ruby_xfree(this->ptr);
28
+ }
29
+ }
30
+ /* Size callback of icu_ustring_type: sizeof(icu_ustring_data) plus capa UChars. */
31
+ static size_t icu_ustring_memsize(const void* data)
32
+ {
33
+ icu_ustring_data* this = (icu_ustring_data*)data;
34
+ size_t size = sizeof(UChar) * this->capa; // counted from capa even when ptr is NULL or borrowed
35
+ return size + sizeof(icu_ustring_data);
36
+ }
37
+ /* Typed-data descriptor for icu_ustring_data: no mark callback, icu_ustring_free to free, icu_ustring_memsize for size. */
38
+ static const rb_data_type_t icu_ustring_type = {
39
+ "icu/ustring",
40
+ {NULL, icu_ustring_free, icu_ustring_memsize,},
41
+ 0, 0,
42
+ RUBY_TYPED_FREE_IMMEDIATELY,
43
+ };
44
+
45
+ // avoid name conflicts
46
+ #include "ruby/encoding.h"
47
+
48
+ /* Wraps a new icu_ustring_data in an object of class `self`. Keep the returned VALUE in a local of the
49
+ * C function that modifies it: Ruby's GC scans the C stack and registers to find its GC roots.
50
+ */
51
+ VALUE icu_ustring_alloc(VALUE self)
52
+ {
53
+ icu_ustring_data* this;
54
+ return TypedData_Make_Struct(self, icu_ustring_data, &icu_ustring_type, this);
55
+ }
56
+ /* Reallocates self's UChar buffer to capa elements and records the new capacity; len is left untouched. */
57
+ void icu_ustring_resize(VALUE self, int32_t capa)
58
+ {
59
+ GET_STRING(this);
60
+ REALLOC_N(this->ptr, UChar, capa);
61
+ this->capa = capa;
62
+ }
63
+
64
+ // Forgets the buffer pointer without freeing it. For buffers handed out by ICU that we do not own, so that icu_ustring_free does not release them.
65
+ void icu_ustring_clear_ptr(VALUE self)
66
+ {
67
+ GET_STRING(this);
68
+ this->ptr = NULL;
69
+ }
70
+
71
+ /*
72
+ Builds an icu_ustring_data object holding rb_str converted to UChars; raises through icu_rb_raise_icu_error on an ICU failure.
73
+ See also:
74
+ - icu_ustring_init_with_capa_enc
75
+ - icu_ustring_from_uchar_str
76
+ */
77
+ VALUE icu_ustring_from_rb_str(VALUE rb_str)
78
+ {
79
+ StringValue(rb_str);
80
+ VALUE u_str = icu_ustring_alloc(rb_cICU_UString);
81
+ GET_STRING_VAL(u_str, this);
82
+ UErrorCode status = U_ZERO_ERROR;
83
+
84
+ this->rb_enc_idx = icu_rb_str_enc_idx(rb_str);
85
+ // take UTF-8 code path (no converter) for the encodings accepted by icu_is_rb_enc_idx_as_utf_8
86
+ if (icu_is_rb_enc_idx_as_utf_8(this->rb_enc_idx)) {
87
+ this->converter = NULL;
88
+ } else {
89
+ this->converter = ucnv_open(ICU_RB_STRING_ENC_NAME_IDX(this->rb_enc_idx), &status);
90
+ if (U_FAILURE(status)) {
91
+ icu_rb_raise_icu_error(status);
92
+ }
93
+ }
94
+
95
+ this->capa = RSTRING_LENINT(rb_str) + RUBY_C_STRING_TERMINATOR_SIZE; // first guess: one UChar per input byte plus the terminator size
96
+ this->ptr = ALLOC_N(UChar, this->capa);
97
+
98
+ #ifdef ICU_USTRING_DEBUG
99
+ printf("icu_ustring_from_rb_str: %p %p %p %p %ld\n", u_str, this->ptr, rb_str, StringValuePtr(rb_str), RSTRING_LENINT(rb_str));
100
+ #endif
101
+
102
+ status = U_ZERO_ERROR;
103
+ int retried = FALSE;
104
+ int32_t len;
105
+ do {
106
+ if (this->converter == NULL) {
107
+ u_strFromUTF8(this->ptr, this->capa, &len,
108
+ RSTRING_PTR(rb_str), RSTRING_LENINT(rb_str),
109
+ &status);
110
+ } else {
111
+ len = ucnv_toUChars(this->converter, this->ptr, this->capa,
112
+ RSTRING_PTR(rb_str), RSTRING_LENINT(rb_str),
113
+ &status);
114
+ }
115
+ if (!retried && status == U_BUFFER_OVERFLOW_ERROR) { // first overflow: resize from the reported len and go round once more
116
+ retried = TRUE;
117
+ this->capa = len + RUBY_C_STRING_TERMINATOR_SIZE;
118
+ REALLOC_N(this->ptr, UChar, this->capa);
119
+ status = U_ZERO_ERROR;
120
+ } else if (U_FAILURE(status)) { // includes a second overflow after the retry
121
+ icu_rb_raise_icu_error(status);
122
+ } else { // U_SUCCESS(status), on the first pass or after the retry
123
+ break;
124
+ }
125
+ } while (retried);
126
+ this->len = len;
127
+
128
+ return u_str;
129
+ }
130
+
131
+ /*
132
+ Wraps an existing UChar buffer without copying it, so that it can be converted to a Ruby string.
133
+ The buffer is borrowed: call icu_ustring_clear_ptr on the result before the object is collected, otherwise
134
+ icu_ustring_free hands the borrowed pointer to ruby_xfree. When len <= 0 the length is taken from u_strlen(str).
135
+ */
136
+ VALUE icu_ustring_from_uchar_str(const UChar* str, int32_t len)
137
+ {
138
+ if (len <= 0) {
139
+ len = u_strlen(str);
140
+ }
141
+ // icu_ustring_init_with_capa_enc is not used here: the buffer it allocates would leak once ptr is replaced below.
142
+ VALUE u_str = icu_ustring_alloc(rb_cICU_UString);
143
+ GET_STRING_VAL(u_str, this);
144
+ icu_ustring_set_enc(u_str, ICU_RUBY_ENCODING_INDEX);
145
+ this->capa = len + RUBY_C_STRING_TERMINATOR_SIZE;
146
+ this->ptr = (UChar*)str;
147
+ this->len = len;
148
+ return u_str;
149
+ }
150
+
151
+ /*
152
+ Creates an icu_ustring_data object with a newly allocated buffer of capa UChars and the
153
+ encoding enc_idx (applied through icu_ustring_set_enc). Usually used as a buffer.
154
+ len is not set here.
155
+ See also: icu_ustring_from_rb_str, icu_ustring_from_uchar_str
156
+ */
157
+ VALUE icu_ustring_init_with_capa_enc(int32_t capa, int enc_idx)
158
+ {
159
+ VALUE buf = icu_ustring_alloc(rb_cICU_UString);
160
+ GET_STRING_VAL(buf, this);
161
+ icu_ustring_set_enc(buf, enc_idx);
162
+ this->capa = capa;
163
+ this->ptr = ALLOC_N(UChar, capa);
164
+ return buf;
165
+ }
166
+ /* Switches self to encoding enc_idx: closes any open converter, then opens a new one unless the UTF-8 path applies. Raises if ucnv_open fails. */
167
+ void icu_ustring_set_enc(VALUE self, int enc_idx)
168
+ {
169
+ GET_STRING(this);
170
+ if (this->converter != NULL) {
171
+ ucnv_close(this->converter);
172
+ }
173
+
174
+ this->rb_enc_idx = enc_idx;
175
+ // take UTF-8 code path (no converter) for the encodings accepted by icu_is_rb_enc_idx_as_utf_8
176
+ if (icu_is_rb_enc_idx_as_utf_8(enc_idx)) {
177
+ this->converter = NULL;
178
+ } else {
179
+ UErrorCode status = U_ZERO_ERROR;
180
+ this->converter = ucnv_open(ICU_RB_STRING_ENC_NAME_IDX(this->rb_enc_idx), &status);
181
+ if (U_FAILURE(status)) {
182
+ icu_rb_raise_icu_error(status);
183
+ }
184
+ }
185
+ }
186
+ /* Stores len as self's length, then returns the result of icu_ustring_to_rb_enc_str(self). */
187
+ VALUE icu_ustring_to_rb_enc_str_with_len(VALUE self, int32_t len)
188
+ {
189
+ GET_STRING(this);
190
+
191
+ #ifdef ICU_USTRING_DEBUG
192
+ printf("icu_ustring_to_rb_enc_str_with_len: %p %ld\n", self, len);
193
+ #endif
194
+
195
+ this->len = len;
196
+ return icu_ustring_to_rb_enc_str(self);
197
+ }
198
+ /* Converts self's first len UChars into a new Ruby string tagged with rb_enc_idx; raises through icu_rb_raise_icu_error on an ICU failure. */
199
+ VALUE icu_ustring_to_rb_enc_str(VALUE self)
200
+ {
201
+ GET_STRING(this);
202
+ if (this->len < 0) { // a negative length is clamped to the empty string
203
+ this->len = 0;
204
+ }
205
+
206
+ #ifdef ICU_USTRING_DEBUG
207
+ printf("icu_ustring_to_rb_enc_str: %p %d %d\n", self, this->len, this->capa);
208
+ #endif
209
+
210
+ int32_t dest_len;
211
+ int32_t dest_capa = this->len + RUBY_C_STRING_TERMINATOR_SIZE; // first guess: one byte per UChar plus the terminator size
212
+ char* dest = ALLOC_N(char, dest_capa);
213
+ int retried = FALSE;
214
+ UErrorCode status = U_ZERO_ERROR;
215
+ do {
216
+ if (this->converter == NULL) {
217
+ u_strToUTF8(dest, dest_capa, &dest_len, this->ptr, this->len, &status);
218
+ } else {
219
+ dest_len = ucnv_fromUChars(this->converter, dest, dest_capa,
220
+ this->ptr, this->len,
221
+ &status);
222
+ }
223
+ if (!retried && status == U_BUFFER_OVERFLOW_ERROR) { // first overflow: resize from the reported dest_len and go round once more
224
+ retried = TRUE;
225
+ dest_capa = dest_len + RUBY_C_STRING_TERMINATOR_SIZE;
226
+ REALLOC_N(dest, char, dest_capa);
227
+ status = U_ZERO_ERROR;
228
+ } else if (U_FAILURE(status)) { // includes a second overflow after the retry
229
+ ruby_xfree(dest);
230
+ icu_rb_raise_icu_error(status);
231
+ } else { // U_SUCCESS(status), on the first pass or after the retry
232
+ break;
233
+ }
234
+ } while (retried);
235
+
236
+ #ifdef ICU_USTRING_DEBUG
237
+ printf("icu_ustring_to_rb_enc_str (before creating rb str): %p %d %d %d\n", (void *)self, this->len, this->capa, this->rb_enc_idx);
238
+ printf("icu_ustring_to_rb_enc_str (pointers): %p %p %d\n", (void *)self, dest, dest_len);
239
+ #endif
240
+
241
+ VALUE rb_str = rb_enc_str_new(dest, dest_len, rb_enc_from_index(this->rb_enc_idx));
242
+ ruby_xfree(dest); // the temporary C buffer is released once the Ruby string has been built from it
243
+ OBJ_TAINT(rb_str);
244
+ return rb_str;
245
+ }
246
+
247
+ /* Returns the UChar buffer pointer stored in the struct. */
248
+ inline UChar* icu_ustring_ptr_internal(const icu_ustring_data *this)
249
+ {
250
+ return this->ptr;
251
+ }
252
+ /* Returns the UChar buffer pointer of the icu_ustring_data wrapped by self. */
253
+ inline UChar* icu_ustring_ptr(VALUE self)
254
+ {
255
+ GET_STRING(this);
256
+ return icu_ustring_ptr_internal(this);
257
+ }
258
+ /* Returns the len field stored in the struct. */
259
+ inline int32_t icu_ustring_len_internal(const icu_ustring_data *this)
260
+ {
261
+ return this->len;
262
+ }
263
+ /* Returns the len field of the icu_ustring_data wrapped by self. */
264
+ inline int32_t icu_ustring_len(VALUE self)
265
+ {
266
+ GET_STRING(this);
267
+ return icu_ustring_len_internal(this);
268
+ }
269
+ /* Returns the capa field stored in the struct. */
270
+ inline int32_t icu_ustring_capa_internal(const icu_ustring_data *this)
271
+ {
272
+ return this->capa;
273
+ }
274
+ /* Returns the capa field of the icu_ustring_data wrapped by self. */
275
+ inline int32_t icu_ustring_capa(VALUE self)
276
+ {
277
+ GET_STRING(this);
278
+ return icu_ustring_capa_internal(this);
279
+ }
280
+ /* Builds a Ruby string from the C string in buffer and passes it through rb_str_export_to_enc with the ICU_RUBY_ENCODING_INDEX encoding. */
281
+ inline VALUE char_buffer_to_rb_str(const char* buffer)
282
+ {
283
+ VALUE str = rb_str_new_cstr(buffer);
284
+ return rb_str_export_to_enc(str, rb_enc_from_index(ICU_RUBY_ENCODING_INDEX));
285
+ }
286
+ /* Allocates a char buffer of buf_size bytes with ALLOC_N; release it with char_buffer_free. */
287
+ inline char* char_buffer_new(int32_t buf_size)
288
+ {
289
+ char* buffer = ALLOC_N(char, buf_size);
290
+ return buffer;
291
+ }
292
+ /* Reallocates buffer to buf_size bytes. NOTE(review): buffer is a by-value parameter, so the caller never sees the pointer REALLOC_N stores in it - confirm callers. */
293
+ inline void char_buffer_resize(const char* buffer, int32_t buf_size)
294
+ {
295
+ REALLOC_N(buffer, char, buf_size);
296
+ }
297
+ /* Releases buffer with ruby_xfree. */
298
+ inline void char_buffer_free(const char* buffer)
299
+ {
300
+ ruby_xfree((void*)buffer);
301
+ }
302
+
303
+ #undef GET_STRING
304
+ #undef GET_STRING_VAL
@@ -0,0 +1,50 @@
1
+ #include "icu.h"
2
+ /* Drains icu_enum into a new Ruby array of strings and always closes it; raises if status is a failure on entry or the iteration fails. */
3
+ VALUE icu_enum_to_rb_ary(UEnumeration* icu_enum, UErrorCode status, long pre_allocated)
4
+ {
5
+ if (U_FAILURE(status)) {
6
+ uenum_close(icu_enum);
7
+ icu_rb_raise_icu_error(status);
8
+ }
9
+ VALUE result = rb_ary_new2(pre_allocated);
10
+ const UChar* ptr = NULL;
11
+ int32_t len = 0;
12
+ status = U_ZERO_ERROR;
13
+ // A failing uenum_unext() may hand back NULL just like the end of the enumeration does, so the
14
+ // status is inspected after the loop; checking it only inside the loop body would miss that case.
15
+ while ((ptr = uenum_unext(icu_enum, &len, &status)) != NULL && U_SUCCESS(status)) {
16
+ VALUE s = icu_ustring_from_uchar_str(ptr, len);
17
+ rb_ary_push(result, icu_ustring_to_rb_enc_str(s));
18
+ icu_ustring_clear_ptr(s); // ptr is not ours: detach it so icu_ustring_free does not release it
19
+ }
20
+ uenum_close(icu_enum);
21
+ if (U_FAILURE(status)) {
22
+ icu_rb_raise_icu_error(status);
23
+ }
24
+ return result;
25
+ }
26
+ /* Raises rb_eICU_Error with the numeric status and its u_errorName text. */
27
+ extern inline void icu_rb_raise_icu_error(UErrorCode status)
28
+ {
29
+ rb_raise(rb_eICU_Error, "ICU Error Code: %d, %s.", status, u_errorName(status));
30
+ }
31
+
32
+ /* Raises rb_eICU_InvalidParameterError with the parameter name and error_message in the message. */
33
+ extern inline void
34
+ icu_rb_raise_icu_invalid_parameter(const char* parameter,
35
+ const char* error_message)
36
+ {
37
+ rb_raise(rb_eICU_InvalidParameterError,
38
+ "ICU Invalid parameter: %s, %s.",
39
+ parameter,
40
+ error_message);
41
+ }
42
+
43
+ /* Raises rb_eICU_InvalidParameterError with the line and offset taken from the UParseError. */
44
+ extern inline void icu_rb_raise_icu_parse_error(const UParseError* error)
45
+ {
46
+ rb_raise(rb_eICU_InvalidParameterError,
47
+ "ICU Parse Error: Line %d, offset %d.",
48
+ error->line,
49
+ error->offset);
50
+ }
@@ -0,0 +1,14 @@
1
+ #include "icu.h"
2
+
3
+ VALUE rb_eICU_Error;
4
+ VALUE rb_eICU_InvalidParameterError;
5
+ /* Defines the Error and InvalidParameterError classes under rb_mICU (both subclasses of StandardError) and aliases `error` to `message` on each. */
6
+ void init_rb_errors(void)
7
+ {
8
+ rb_eICU_Error = rb_define_class_under(rb_mICU, "Error", rb_eStandardError);
9
+ rb_define_alias(rb_eICU_Error, "error", "message");
10
+ rb_eICU_InvalidParameterError = rb_define_class_under(rb_mICU, "InvalidParameterError", rb_eStandardError);
11
+ rb_define_alias(rb_eICU_InvalidParameterError, "error", "message");
12
+ }
13
+
14
+ /* vim: set expandtab sws=4 sw=4: */
@@ -0,0 +1,22 @@
1
+ require File.expand_path('../lib/icu/version', __FILE__)
2
+ # Gem specification for the icu gem; the version constant comes from lib/icu/version (required above).
3
+ Gem::Specification.new do |s|
4
+ s.name = "icu"
5
+ s.version = ICU::VERSION
6
+ s.authors = ["Erick Guan"]
7
+ s.email = ["fantasticfears@gmail.com"]
8
+ s.license = "MIT"
9
+ s.homepage = "https://github.com/fantasticfears/icu4r"
10
+ s.summary = %q{A Unicode processing functions ruby gem, binding to ICU}
11
+ s.required_ruby_version = '>= 2.2.7'
12
+
13
+ s.extensions = ["ext/icu/extconf.rb"] # build script of the native extension
14
+ s.files = `git ls-files`.split("\n") # packaged files are whatever git tracks, so building the gem needs a git checkout
15
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
16
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
17
+ s.require_paths = ["lib"]
18
+ s.add_development_dependency 'rake-compiler', ">= 0.7.5"
19
+ s.add_development_dependency 'rspec'
20
+ s.add_development_dependency 'pry'
21
+ s.add_development_dependency 'mini_portile2', '~> 2.2.0'
22
+ end
data/lib/icu.rb CHANGED
@@ -1,18 +1,6 @@
1
- # Copyright (c) 2007 Joshua Llorach
2
- #
3
- # This file is part of ICU Ruby extension.
4
- #
5
- # ICU is free software: you can redistribute it and/or modify
6
- # it under the terms of the GNU Lesser General Public License as published by
7
- # the Free Software Foundation, either version 3 of the License, or
8
- # (at your option) any later version.
9
- #
10
- # ICU is distributed in the hope that it will be useful,
11
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
- # GNU Lesser General Public License for more details.
14
- #
15
- # You should have received a copy of the GNU Lesser General Public License
16
- # along with ICU. If not, see <http://www.gnu.org/licenses/>.
17
-
18
- require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'icu.so'))
1
+ require 'icu/icu'
2
+ require 'icu/version'
3
+ require 'icu/collator'
4
+ require 'icu/transliterator'
5
+ require 'icu/charset_detector'
6
+ require 'icu/locale'