icu4r 0.1.3.2006.01.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +20 -0
- data/README +153 -0
- data/calendar.c +576 -0
- data/docs/FORMATTING +131 -0
- data/docs/UNICODE_REGEXPS +204 -0
- data/extconf.rb +15 -0
- data/fmt.cpp +150 -0
- data/icu4r.c +14 -0
- data/icu_common.h +45 -0
- data/samples/demo_each.rb +23 -0
- data/samples/demo_locales.rb +16 -0
- data/samples/demo_regexp.rb +11 -0
- data/samples/resbundle/appmsg/root.res +0 -0
- data/samples/resbundle/appmsg/ru.res +0 -0
- data/samples/resbundle/demo_bundle.rb +4 -0
- data/samples/resbundle/mkres.sh +4 -0
- data/samples/resbundle/root.txt +10 -0
- data/samples/resbundle/ru.txt +4 -0
- data/test/test_calendar.rb +109 -0
- data/test/test_ustring.rb +381 -0
- data/tools/doc.sh +2 -0
- data/tools/km.rb +425 -0
- data/ubundle.c +209 -0
- data/ucore_ext.c +168 -0
- data/uregex.c +673 -0
- data/uregex.h +27 -0
- data/ustring.c +3042 -0
- metadata +81 -0
data/ustring.c
ADDED
@@ -0,0 +1,3042 @@
|
|
1
|
+
/**
|
2
|
+
* ustring.c - ICU based Unicode string support.
|
3
|
+
*
|
4
|
+
* $Id: ustring.c,v 1.20 2006/01/23 14:26:45 meadow Exp $
|
5
|
+
*
|
6
|
+
* Copyright (c) 2006 Nikolai Lugovoi
|
7
|
+
*
|
8
|
+
* This code is based on original ruby String class source (string.c):
|
9
|
+
*
|
10
|
+
* * string.c -
|
11
|
+
* *
|
12
|
+
* * Copyright (C) 1993-2003 Yukihiro Matsumoto
|
13
|
+
* * Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
|
14
|
+
* * Copyright (C) 2000 Information-technology Promotion Agency, Japan
|
15
|
+
* *
|
16
|
+
**/
|
17
|
+
|
18
|
+
#include "icu_common.h"
|
19
|
+
VALUE icu_ustr_replace(VALUE str, VALUE str2);
|
20
|
+
VALUE ustr_gsub(int argc, VALUE * argv, VALUE str, int bang, int once);
|
21
|
+
extern VALUE icu_from_rstr(int argc, VALUE * argv, VALUE str);
|
22
|
+
|
23
|
+
VALUE rb_cURegexp;
|
24
|
+
VALUE rb_cUString;
|
25
|
+
VALUE rb_cUMatch;
|
26
|
+
VALUE rb_cUResourceBundle;
|
27
|
+
VALUE rb_cULocale;
|
28
|
+
VALUE rb_cUCalendar;
|
29
|
+
|
30
|
+
#include "uregex.h"
|
31
|
+
|
32
|
+
|
33
|
+
/* to be used in <=>, casecmp */
|
34
|
+
static UCollator * s_UCA_collator, * s_case_UCA_collator;
|
35
|
+
|
36
|
+
static void
|
37
|
+
free_ustr(str)
|
38
|
+
ICUString *str;
|
39
|
+
{
|
40
|
+
if (str->ptr)
|
41
|
+
free(str->ptr);
|
42
|
+
str->ptr = 0;
|
43
|
+
free(str);
|
44
|
+
}
|
45
|
+
/**
 * Guard to call before mutating +str+: raises via rb_check_frozen() when
 * the object is frozen, and raises RuntimeError when the string's busy
 * flag is set.
 */
inline void icu_check_frozen(VALUE str)
{
    rb_check_frozen(str);
    if (!USTRING(str)->busy) {
        return;
    }
    rb_raise(rb_eRuntimeError, "String is busy. Can't modify");
}
|
50
|
+
#define START_BUF_LEN 16
|
51
|
+
/**
|
52
|
+
* Allocate ICUString struct with given +capa+ capacity,
|
53
|
+
* if mode == 1 and UChar != 0 - copy len UChars from src,
|
54
|
+
* else set pointer to src.
|
55
|
+
*/
|
56
|
+
#define ICU_COPY 1
|
57
|
+
#define ICU_SET 0
|
58
|
+
VALUE icu_ustr_alloc_and_wrap(UChar * src, long len, long capa, int mode)
|
59
|
+
{
|
60
|
+
ICUString *n_str = ALLOC_N(ICUString, 1);
|
61
|
+
size_t alloc_capa;
|
62
|
+
if( mode == ICU_COPY ) {
|
63
|
+
alloc_capa = START_BUF_LEN > capa ? START_BUF_LEN : capa;
|
64
|
+
if(alloc_capa<=len) alloc_capa = len + 1;
|
65
|
+
n_str->ptr = ALLOC_N(UChar, alloc_capa);
|
66
|
+
n_str->capa = alloc_capa;
|
67
|
+
n_str->len = len;
|
68
|
+
if( src ) {
|
69
|
+
u_memcpy(n_str->ptr, src, len);
|
70
|
+
n_str->ptr[len] = 0;
|
71
|
+
}
|
72
|
+
} else {
|
73
|
+
n_str->ptr = src;
|
74
|
+
n_str->len = len;
|
75
|
+
n_str->capa = capa;
|
76
|
+
}
|
77
|
+
if(n_str->capa <= n_str->len) rb_raise(rb_eRuntimeError, "Capacity is not large then len, sentinel can't be set!");
|
78
|
+
n_str->busy = 0;
|
79
|
+
n_str->ptr[n_str->len] = 0;
|
80
|
+
return Data_Wrap_Struct(rb_cUString, 0, free_ustr, n_str);
|
81
|
+
}
|
82
|
+
VALUE
|
83
|
+
icu_ustr_alloc(klass)
|
84
|
+
VALUE klass;
|
85
|
+
{
|
86
|
+
return icu_ustr_alloc_and_wrap(NULL, 0, 0, ICU_COPY);
|
87
|
+
}
|
88
|
+
|
89
|
+
/**
 * Adjust the buffer capacity of +str+ to +new_capa+ units.
 *
 * The buffer always grows when more room is requested, but it is only
 * shrunk when more than 1024 units would be released.  The resulting
 * capacity is never below START_BUF_LEN.
 */
void ustr_capa_resize(ICUString * str, long new_capa)
{
    long current = str->capa;

    if (new_capa == current) {
        return;
    }
    /* small shrink requests are not worth a realloc */
    if (new_capa < current && current - new_capa <= 1024) {
        return;
    }
    if (new_capa < START_BUF_LEN) {
        new_capa = START_BUF_LEN;
    }
    REALLOC_N(str->ptr, UChar, new_capa);
    str->capa = new_capa;
}
|
99
|
+
/* delete +del_len+ units from string and insert replacement */
|
100
|
+
void ustr_splice_units(ICUString * str, long start, long del_len, const UChar * replacement, long repl_len)
|
101
|
+
{
|
102
|
+
long new_len;
|
103
|
+
UChar * temp = 0 ;
|
104
|
+
if( str->busy ) {
|
105
|
+
rb_warn("Attempt to modify busy string. Ignored");
|
106
|
+
return;
|
107
|
+
}
|
108
|
+
if( repl_len < 0) return;
|
109
|
+
if( del_len == 0 && repl_len == 0) return;
|
110
|
+
new_len = str->len - del_len + repl_len;
|
111
|
+
if (replacement == str->ptr ) {
|
112
|
+
temp = ALLOC_N(UChar, repl_len);
|
113
|
+
u_memcpy(temp, replacement, repl_len);
|
114
|
+
replacement = temp;
|
115
|
+
}
|
116
|
+
if ( repl_len >= del_len) ustr_capa_resize(str, new_len+1);
|
117
|
+
/* move tail */
|
118
|
+
if(str->len - (start+del_len) > 0) {
|
119
|
+
u_memmove(str->ptr + start+repl_len, str->ptr + start+del_len, str->len-(start+del_len) );
|
120
|
+
}
|
121
|
+
/* copy string */
|
122
|
+
if( repl_len > 0) u_memcpy(str->ptr+start, replacement, repl_len);
|
123
|
+
if ( repl_len < del_len) ustr_capa_resize(str, new_len+1);
|
124
|
+
str->len = new_len;
|
125
|
+
str->ptr[new_len] = 0;
|
126
|
+
if(temp) {
|
127
|
+
free(temp);
|
128
|
+
}
|
129
|
+
}
|
130
|
+
static inline void
|
131
|
+
ustr_mod_check(VALUE s, UChar *p, long len)
|
132
|
+
{
|
133
|
+
if (ICU_PTR(s) != p || ICU_LEN(s) != len){
|
134
|
+
rb_raise(rb_eRuntimeError, "string modified");
|
135
|
+
}
|
136
|
+
}
|
137
|
+
VALUE
|
138
|
+
ustr_new(klass, ptr, len)
|
139
|
+
VALUE klass;
|
140
|
+
UChar *ptr;
|
141
|
+
long len;
|
142
|
+
{
|
143
|
+
if (len < 0) {
|
144
|
+
rb_raise(rb_eArgError, "negative string size (or size too big)");
|
145
|
+
}
|
146
|
+
return icu_ustr_alloc_and_wrap(ptr, len, len+1, ICU_COPY);
|
147
|
+
}
|
148
|
+
|
149
|
+
VALUE
|
150
|
+
icu_ustr_new(ptr, len)
|
151
|
+
const UChar *ptr;
|
152
|
+
long len;
|
153
|
+
{
|
154
|
+
return ustr_new(rb_cUString, ptr, len);
|
155
|
+
}
|
156
|
+
VALUE
|
157
|
+
icu_ustr_new_set(ptr, len, capa)
|
158
|
+
UChar *ptr;
|
159
|
+
long len;
|
160
|
+
long capa;
|
161
|
+
{
|
162
|
+
return icu_ustr_alloc_and_wrap(ptr, len, capa, ICU_SET);
|
163
|
+
}
|
164
|
+
VALUE
|
165
|
+
icu_ustr_new2(ptr)
|
166
|
+
const UChar *ptr;
|
167
|
+
{
|
168
|
+
if (!ptr) {
|
169
|
+
rb_raise(rb_eArgError, "NULL pointer given");
|
170
|
+
}
|
171
|
+
return icu_ustr_new(ptr, u_strlen(ptr));
|
172
|
+
}
|
173
|
+
|
174
|
+
inline VALUE
|
175
|
+
icu_ustr_new_capa(UChar * ptr, long len, long capa)
|
176
|
+
{
|
177
|
+
return icu_ustr_alloc_and_wrap(ptr, len, capa, ICU_COPY);
|
178
|
+
}
|
179
|
+
|
180
|
+
/* ------------ */
|
181
|
+
|
182
|
+
/**
|
183
|
+
* call-seq:
|
184
|
+
* UString.new(str="".u) => new_str
|
185
|
+
*
|
186
|
+
* Returns a new string object containing a copy of <i>str</i>.
|
187
|
+
*/
|
188
|
+
|
189
|
+
VALUE
|
190
|
+
icu_ustr_init(argc, argv, str)
|
191
|
+
int argc;
|
192
|
+
VALUE *argv;
|
193
|
+
VALUE str;
|
194
|
+
{
|
195
|
+
VALUE orig;
|
196
|
+
|
197
|
+
if (rb_scan_args(argc, argv, "01", &orig) == 1)
|
198
|
+
{
|
199
|
+
icu_ustr_replace(str, orig);
|
200
|
+
}
|
201
|
+
return str;
|
202
|
+
}
|
203
|
+
|
204
|
+
/**
|
205
|
+
* call-seq:
|
206
|
+
* str.length => integer
|
207
|
+
*
|
208
|
+
* Returns the length of <i>str</i>.
|
209
|
+
*/
|
210
|
+
VALUE
|
211
|
+
icu_ustr_length(str)
|
212
|
+
VALUE str;
|
213
|
+
{
|
214
|
+
return LONG2NUM(ICU_LEN(str));
|
215
|
+
}
|
216
|
+
|
217
|
+
/**
|
218
|
+
* call-seq:
|
219
|
+
* str.empty? => true or false
|
220
|
+
*
|
221
|
+
* Returns <code>true</code> if <i>str</i> has a length of zero.
|
222
|
+
*
|
223
|
+
* "hello".u.empty? #=> false
|
224
|
+
* "".u.empty? #=> true
|
225
|
+
*/
|
226
|
+
|
227
|
+
VALUE
|
228
|
+
icu_ustr_empty(str)
|
229
|
+
VALUE str;
|
230
|
+
{
|
231
|
+
return 0 == ICU_LEN(str) ? Qtrue : Qfalse;
|
232
|
+
}
|
233
|
+
|
234
|
+
VALUE
|
235
|
+
icu_ustr_resize(str, len)
|
236
|
+
VALUE str;
|
237
|
+
long len;
|
238
|
+
{
|
239
|
+
if (len < 0) {
|
240
|
+
rb_raise(rb_eArgError, "negative string size (or size too big)");
|
241
|
+
}
|
242
|
+
ustr_capa_resize(USTRING(str), len);
|
243
|
+
ICU_LEN(str) = len;
|
244
|
+
ICU_PTR(str)[len] = 0; /* sentinel */
|
245
|
+
return str;
|
246
|
+
}
|
247
|
+
|
248
|
+
|
249
|
+
/**
|
250
|
+
* call-seq:
|
251
|
+
* str.replace(other_str) => str
|
252
|
+
*
|
253
|
+
* Replaces the contents and taintedness of <i>str</i> with the corresponding
|
254
|
+
* values in <i>other_str</i>.
|
255
|
+
*
|
256
|
+
* s = "hello".u #=> "hello"
|
257
|
+
* s.replace "world".u #=> "world"
|
258
|
+
*/
|
259
|
+
VALUE
|
260
|
+
icu_ustr_replace(str, str2)
|
261
|
+
VALUE str,
|
262
|
+
str2;
|
263
|
+
{
|
264
|
+
if (str == str2)
|
265
|
+
return str;
|
266
|
+
icu_check_frozen(str);
|
267
|
+
Check_Class(str2, rb_cUString);
|
268
|
+
ustr_splice_units(USTRING(str), 0, ICU_LEN(str), ICU_PTR(str2), ICU_LEN(str2));
|
269
|
+
OBJ_INFECT(str, str2);
|
270
|
+
return str;
|
271
|
+
}
|
272
|
+
|
273
|
+
/**
|
274
|
+
* call-seq:
|
275
|
+
* string.clear -> string
|
276
|
+
*
|
277
|
+
* Makes string empty.
|
278
|
+
*
|
279
|
+
* a = "abcde".u
|
280
|
+
* a.clear #=> ""
|
281
|
+
*/
|
282
|
+
|
283
|
+
VALUE
|
284
|
+
icu_ustr_clear(str)
|
285
|
+
VALUE str;
|
286
|
+
{
|
287
|
+
icu_check_frozen(str);
|
288
|
+
icu_ustr_resize(str, 0);
|
289
|
+
return str;
|
290
|
+
}
|
291
|
+
|
292
|
+
/**
 * Compare two UStrings with the given collator.
 * Returns 1 for UCOL_GREATER, -1 for UCOL_LESS and 0 otherwise.
 */
int icu_collator_cmp (UCollator * collator, VALUE str1, VALUE str2)
{
    int order = ucol_strcoll(collator,
                             ICU_PTR(str1), ICU_LEN(str1),
                             ICU_PTR(str2), ICU_LEN(str2));

    if (order == UCOL_GREATER) {
        return 1;
    }
    if (order == UCOL_LESS) {
        return -1;
    }
    return 0;
}
|
303
|
+
|
304
|
+
int
|
305
|
+
icu_ustr_cmp(str1, str2)
|
306
|
+
VALUE str1,
|
307
|
+
str2;
|
308
|
+
{
|
309
|
+
return icu_collator_cmp(s_UCA_collator, str1, str2);
|
310
|
+
}
|
311
|
+
|
312
|
+
/**
|
313
|
+
* call-seq:
|
314
|
+
* str == obj => true or false
|
315
|
+
*
|
316
|
+
* Equality---If <i>obj</i> is not a <code>UString</code>, returns
|
317
|
+
* <code>false</code>. Otherwise, returns <code>true</code> if
|
318
|
+
* strings are of the same length and content
|
319
|
+
*
|
320
|
+
*/
|
321
|
+
|
322
|
+
VALUE
|
323
|
+
icu_ustr_equal(str1, str2)
|
324
|
+
VALUE str1,
|
325
|
+
str2;
|
326
|
+
{
|
327
|
+
if (str1 == str2)
|
328
|
+
return Qtrue;
|
329
|
+
if (CLASS_OF(str2) != rb_cUString) {
|
330
|
+
return Qfalse;
|
331
|
+
}
|
332
|
+
if (ICU_LEN(str1) == ICU_LEN(str2) &&
|
333
|
+
u_strncmp(ICU_PTR(str1), ICU_PTR(str2), ICU_LEN(str1) ) == 0) {
|
334
|
+
return Qtrue;
|
335
|
+
}
|
336
|
+
return Qfalse;
|
337
|
+
}
|
338
|
+
|
339
|
+
/**
|
340
|
+
* call-seq:
|
341
|
+
* str <=> other_str => -1, 0, +1
|
342
|
+
*
|
343
|
+
* Comparison---Returns -1 if <i>str</i> is less than, 0 if
|
344
|
+
* <i>str</i> is equal to, and +1 if <i>str</i> is greater than
|
345
|
+
* <i>other_str</i>. Returns <code>nil</code> if <i>other_str</i> is not a <code>UString</code>.
|
346
|
+
*
|
347
|
+
* <code><=></code> is the basis for the methods <code><</code>,
|
348
|
+
* <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
|
349
|
+
* included from module <code>Comparable</code>. The method
|
350
|
+
* <code>String#==</code> does not use <code>Comparable#==</code>.
|
351
|
+
*
|
352
|
+
* This method uses UCA rules, see also #strcoll for locale-specific string collation.
|
353
|
+
*
|
354
|
+
* "abcdef".u <=> "abcde".u #=> 1
|
355
|
+
* "abcdef".u <=> "abcdef".u #=> 0
|
356
|
+
* "abcdef".u <=> "abcdefg".u #=> -1
|
357
|
+
* "abcdef".u <=> "ABCDEF".u #=> -1
|
358
|
+
*/
|
359
|
+
|
360
|
+
VALUE
|
361
|
+
icu_ustr_cmp_m(str1, str2)
|
362
|
+
VALUE str1,
|
363
|
+
str2;
|
364
|
+
{
|
365
|
+
long result;
|
366
|
+
|
367
|
+
if (CLASS_OF(str2) != rb_cUString) {
|
368
|
+
return Qnil;
|
369
|
+
} else {
|
370
|
+
result = icu_ustr_cmp(str1, str2);
|
371
|
+
}
|
372
|
+
return LONG2NUM(result);
|
373
|
+
}
|
374
|
+
|
375
|
+
/**
|
376
|
+
* call-seq:
|
377
|
+
* str.casecmp(other_str) => -1, 0, +1
|
378
|
+
*
|
379
|
+
* Case-insensitive version of <code>UString#<=></code> .
|
380
|
+
* This method uses UCA collator with secondary strength, see #strcoll
|
381
|
+
*
|
382
|
+
*
|
383
|
+
* "abcdef".u.casecmp("abcde".u) #=> 1
|
384
|
+
* "aBcDeF".u.casecmp("abcdef".u) #=> 0
|
385
|
+
* "abcdef".u.casecmp("abcdefg".u) #=> -1
|
386
|
+
* "abcdef".u.casecmp("ABCDEF".u) #=> 0
|
387
|
+
*/
|
388
|
+
|
389
|
+
VALUE
|
390
|
+
icu_ustr_casecmp(str1, str2)
|
391
|
+
VALUE str1,
|
392
|
+
str2;
|
393
|
+
{
|
394
|
+
Check_Class(str2, rb_cUString);
|
395
|
+
return INT2FIX(icu_collator_cmp(s_case_UCA_collator, str1, str2));
|
396
|
+
}
|
397
|
+
|
398
|
+
/**
|
399
|
+
* call-seq:
|
400
|
+
* str + other_str => new_str
|
401
|
+
*
|
402
|
+
* Concatenation---Returns a new <code>UString</code> containing
|
403
|
+
* <i>other_str</i> concatenated to <i>str</i>.
|
404
|
+
*
|
405
|
+
* "Hello from ".u + "main".u #=> "Hello from main"
|
406
|
+
*/
|
407
|
+
|
408
|
+
VALUE
|
409
|
+
icu_ustr_plus(str1, str2)
|
410
|
+
VALUE str1,
|
411
|
+
str2;
|
412
|
+
{
|
413
|
+
VALUE str3;
|
414
|
+
Check_Class(str2, rb_cUString);
|
415
|
+
|
416
|
+
str3 = icu_ustr_new_capa(ICU_PTR(str1), ICU_LEN(str1), ICU_LEN(str1) + ICU_LEN(str2));
|
417
|
+
ustr_splice_units(USTRING(str3), ICU_LEN(str3), 0, ICU_PTR(str2), ICU_LEN(str2));
|
418
|
+
if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
|
419
|
+
OBJ_TAINT(str3);
|
420
|
+
return str3;
|
421
|
+
}
|
422
|
+
|
423
|
+
/**
|
424
|
+
* call-seq:
|
425
|
+
* str * integer => new_str
|
426
|
+
*
|
427
|
+
* Copy---Returns a new <code>UString</code> containing <i>integer</i> copies of
|
428
|
+
* the receiver.
|
429
|
+
*
|
430
|
+
* "Ho! ".u * 3 #=> "Ho! Ho! Ho! ".u
|
431
|
+
*/
|
432
|
+
|
433
|
+
VALUE
|
434
|
+
icu_ustr_times(str, times)
|
435
|
+
VALUE str,
|
436
|
+
times;
|
437
|
+
{
|
438
|
+
VALUE str2;
|
439
|
+
long i,
|
440
|
+
len;
|
441
|
+
Check_Type(times, T_FIXNUM);
|
442
|
+
len = NUM2LONG(times);
|
443
|
+
if (len < 0) {
|
444
|
+
rb_raise(rb_eArgError, "negative argument");
|
445
|
+
}
|
446
|
+
if (len && LONG_MAX / len < ICU_LEN(str)) {
|
447
|
+
rb_raise(rb_eArgError, "argument too big");
|
448
|
+
}
|
449
|
+
|
450
|
+
str2 = icu_ustr_new_capa(0, 0, len *= ICU_LEN(str));
|
451
|
+
for (i = 0; i < len; i += ICU_LEN(str)) {
|
452
|
+
ustr_splice_units(USTRING(str2), i, 0, ICU_PTR(str), ICU_LEN(str));
|
453
|
+
}
|
454
|
+
ICU_PTR(str2)[ICU_LEN(str2)] = 0;
|
455
|
+
|
456
|
+
OBJ_INFECT(str2, str);
|
457
|
+
|
458
|
+
return str2;
|
459
|
+
}
|
460
|
+
|
461
|
+
|
462
|
+
/**
|
463
|
+
* call-seq:
|
464
|
+
* str << other_str => str
|
465
|
+
* str.concat(other_str) => str
|
466
|
+
*
|
467
|
+
* Append---Concatenates the given string object to <i>str</i>.
|
468
|
+
*
|
469
|
+
* a = "hello ".u
|
470
|
+
* a << "world".u #=> "hello world"
|
471
|
+
*/
|
472
|
+
|
473
|
+
VALUE
|
474
|
+
icu_ustr_concat(str1, str2)
|
475
|
+
VALUE str1,
|
476
|
+
str2;
|
477
|
+
{
|
478
|
+
icu_check_frozen(str1);
|
479
|
+
Check_Class(str2, rb_cUString);
|
480
|
+
if (ICU_LEN(str2) > 0) {
|
481
|
+
ustr_splice_units(USTRING(str1), ICU_LEN(str1), 0, ICU_PTR(str2), ICU_LEN(str2));
|
482
|
+
OBJ_INFECT(str1, str2);
|
483
|
+
}
|
484
|
+
return str1;
|
485
|
+
}
|
486
|
+
|
487
|
+
int
|
488
|
+
icu_ustr_hash(str)
|
489
|
+
VALUE str;
|
490
|
+
{
|
491
|
+
register long len = ICU_LEN(str) * (sizeof(UChar));
|
492
|
+
register char *p = (char*)ICU_PTR(str);
|
493
|
+
register int key = 0;
|
494
|
+
|
495
|
+
while (len--) {
|
496
|
+
key += *p++;
|
497
|
+
key += (key << 10);
|
498
|
+
key ^= (key >> 6);
|
499
|
+
}
|
500
|
+
key += (key << 3);
|
501
|
+
key ^= (key >> 11);
|
502
|
+
key += (key << 15);
|
503
|
+
return key;
|
504
|
+
}
|
505
|
+
|
506
|
+
/**
|
507
|
+
* call-seq:
|
508
|
+
* str.hash => fixnum
|
509
|
+
*
|
510
|
+
* Return a hash based on the string's length and content.
|
511
|
+
*/
|
512
|
+
|
513
|
+
VALUE
|
514
|
+
icu_ustr_hash_m(str)
|
515
|
+
VALUE str;
|
516
|
+
{
|
517
|
+
int key = icu_ustr_hash(str);
|
518
|
+
return INT2FIX(key);
|
519
|
+
}
|
520
|
+
|
521
|
+
VALUE
|
522
|
+
icu_ustr_dup(str)
|
523
|
+
VALUE str;
|
524
|
+
{
|
525
|
+
VALUE dup = icu_ustr_new(ICU_PTR(str), ICU_LEN(str));
|
526
|
+
return dup;
|
527
|
+
}
|
528
|
+
|
529
|
+
/**
|
530
|
+
* call-seq:
|
531
|
+
* str.upcase!(locale = "") => str or nil
|
532
|
+
*
|
533
|
+
* Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
|
534
|
+
* were made. This method is locale-sensitive.
|
535
|
+
*/
|
536
|
+
|
537
|
+
VALUE
|
538
|
+
icu_ustr_upcase_bang(argc, argv, str)
|
539
|
+
int argc;
|
540
|
+
VALUE * argv;
|
541
|
+
VALUE str;
|
542
|
+
|
543
|
+
{
|
544
|
+
UErrorCode error = 0;
|
545
|
+
UChar *buf = 0;
|
546
|
+
long len ;
|
547
|
+
VALUE loc;
|
548
|
+
char * locale = NULL;
|
549
|
+
icu_check_frozen(str);
|
550
|
+
buf = ALLOC_N(UChar, ICU_LEN(str) + 1);
|
551
|
+
if (rb_scan_args(argc, argv, "01", &loc) == 1) {
|
552
|
+
if( loc != Qnil) {
|
553
|
+
Check_Type(loc, T_STRING);
|
554
|
+
locale = RSTRING(loc)->ptr;
|
555
|
+
}
|
556
|
+
}
|
557
|
+
|
558
|
+
len = u_strToUpper(buf, ICU_LEN(str), ICU_PTR(str), ICU_LEN(str), locale, &error);
|
559
|
+
if (U_BUFFER_OVERFLOW_ERROR == error) {
|
560
|
+
REALLOC_N(buf, UChar, len + 1);
|
561
|
+
error = 0;
|
562
|
+
len =
|
563
|
+
u_strToUpper(buf, len, ICU_PTR(str), ICU_LEN(str), locale, &error);
|
564
|
+
}
|
565
|
+
if (0 == u_strncmp(buf, ICU_PTR(str), len))
|
566
|
+
return Qnil;
|
567
|
+
free(ICU_PTR(str));
|
568
|
+
ICU_PTR(str) = buf;
|
569
|
+
ICU_LEN(str) = len;
|
570
|
+
return str;
|
571
|
+
}
|
572
|
+
|
573
|
+
|
574
|
+
/**
|
575
|
+
* call-seq:
|
576
|
+
* str.upcase(locale = "") => new_str
|
577
|
+
*
|
578
|
+
* Returns a copy of <i>str</i> with all lowercase letters replaced with their
|
579
|
+
* uppercase counterparts. The operation is locale sensitive.
|
580
|
+
*
|
581
|
+
* "hEllO".u.upcase #=> "HELLO"
|
582
|
+
*/
|
583
|
+
|
584
|
+
VALUE
|
585
|
+
icu_ustr_upcase(argc, argv, str)
|
586
|
+
int argc;
|
587
|
+
VALUE * argv;
|
588
|
+
VALUE str;
|
589
|
+
|
590
|
+
{
|
591
|
+
str = icu_ustr_dup(str);
|
592
|
+
icu_ustr_upcase_bang(argc, argv, str);
|
593
|
+
return str;
|
594
|
+
}
|
595
|
+
|
596
|
+
|
597
|
+
/**
|
598
|
+
* call-seq:
|
599
|
+
* str.downcase!(locale = "") => str or nil
|
600
|
+
*
|
601
|
+
* Downcases the contents of <i>str</i>, returning <code>nil</code> if no
|
602
|
+
* changes were made.
|
603
|
+
*/
|
604
|
+
|
605
|
+
VALUE
|
606
|
+
icu_ustr_downcase_bang(argc, argv, str)
|
607
|
+
int argc;
|
608
|
+
VALUE * argv;
|
609
|
+
VALUE str;
|
610
|
+
{
|
611
|
+
UErrorCode error = 0;
|
612
|
+
UChar *buf;
|
613
|
+
long len ;
|
614
|
+
VALUE loc;
|
615
|
+
char * locale = NULL;
|
616
|
+
buf = ALLOC_N(UChar, ICU_LEN(str) + 1);
|
617
|
+
icu_check_frozen(str);
|
618
|
+
if (rb_scan_args(argc, argv, "01", &loc) == 1) {
|
619
|
+
if( loc != Qnil) {
|
620
|
+
Check_Type(loc, T_STRING);
|
621
|
+
locale = RSTRING(loc)->ptr;
|
622
|
+
}
|
623
|
+
}
|
624
|
+
len =
|
625
|
+
u_strToLower(buf, ICU_LEN(str), ICU_PTR(str), ICU_LEN(str), locale,
|
626
|
+
&error);
|
627
|
+
if (U_BUFFER_OVERFLOW_ERROR == error) {
|
628
|
+
REALLOC_N(buf, UChar, len + 1);
|
629
|
+
error = 0;
|
630
|
+
len =
|
631
|
+
u_strToLower(buf, len , ICU_PTR(str), ICU_LEN(str), locale,
|
632
|
+
&error);
|
633
|
+
}
|
634
|
+
if (0 == u_strncmp(buf, ICU_PTR(str), len))
|
635
|
+
return Qnil;
|
636
|
+
free(ICU_PTR(str));
|
637
|
+
ICU_PTR(str) = buf;
|
638
|
+
ICU_LEN(str) = len;
|
639
|
+
return str;
|
640
|
+
}
|
641
|
+
|
642
|
+
/**
|
643
|
+
* call-seq:
|
644
|
+
* str.downcase(locale = "") => new_str
|
645
|
+
*
|
646
|
+
* Returns a copy of <i>str</i> with all uppercase letters replaced with their
|
647
|
+
* lowercase counterparts. The operation is locale sensitive.
|
648
|
+
*
|
649
|
+
* "hEllO".u.downcase #=> "hello"
|
650
|
+
*/
|
651
|
+
|
652
|
+
VALUE
|
653
|
+
icu_ustr_downcase(argc, argv, str)
|
654
|
+
int argc;
|
655
|
+
VALUE * argv;
|
656
|
+
VALUE str;
|
657
|
+
{
|
658
|
+
str = icu_ustr_dup(str);
|
659
|
+
icu_ustr_downcase_bang(argc, argv, str);
|
660
|
+
return str;
|
661
|
+
}
|
662
|
+
|
663
|
+
/**
|
664
|
+
* call-seq:
|
665
|
+
* str.foldcase
|
666
|
+
*
|
667
|
+
* Case-fold the characters in a string.
|
668
|
+
* Case-folding is locale-independent and not context-sensitive.
|
669
|
+
*
|
670
|
+
*/
|
671
|
+
VALUE
|
672
|
+
icu_ustr_foldcase(str)
|
673
|
+
VALUE str;
|
674
|
+
{
|
675
|
+
UErrorCode error = 0;
|
676
|
+
UChar *buf;
|
677
|
+
long len, capa ;
|
678
|
+
capa = ICU_LEN(str) + 1;
|
679
|
+
buf = ALLOC_N(UChar, capa);
|
680
|
+
len = u_strFoldCase(buf, capa-1, ICU_PTR(str), ICU_LEN(str), U_FOLD_CASE_DEFAULT, &error);
|
681
|
+
if (U_BUFFER_OVERFLOW_ERROR == error) {
|
682
|
+
capa = len + 1;
|
683
|
+
REALLOC_N(buf, UChar, len + 1);
|
684
|
+
error = 0;
|
685
|
+
len = u_strFoldCase(buf, capa, ICU_PTR(str), ICU_LEN(str), U_FOLD_CASE_DEFAULT, &error);
|
686
|
+
}
|
687
|
+
return icu_ustr_new_set(buf, len, capa) ;
|
688
|
+
}
|
689
|
+
|
690
|
+
static long
|
691
|
+
icu_ustr_index(str, sub, offset)
|
692
|
+
VALUE str,
|
693
|
+
sub;
|
694
|
+
long offset;
|
695
|
+
{
|
696
|
+
long pos;
|
697
|
+
UChar *found;
|
698
|
+
if (offset < 0) {
|
699
|
+
offset += ICU_LEN(str);
|
700
|
+
if (offset < 0)
|
701
|
+
return -1;
|
702
|
+
}
|
703
|
+
if (ICU_LEN(str) - offset < ICU_LEN(sub))
|
704
|
+
return -1;
|
705
|
+
if (ICU_LEN(sub) == 0)
|
706
|
+
return offset;
|
707
|
+
found =
|
708
|
+
u_strFindFirst(ICU_PTR(str) + offset, ICU_LEN(str) - offset,
|
709
|
+
ICU_PTR(sub), ICU_LEN(sub));
|
710
|
+
if (NULL == found)
|
711
|
+
return -1;
|
712
|
+
pos = found - (ICU_PTR(str) + offset);
|
713
|
+
return pos + offset;
|
714
|
+
}
|
715
|
+
|
716
|
+
/**
|
717
|
+
* call-seq:
|
718
|
+
* str.index(substring [, offset]) => fixnum or nil
|
719
|
+
* str.index(regexp [, offset]) => fixnum or nil
|
720
|
+
*
|
721
|
+
* Returns the index of the first occurrence of the given <i>substring</i>,
|
722
|
+
* or pattern (<i>regexp</i>) in <i>str</i>. Returns
|
723
|
+
* <code>nil</code> if not found. If the second parameter is present, it
|
724
|
+
* specifies the position in the string to begin the search.
|
725
|
+
*
|
726
|
+
* "hello".u.index('e'.u) #=> 1
|
727
|
+
* "hello".u.index('lo'.u) #=> 3
|
728
|
+
* "hello".u.index('a'.u) #=> nil
|
729
|
+
* "hello".u.index(/[aeiou]/.U, -3) #=> 4
|
730
|
+
*/
|
731
|
+
|
732
|
+
VALUE
|
733
|
+
icu_ustr_index_m(argc, argv, str)
|
734
|
+
int argc;
|
735
|
+
VALUE *argv;
|
736
|
+
VALUE str;
|
737
|
+
{
|
738
|
+
VALUE sub;
|
739
|
+
VALUE initpos;
|
740
|
+
long pos ;
|
741
|
+
int processed = 0;
|
742
|
+
|
743
|
+
if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
|
744
|
+
pos = NUM2LONG(initpos);
|
745
|
+
} else {
|
746
|
+
pos = 0;
|
747
|
+
}
|
748
|
+
if (pos < 0) {
|
749
|
+
pos += ICU_LEN(str);
|
750
|
+
}
|
751
|
+
|
752
|
+
if( CLASS_OF(sub) == rb_cUString) {
|
753
|
+
pos = icu_ustr_index(str, sub, pos);
|
754
|
+
processed = 1;
|
755
|
+
}
|
756
|
+
if( CLASS_OF(sub) == rb_cURegexp) {
|
757
|
+
pos = icu_reg_search(sub, str, pos, 0);
|
758
|
+
processed = 1;
|
759
|
+
}
|
760
|
+
if(! processed ) {
|
761
|
+
rb_raise(rb_eTypeError, "Wrong Type, expected UString or URegexp, got %s", rb_class2name(CLASS_OF(sub)));
|
762
|
+
}
|
763
|
+
|
764
|
+
if (pos == -1)
|
765
|
+
return Qnil;
|
766
|
+
return LONG2NUM(pos);
|
767
|
+
}
|
768
|
+
|
769
|
+
static long
|
770
|
+
icu_ustr_rindex(str, sub, pos)
|
771
|
+
VALUE str,
|
772
|
+
sub;
|
773
|
+
long pos;
|
774
|
+
{
|
775
|
+
long len = ICU_LEN(sub);
|
776
|
+
UChar *found;
|
777
|
+
|
778
|
+
/*
|
779
|
+
* substring longer than string
|
780
|
+
*/
|
781
|
+
if (ICU_LEN(str) < len)
|
782
|
+
return -1;
|
783
|
+
if (ICU_LEN(str) - pos < len) {
|
784
|
+
pos = ICU_LEN(str) - len;
|
785
|
+
}
|
786
|
+
found = u_strFindLast(ICU_PTR(str), pos, ICU_PTR(sub), ICU_LEN(sub));
|
787
|
+
if (NULL == found)
|
788
|
+
return -1;
|
789
|
+
pos = found - (ICU_PTR(str));
|
790
|
+
return pos;
|
791
|
+
}
|
792
|
+
|
793
|
+
|
794
|
+
/**
|
795
|
+
* call-seq:
|
796
|
+
* str.rindex(substring [, fixnum]) => fixnum or nil
|
797
|
+
* str.rindex(regexp [, fixnum]) => fixnum or nil
|
798
|
+
*
|
799
|
+
* Returns the index of the last occurrence of the given <i>substring</i>,
|
800
|
+
* or pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
|
801
|
+
* found. If the second parameter is present, it specifies the position in the
|
802
|
+
* string to end the search---characters beyond this point will not be considered.
|
803
|
+
*
|
804
|
+
* "hello".u.rindex('e'.u) #=> 1
|
805
|
+
* "hello".u.rindex('l'.u) #=> 3
|
806
|
+
* "hello".u.rindex('a'.u) #=> nil
|
807
|
+
* "hello".u.rindex(/[aeiou]/.U, -2) #=> 1
|
808
|
+
*/
|
809
|
+
|
810
|
+
VALUE
|
811
|
+
icu_ustr_rindex_m(argc, argv, str)
|
812
|
+
int argc;
|
813
|
+
VALUE *argv;
|
814
|
+
VALUE str;
|
815
|
+
{
|
816
|
+
VALUE sub;
|
817
|
+
VALUE position;
|
818
|
+
long pos;
|
819
|
+
|
820
|
+
if (rb_scan_args(argc, argv, "11", &sub, &position) == 2) {
|
821
|
+
pos = NUM2LONG(position);
|
822
|
+
if (pos < 0) {
|
823
|
+
pos += ICU_LEN(str);
|
824
|
+
if (pos < 0) {
|
825
|
+
return Qnil;
|
826
|
+
}
|
827
|
+
}
|
828
|
+
if (pos > ICU_LEN(str))
|
829
|
+
pos = ICU_LEN(str);
|
830
|
+
} else {
|
831
|
+
pos = ICU_LEN(str);
|
832
|
+
}
|
833
|
+
|
834
|
+
switch (TYPE(sub)) {
|
835
|
+
case T_DATA:
|
836
|
+
if (CLASS_OF(sub) == rb_cUString) {
|
837
|
+
pos = icu_ustr_rindex(str, sub, pos);
|
838
|
+
if (pos >= 0)
|
839
|
+
return LONG2NUM(pos);
|
840
|
+
break;
|
841
|
+
}
|
842
|
+
if (CLASS_OF(sub) == rb_cURegexp) {
|
843
|
+
pos = icu_reg_search(sub, str, pos, 1);
|
844
|
+
if (pos >= 0)
|
845
|
+
return LONG2NUM(pos);
|
846
|
+
break;
|
847
|
+
}
|
848
|
+
|
849
|
+
default:
|
850
|
+
rb_raise(rb_eTypeError, "type mismatch: %s given",
|
851
|
+
rb_obj_classname(sub));
|
852
|
+
}
|
853
|
+
return Qnil;
|
854
|
+
}
|
855
|
+
|
856
|
+
/**
|
857
|
+
* call-seq:
|
858
|
+
* str.lstrip! => self or nil
|
859
|
+
*
|
860
|
+
* Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
|
861
|
+
* change was made. See also <code>UString#rstrip!</code> and
|
862
|
+
* <code>UString#strip!</code>, in all these methods whitespace is an
|
863
|
+
* Unicode char that has White_Space property.
|
864
|
+
*
|
865
|
+
* " hello ".u.lstrip #=> "hello "
|
866
|
+
* "hello".u.lstrip! #=> nil
|
867
|
+
*/
|
868
|
+
|
869
|
+
VALUE
|
870
|
+
icu_ustr_lstrip_bang(str)
|
871
|
+
VALUE str;
|
872
|
+
{
|
873
|
+
UChar *s;
|
874
|
+
int32_t i,
|
875
|
+
n,
|
876
|
+
c;
|
877
|
+
icu_check_frozen(str);
|
878
|
+
s = ICU_PTR(str);
|
879
|
+
n = ICU_LEN(str);
|
880
|
+
if (!s || n == 0)
|
881
|
+
return Qnil;
|
882
|
+
/*
|
883
|
+
* remove spaces at head
|
884
|
+
*/
|
885
|
+
i = 0;
|
886
|
+
U16_GET(s, 0, i, n, c); /* care about surrogates */
|
887
|
+
while (i < n && u_isUWhiteSpace(c)) {
|
888
|
+
U16_NEXT(s, i, n, c); /* care surr */
|
889
|
+
}
|
890
|
+
|
891
|
+
if (i > 0) {
|
892
|
+
if(! u_isUWhiteSpace(c)) --i;
|
893
|
+
ICU_LEN(str) = n - i;
|
894
|
+
u_memmove(ICU_PTR(str), s + i, ICU_LEN(str));
|
895
|
+
ICU_PTR(str)[ICU_LEN(str)] = 0;
|
896
|
+
return str;
|
897
|
+
}
|
898
|
+
return Qnil;
|
899
|
+
}
|
900
|
+
|
901
|
+
|
902
|
+
/**
|
903
|
+
* call-seq:
|
904
|
+
* str.lstrip => new_str
|
905
|
+
*
|
906
|
+
* Returns a copy of <i>str</i> with leading whitespace removed. See also
|
907
|
+
* <code>UString#rstrip</code> and <code>UString#strip</code>.
|
908
|
+
*
|
909
|
+
* " hello ".u.lstrip #=> "hello "
|
910
|
+
* "hello".u.lstrip #=> "hello"
|
911
|
+
*/
|
912
|
+
|
913
|
+
VALUE
|
914
|
+
icu_ustr_lstrip(str)
|
915
|
+
VALUE str;
|
916
|
+
{
|
917
|
+
str = icu_ustr_dup(str);
|
918
|
+
icu_ustr_lstrip_bang(str);
|
919
|
+
return str;
|
920
|
+
}
|
921
|
+
|
922
|
+
|
923
|
+
/**
|
924
|
+
* call-seq:
|
925
|
+
* str.rstrip! => self or nil
|
926
|
+
*
|
927
|
+
* Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
|
928
|
+
* no change was made. See also <code>UString#lstrip!</code> and
|
929
|
+
* <code>UString#strip!</code>.
|
930
|
+
*
|
931
|
+
* " hello ".u.rstrip #=> " hello"
|
932
|
+
* "hello".u.rstrip! #=> nil
|
933
|
+
*/
|
934
|
+
|
935
|
+
VALUE
|
936
|
+
icu_ustr_rstrip_bang(str)
|
937
|
+
VALUE str;
|
938
|
+
{
|
939
|
+
UChar *s;
|
940
|
+
int32_t i,
|
941
|
+
n,
|
942
|
+
c;
|
943
|
+
|
944
|
+
icu_check_frozen(str);
|
945
|
+
s = ICU_PTR(str);
|
946
|
+
n = ICU_LEN(str);
|
947
|
+
|
948
|
+
if (!s || n == 0)
|
949
|
+
return Qnil;
|
950
|
+
i = n - 1;
|
951
|
+
|
952
|
+
U16_GET(s, 0, n - 1, n, c); /* care surrogates */
|
953
|
+
i = n;
|
954
|
+
/*
|
955
|
+
* remove trailing spaces
|
956
|
+
*/
|
957
|
+
while (i > 0 && u_isUWhiteSpace(c)) {
|
958
|
+
U16_PREV(s, 0, i, c); /* care surrogates */
|
959
|
+
}
|
960
|
+
|
961
|
+
if (i < n) {
|
962
|
+
if(! u_isUWhiteSpace(c)) ++i;
|
963
|
+
ICU_LEN(str) = i;
|
964
|
+
ICU_PTR(str)[i] = 0;
|
965
|
+
return str;
|
966
|
+
}
|
967
|
+
return Qnil;
|
968
|
+
}
|
969
|
+
|
970
|
+
|
971
|
+
/**
|
972
|
+
* call-seq:
|
973
|
+
* str.rstrip => new_str
|
974
|
+
*
|
975
|
+
* Returns a copy of <i>str</i> with trailing whitespace removed. See also
|
976
|
+
* <code>UString#lstrip</code> and <code>UString#strip</code>.
|
977
|
+
*
|
978
|
+
* " hello ".u.rstrip #=> " hello"
|
979
|
+
* "hello".u.rstrip #=> "hello"
|
980
|
+
*/
|
981
|
+
|
982
|
+
VALUE
|
983
|
+
icu_ustr_rstrip(str)
|
984
|
+
VALUE str;
|
985
|
+
{
|
986
|
+
str = icu_ustr_dup(str);
|
987
|
+
icu_ustr_rstrip_bang(str);
|
988
|
+
return str;
|
989
|
+
}
|
990
|
+
|
991
|
+
|
992
|
+
/**
|
993
|
+
* call-seq:
|
994
|
+
* str.strip! => str or nil
|
995
|
+
*
|
996
|
+
* Removes leading and trailing whitespace from <i>str</i>. Returns
|
997
|
+
* <code>nil</code> if <i>str</i> was not altered.
|
998
|
+
*/
|
999
|
+
|
1000
|
+
VALUE
|
1001
|
+
icu_ustr_strip_bang(str)
|
1002
|
+
VALUE str;
|
1003
|
+
{
|
1004
|
+
VALUE l = icu_ustr_lstrip_bang(str);
|
1005
|
+
VALUE r = icu_ustr_rstrip_bang(str);
|
1006
|
+
|
1007
|
+
if (NIL_P(l) && NIL_P(r))
|
1008
|
+
return Qnil;
|
1009
|
+
return str;
|
1010
|
+
}
|
1011
|
+
|
1012
|
+
|
1013
|
+
/**
|
1014
|
+
* call-seq:
|
1015
|
+
* str.strip => new_str
|
1016
|
+
*
|
1017
|
+
* Returns a copy of <i>str</i> with leading and trailing whitespace removed.
|
1018
|
+
*
|
1019
|
+
* " hello ".u.strip #=> "hello"
|
1020
|
+
* "\tgoodbye\r\n".u.strip #=> "goodbye"
|
1021
|
+
*/
|
1022
|
+
|
1023
|
+
VALUE
|
1024
|
+
icu_ustr_strip(str)
|
1025
|
+
VALUE str;
|
1026
|
+
{
|
1027
|
+
str = icu_ustr_dup(str);
|
1028
|
+
icu_ustr_strip_bang(str);
|
1029
|
+
return str;
|
1030
|
+
}
|
1031
|
+
|
1032
|
+
|
1033
|
+
|
1034
|
+
/* ----------------------------------- */
|
1035
|
+
VALUE
|
1036
|
+
icu_ustr_normalize(str, mode)
|
1037
|
+
VALUE str;
|
1038
|
+
int32_t mode;
|
1039
|
+
{
|
1040
|
+
UErrorCode error = U_ZERO_ERROR;
|
1041
|
+
long capa = ICU_LEN(str);
|
1042
|
+
UChar *buf;
|
1043
|
+
long needed;
|
1044
|
+
VALUE ret;
|
1045
|
+
if (UNORM_YES == unorm_quickCheck(ICU_PTR(str), ICU_LEN(str), mode, &error))
|
1046
|
+
return icu_ustr_dup(str);
|
1047
|
+
|
1048
|
+
buf = ALLOC_N(UChar, capa + 20);
|
1049
|
+
do {
|
1050
|
+
error = 0;
|
1051
|
+
needed =
|
1052
|
+
unorm_normalize(ICU_PTR(str), ICU_LEN(str), mode, 0, buf, capa,
|
1053
|
+
&error);
|
1054
|
+
if (U_SUCCESS(error)) {
|
1055
|
+
ret = icu_ustr_new_set(buf, needed, capa);
|
1056
|
+
return ret;
|
1057
|
+
}
|
1058
|
+
if (error == U_BUFFER_OVERFLOW_ERROR) {
|
1059
|
+
capa = needed + 1;
|
1060
|
+
REALLOC_N(buf, UChar, capa);
|
1061
|
+
if (!buf)
|
1062
|
+
rb_raise(rb_eRuntimeError, "can't allocate memory");
|
1063
|
+
} else
|
1064
|
+
rb_raise(rb_eArgError, u_errorName(error));
|
1065
|
+
}
|
1066
|
+
while (1);
|
1067
|
+
}
|
1068
|
+
|
1069
|
+
/**
|
1070
|
+
* UNORM_NFKC Compatibility decomposition followed by canonical
|
1071
|
+
* composition.
|
1072
|
+
*/
|
1073
|
+
VALUE
|
1074
|
+
icu_ustr_normalize_KC(str)
|
1075
|
+
VALUE str;
|
1076
|
+
{
|
1077
|
+
return icu_ustr_normalize(str, UNORM_NFKC);
|
1078
|
+
}
|
1079
|
+
|
1080
|
+
/**
|
1081
|
+
* UNORM_NFKD Compatibility decomposition.
|
1082
|
+
*/
|
1083
|
+
VALUE
|
1084
|
+
icu_ustr_normalize_KD(str)
|
1085
|
+
VALUE str;
|
1086
|
+
{
|
1087
|
+
return icu_ustr_normalize(str, UNORM_NFKD);
|
1088
|
+
}
|
1089
|
+
|
1090
|
+
/**
|
1091
|
+
* UNORM_NFD Canonical decomposition.
|
1092
|
+
*/
|
1093
|
+
VALUE
|
1094
|
+
icu_ustr_normalize_D(str)
|
1095
|
+
VALUE str;
|
1096
|
+
{
|
1097
|
+
return icu_ustr_normalize(str, UNORM_NFD);
|
1098
|
+
}
|
1099
|
+
|
1100
|
+
/**
|
1101
|
+
* UNORM_FCD
|
1102
|
+
*/
|
1103
|
+
VALUE
|
1104
|
+
icu_ustr_normalize_FCD(VALUE str)
|
1105
|
+
{
|
1106
|
+
return icu_ustr_normalize(str, UNORM_FCD);
|
1107
|
+
}
|
1108
|
+
|
1109
|
+
/**
|
1110
|
+
* UNORM_NFC Canonical decomposition followed by canonical composition.
|
1111
|
+
*/
|
1112
|
+
VALUE
|
1113
|
+
icu_ustr_normalize_C(str)
|
1114
|
+
VALUE str;
|
1115
|
+
{
|
1116
|
+
return icu_ustr_normalize(str, UNORM_NFC);
|
1117
|
+
}
|
1118
|
+
|
1119
|
+
/* UBRK_CHARACTER, UBRK_WORD, UBRK_LINE, UBRK_SENTENCE */
|
1120
|
+
VALUE
|
1121
|
+
icu_ustr_each_mode(argc, argv, str, mode)
|
1122
|
+
int argc;
|
1123
|
+
VALUE *argv;
|
1124
|
+
VALUE str;
|
1125
|
+
int32_t mode;
|
1126
|
+
{
|
1127
|
+
UErrorCode error = 0;
|
1128
|
+
UBreakIterator *boundary;
|
1129
|
+
int32_t end, start;
|
1130
|
+
VALUE loc ;
|
1131
|
+
char *locale = "";
|
1132
|
+
if( rb_scan_args(argc, argv, "01", &loc) == 1) {
|
1133
|
+
Check_Type(loc, T_STRING);
|
1134
|
+
locale = RSTRING(loc)->ptr;
|
1135
|
+
}
|
1136
|
+
boundary =
|
1137
|
+
ubrk_open(mode, locale, ICU_PTR(str), ICU_LEN(str),
|
1138
|
+
&error);
|
1139
|
+
if (U_FAILURE(error))
|
1140
|
+
rb_raise(rb_eArgError, "Error %s", u_errorName(error));
|
1141
|
+
start = ubrk_first(boundary);
|
1142
|
+
USTRING(str)->busy = 1;
|
1143
|
+
for (end = ubrk_next(boundary); end != UBRK_DONE;
|
1144
|
+
start = end, end = ubrk_next(boundary)) {
|
1145
|
+
rb_yield(icu_ustr_new(ICU_PTR(str) + start, end - start));
|
1146
|
+
}
|
1147
|
+
USTRING(str)->busy = 0;
|
1148
|
+
ubrk_close(boundary);
|
1149
|
+
return str;
|
1150
|
+
}
|
1151
|
+
|
1152
|
+
/**
|
1153
|
+
* call-seq:
|
1154
|
+
* str.each_word(locale = "") {|substr| block } => str
|
1155
|
+
*
|
1156
|
+
* Word boundary analysis is used by search and replace functions, as well as within text editing
|
1157
|
+
* applications that allow the user to select words with a double click. Word selection provides
|
1158
|
+
* correct interpretation of punctuation marks within and following words. Characters that are not
|
1159
|
+
* part of a word, such as symbols or punctuation marks, have word-breaks on both sides.
|
1160
|
+
*
|
1161
|
+
*/
|
1162
|
+
VALUE
|
1163
|
+
icu_ustr_each_word(argc, argv, str)
|
1164
|
+
int argc;
|
1165
|
+
VALUE *argv;
|
1166
|
+
VALUE str;
|
1167
|
+
|
1168
|
+
{
|
1169
|
+
return icu_ustr_each_mode(argc, argv, str, UBRK_WORD);
|
1170
|
+
}
|
1171
|
+
|
1172
|
+
/**
|
1173
|
+
* call-seq:
|
1174
|
+
* str.each_char(locale = "") {|substr| block } => str
|
1175
|
+
*
|
1176
|
+
* Character boundary analysis allows users to interact with characters as they expect to,
|
1177
|
+
* for example, when moving the cursor through a text string. Character boundary analysis provides
|
1178
|
+
* correct navigation of through character strings, regardless of how the character is stored.
|
1179
|
+
* For example, an accented character might be stored as a base character and a diacritical mark.
|
1180
|
+
* What users consider to be a character can differ between languages.
|
1181
|
+
*
|
1182
|
+
*/
|
1183
|
+
VALUE
|
1184
|
+
icu_ustr_each_char(argc, argv, str)
|
1185
|
+
int argc;
|
1186
|
+
VALUE *argv;
|
1187
|
+
VALUE str;
|
1188
|
+
|
1189
|
+
{
|
1190
|
+
return icu_ustr_each_mode(argc, argv, str, UBRK_CHARACTER);
|
1191
|
+
}
|
1192
|
+
|
1193
|
+
/**
|
1194
|
+
* call-seq:
|
1195
|
+
* str.each_line_break(locale = "") {|substr| block } => str
|
1196
|
+
*
|
1197
|
+
* Line boundary analysis determines where a text string can be broken when line-wrapping.
|
1198
|
+
* The mechanism correctly handles punctuation and hyphenated words.
|
1199
|
+
*
|
1200
|
+
*/
|
1201
|
+
VALUE
|
1202
|
+
icu_ustr_each_line(argc, argv, str)
|
1203
|
+
int argc;
|
1204
|
+
VALUE *argv;
|
1205
|
+
VALUE str;
|
1206
|
+
|
1207
|
+
{
|
1208
|
+
return icu_ustr_each_mode(argc, argv, str, UBRK_LINE);
|
1209
|
+
}
|
1210
|
+
|
1211
|
+
/**
|
1212
|
+
* call-seq:
|
1213
|
+
* str.each_sentence(locale = "") {|substr| block } => str
|
1214
|
+
*
|
1215
|
+
* Sentence boundary analysis allows selection with correct interpretation of periods
|
1216
|
+
* within numbers and abbreviations, and trailing punctuation marks such as quotation marks and parentheses.
|
1217
|
+
*
|
1218
|
+
*/
|
1219
|
+
VALUE
|
1220
|
+
icu_ustr_each_sentence(argc, argv, str)
|
1221
|
+
int argc;
|
1222
|
+
VALUE *argv;
|
1223
|
+
VALUE str;
|
1224
|
+
{
|
1225
|
+
return icu_ustr_each_mode(argc, argv, str, UBRK_SENTENCE);
|
1226
|
+
}
|
1227
|
+
|
1228
|
+
/**
|
1229
|
+
* call-seq:
|
1230
|
+
* str.to_u(encoding = 'utf8') => UString
|
1231
|
+
*
|
1232
|
+
* Returns self.
|
1233
|
+
*/
|
1234
|
+
VALUE
|
1235
|
+
icu_ustr_to_ustr(argc, argv, str)
|
1236
|
+
int argc;
|
1237
|
+
VALUE *argv;
|
1238
|
+
VALUE str;
|
1239
|
+
{
|
1240
|
+
return str;
|
1241
|
+
}
|
1242
|
+
|
1243
|
+
/**
|
1244
|
+
* call-seq:
|
1245
|
+
* str.to_s(encoding = 'utf8') => String
|
1246
|
+
*
|
1247
|
+
* Converts to Ruby String (byte-oriented) value in given encoding.
|
1248
|
+
* When no encoding is given, assumes UTF-8.
|
1249
|
+
*/
|
1250
|
+
VALUE
|
1251
|
+
icu_ustr_to_rstr(argc, argv, str)
|
1252
|
+
int argc;
|
1253
|
+
VALUE *argv,
|
1254
|
+
str;
|
1255
|
+
{
|
1256
|
+
VALUE enc;
|
1257
|
+
char *encoding = 0; /* default */
|
1258
|
+
UErrorCode error = 0;
|
1259
|
+
UConverter *conv ;
|
1260
|
+
int enclen, needed = 0;
|
1261
|
+
char * buf;
|
1262
|
+
VALUE s;
|
1263
|
+
if (rb_scan_args(argc, argv, "01", &enc) == 1) {
|
1264
|
+
Check_Type(enc, T_STRING);
|
1265
|
+
encoding = RSTRING(enc)->ptr;
|
1266
|
+
}
|
1267
|
+
|
1268
|
+
enclen = ICU_LEN(str) + 1;
|
1269
|
+
buf = ALLOC_N(char, enclen);
|
1270
|
+
|
1271
|
+
if( !encoding || !strncmp(encoding, "utf8", 4)){
|
1272
|
+
u_strToUTF8( buf, enclen, &needed, ICU_PTR(str), ICU_LEN(str), &error);
|
1273
|
+
if (U_BUFFER_OVERFLOW_ERROR == error) {
|
1274
|
+
REALLOC_N(buf, char, needed + 1);
|
1275
|
+
error = 0;
|
1276
|
+
u_strToUTF8( buf, needed, &needed, ICU_PTR(str), ICU_LEN(str), &error);
|
1277
|
+
}
|
1278
|
+
if( U_FAILURE(error) ){
|
1279
|
+
free(buf);
|
1280
|
+
rb_raise(rb_eArgError, u_errorName(error));
|
1281
|
+
}
|
1282
|
+
s = rb_str_new(buf, needed);
|
1283
|
+
|
1284
|
+
} else {
|
1285
|
+
conv = ucnv_open(encoding, &error);
|
1286
|
+
if (U_FAILURE(error)) {
|
1287
|
+
ucnv_close(conv);
|
1288
|
+
free(buf);
|
1289
|
+
rb_raise(rb_eArgError, u_errorName(error));
|
1290
|
+
}
|
1291
|
+
enclen =
|
1292
|
+
ucnv_fromUChars(conv, buf, enclen, ICU_PTR(str), ICU_LEN(str),
|
1293
|
+
&error);
|
1294
|
+
if (U_BUFFER_OVERFLOW_ERROR == error) {
|
1295
|
+
REALLOC_N(buf, char, enclen + 1);
|
1296
|
+
error = 0;
|
1297
|
+
ucnv_fromUChars(conv, buf, enclen, ICU_PTR(str), ICU_LEN(str),
|
1298
|
+
&error);
|
1299
|
+
}
|
1300
|
+
if( U_FAILURE(error) ){
|
1301
|
+
free(buf);
|
1302
|
+
rb_raise(rb_eArgError, u_errorName(error));
|
1303
|
+
}
|
1304
|
+
s = rb_str_new(buf, enclen);
|
1305
|
+
ucnv_close(conv);
|
1306
|
+
}
|
1307
|
+
free(buf);
|
1308
|
+
return s;
|
1309
|
+
}
|
1310
|
+
|
1311
|
+
/* -------------- */
|
1312
|
+
extern VALUE icu_format(UChar * pattern, int32_t len, VALUE args,
|
1313
|
+
int32_t arg_len, char *locale);
|
1314
|
+
/**
|
1315
|
+
* call-seq:
|
1316
|
+
* str.format(locale, [*args])
|
1317
|
+
*
|
1318
|
+
* Powerful locale-sensitive message formatting. see [./docs/FORMATTING]
|
1319
|
+
*
|
1320
|
+
* Valid argument types are: +Fixnum+, +UString+, +Float+, +Time+ .
|
1321
|
+
*
|
1322
|
+
* */
|
1323
|
+
VALUE
|
1324
|
+
icu_ustr_format(str, args)
|
1325
|
+
VALUE str,
|
1326
|
+
args;
|
1327
|
+
{
|
1328
|
+
VALUE loc;
|
1329
|
+
Check_Type(args, T_ARRAY);
|
1330
|
+
loc = rb_ary_shift(args);
|
1331
|
+
Check_Type(loc, T_STRING);
|
1332
|
+
return icu_format(ICU_PTR(str), ICU_LEN(str), args, RARRAY(args)->len,
|
1333
|
+
RSTRING(loc)->ptr);
|
1334
|
+
}
|
1335
|
+
|
1336
|
+
/* ------ UString regexp related functions ---- */
|
1337
|
+
|
1338
|
+
/**
|
1339
|
+
* call-seq:
|
1340
|
+
* str =~ uregexp => UMatch or nil
|
1341
|
+
* str =~ other_str => integer or nil
|
1342
|
+
*
|
1343
|
+
* Match---If <code>URegexp</code> is given, use it as a pattern to
|
1344
|
+
* match against <i>uregexp</i> and return UMatch or +nil+.
|
1345
|
+
*
|
1346
|
+
* If <code>UString</code> is given, returns index of it
|
1347
|
+
* (similar to <code>UString#index</code>).
|
1348
|
+
*
|
1349
|
+
* Otherwise returns +nil+
|
1350
|
+
*
|
1351
|
+
* "cat o' 9 tails".u =~ '\d' #=> nil
|
1352
|
+
* "cat o' 9 tails".u =~ /\d/.U #=> #<UMatch:0xf6fb7d5c @cg=[<U000039>]>
|
1353
|
+
* "cat o' 9 tails".u =~ 9 #=> false
|
1354
|
+
* "cat o' 9 tails".u =~ '9'.u #=> 7
|
1355
|
+
*/
|
1356
|
+
|
1357
|
+
VALUE
|
1358
|
+
icu_ustr_match(x, y)
|
1359
|
+
VALUE x,
|
1360
|
+
y;
|
1361
|
+
{
|
1362
|
+
long pos ;
|
1363
|
+
if (TYPE(y) == T_REGEXP){
|
1364
|
+
rb_raise(rb_eTypeError, "Wrong type: can't match against Regexp. Use URegexp instead");
|
1365
|
+
}
|
1366
|
+
if (CLASS_OF(y) == rb_cURegexp) {
|
1367
|
+
return icu_reg_match(y, x);
|
1368
|
+
} else if (CLASS_OF(y) == rb_cUString) {
|
1369
|
+
pos = icu_ustr_index(x, y, 0);
|
1370
|
+
if (pos == -1) return Qnil;
|
1371
|
+
else return LONG2NUM(pos);
|
1372
|
+
} else {
|
1373
|
+
return Qnil;
|
1374
|
+
}
|
1375
|
+
}
|
1376
|
+
|
1377
|
+
VALUE
|
1378
|
+
get_pat(pat, quote)
|
1379
|
+
VALUE pat;
|
1380
|
+
int quote;
|
1381
|
+
{
|
1382
|
+
if (CLASS_OF(pat) == rb_cURegexp)
|
1383
|
+
return pat;
|
1384
|
+
|
1385
|
+
if (CLASS_OF(pat) == rb_cUString)
|
1386
|
+
return icu_reg_comp(pat);
|
1387
|
+
Check_Class(pat, rb_cURegexp);
|
1388
|
+
return Qnil;
|
1389
|
+
}
|
1390
|
+
|
1391
|
+
|
1392
|
+
/**
|
1393
|
+
* call-seq:
|
1394
|
+
* str.match(pattern) => matchdata or nil
|
1395
|
+
*
|
1396
|
+
* Converts <i>pattern</i> to a <code>URegexp</code> (if it isn't already one),
|
1397
|
+
* then invokes its <code>match</code> method on <i>str</i>.
|
1398
|
+
*
|
1399
|
+
* 'hello'.u.match('(.)\1'.u) #=> #<UMatch:0x401b3d30>
|
1400
|
+
* 'hello'.u.match('(.)\1'.u)[0] #=> "ll"
|
1401
|
+
* 'hello'.u.match(/(.)\1/.U)[0] #=> "ll"
|
1402
|
+
* 'hello'.u.match('xx') #=> nil
|
1403
|
+
*/
|
1404
|
+
|
1405
|
+
VALUE
|
1406
|
+
icu_ustr_match_m(str, re)
|
1407
|
+
VALUE str,
|
1408
|
+
re;
|
1409
|
+
{
|
1410
|
+
return rb_funcall(get_pat(re, 0), rb_intern("match"), 1, str);
|
1411
|
+
}
|
1412
|
+
|
1413
|
+
VALUE
|
1414
|
+
ustr_scan_once(str, pat, start)
|
1415
|
+
VALUE str,
|
1416
|
+
pat;
|
1417
|
+
long *start;
|
1418
|
+
{
|
1419
|
+
VALUE result;
|
1420
|
+
long i;
|
1421
|
+
long beg,
|
1422
|
+
end, num_regs;
|
1423
|
+
|
1424
|
+
if (icu_reg_search(pat, str, *start, 0) >= 0) {
|
1425
|
+
icu_reg_range(pat, 0, &beg, &end);
|
1426
|
+
if (beg == end) {
|
1427
|
+
*start = end + 1;
|
1428
|
+
} else {
|
1429
|
+
*start = end;
|
1430
|
+
}
|
1431
|
+
num_regs = icu_group_count(pat);
|
1432
|
+
if (num_regs <= 1) {
|
1433
|
+
return icu_reg_nth_match(pat, 0);
|
1434
|
+
}
|
1435
|
+
result = rb_ary_new2(num_regs);
|
1436
|
+
for (i = 1; i <= num_regs; i++) {
|
1437
|
+
rb_ary_store(result, i - 1, icu_reg_nth_match(pat, i));
|
1438
|
+
}
|
1439
|
+
|
1440
|
+
return result;
|
1441
|
+
}
|
1442
|
+
return Qnil;
|
1443
|
+
}
|
1444
|
+
|
1445
|
+
|
1446
|
+
/**
|
1447
|
+
* call-seq:
|
1448
|
+
* str.scan(pattern) => array
|
1449
|
+
* str.scan(pattern) {|match, ...| block } => str
|
1450
|
+
*
|
1451
|
+
* Both forms iterate through <i>str</i>, matching the pattern (which may be a
|
1452
|
+
* <code>URegexp</code> or a <code>UString</code>). For each match, a result is
|
1453
|
+
* generated and either added to the result array or passed to the block. If
|
1454
|
+
* the pattern contains no groups, each individual result consists of the
|
1455
|
+
* matched string. If the pattern contains groups, each
|
1456
|
+
* individual result is itself an array containing one entry per group.
|
1457
|
+
*
|
1458
|
+
* a = "cruel world".u
|
1459
|
+
* a.scan(/\w+/.U) #=> ["cruel", "world"]
|
1460
|
+
* a.scan(/.../.U) #=> ["cru", "el ", "wor"]
|
1461
|
+
* a.scan(/(...)/.U) #=> [["cru"], ["el "], ["wor"]]
|
1462
|
+
* a.scan(/(..)(..)/.U) #=> [["cr", "ue"], ["l ", "wo"]]
|
1463
|
+
*
|
1464
|
+
* And the block form:
|
1465
|
+
*
|
1466
|
+
* a.scan(/\w+/.U) {|w| print "<<#{w}>> " }
|
1467
|
+
* print "\n"
|
1468
|
+
* a.scan(/(.)(.)/.U) {|a,b| print b, a }
|
1469
|
+
* print "\n"
|
1470
|
+
*
|
1471
|
+
* <em>produces:</em>
|
1472
|
+
*
|
1473
|
+
* <<cruel>> <<world>>
|
1474
|
+
* rceu lowlr
|
1475
|
+
*/
|
1476
|
+
|
1477
|
+
VALUE
|
1478
|
+
icu_ustr_scan(str, pat)
|
1479
|
+
VALUE str,
|
1480
|
+
pat;
|
1481
|
+
{
|
1482
|
+
VALUE result;
|
1483
|
+
long start = 0;
|
1484
|
+
|
1485
|
+
pat = get_pat(pat, 1);
|
1486
|
+
if (!rb_block_given_p()) {
|
1487
|
+
VALUE ary = rb_ary_new();
|
1488
|
+
|
1489
|
+
while (!NIL_P(result = ustr_scan_once(str, pat, &start))) {
|
1490
|
+
rb_ary_push(ary, result);
|
1491
|
+
}
|
1492
|
+
return ary;
|
1493
|
+
}
|
1494
|
+
USTRING(str)->busy = 1;
|
1495
|
+
while (!NIL_P(result = ustr_scan_once(str, pat, &start))) {
|
1496
|
+
rb_yield(result);
|
1497
|
+
}
|
1498
|
+
USTRING(str)->busy = 0;
|
1499
|
+
return str;
|
1500
|
+
}
|
1501
|
+
/**
|
1502
|
+
* call-seq:
|
1503
|
+
* str.char_span(start[, len, [locale]])
|
1504
|
+
*
|
1505
|
+
* Returns substring starting at <code>start</code>-th char, with <code>len</code> chars length.
|
1506
|
+
* Here "char" means "grapheme cluster", so start index and len are measured in terms of "graphemes"
|
1507
|
+
* locale parameter is optional.
|
1508
|
+
* Negative len can be supplied to receive to end of string.
|
1509
|
+
*
|
1510
|
+
* String is transformed to NFC before extract.
|
1511
|
+
*/
|
1512
|
+
VALUE
|
1513
|
+
icu_ustr_char_span(int argc, VALUE * argv, VALUE str)
|
1514
|
+
{
|
1515
|
+
UErrorCode error = 0;
|
1516
|
+
int32_t end, start, char_start = 0, char_len = -1, total_chars = 0;
|
1517
|
+
int32_t init_pos = -1, end_pos = -1, n;
|
1518
|
+
char *loc = NULL;
|
1519
|
+
VALUE cs, clen, locl, out;
|
1520
|
+
UBreakIterator *boundary;
|
1521
|
+
|
1522
|
+
n = rb_scan_args(argc, argv, "12", &cs, &clen, &locl);
|
1523
|
+
Check_Type(cs, T_FIXNUM);
|
1524
|
+
char_start = FIX2INT(cs);
|
1525
|
+
if(char_start < 0) rb_raise(rb_eArgError, "Negative offset aren't allowed!");
|
1526
|
+
|
1527
|
+
if( n > 1) {
|
1528
|
+
Check_Type(clen, T_FIXNUM);
|
1529
|
+
char_len = FIX2INT(clen);
|
1530
|
+
if(char_len <= 0) char_len = -1;
|
1531
|
+
}
|
1532
|
+
if( n > 2) {
|
1533
|
+
Check_Type(locl, T_STRING);
|
1534
|
+
loc = RSTRING(locl)->ptr;
|
1535
|
+
}
|
1536
|
+
if(UNORM_YES != unorm_quickCheck(ICU_PTR(str), ICU_LEN(str), UNORM_NFC, &error) )
|
1537
|
+
str = icu_ustr_normalize_C(str);
|
1538
|
+
|
1539
|
+
boundary =
|
1540
|
+
ubrk_open(UBRK_CHARACTER, loc, ICU_PTR(str), ICU_LEN(str), &error);
|
1541
|
+
if (U_FAILURE(error))
|
1542
|
+
rb_raise(rb_eArgError, "Error %s", u_errorName(error));
|
1543
|
+
|
1544
|
+
start = ubrk_first(boundary);
|
1545
|
+
for (end = ubrk_next(boundary); end != UBRK_DONE;
|
1546
|
+
start = end, end = ubrk_next(boundary)) {
|
1547
|
+
if( total_chars == char_start ) init_pos = start;
|
1548
|
+
total_chars ++;
|
1549
|
+
if( char_len>0 && total_chars == char_start+char_len) end_pos = end;
|
1550
|
+
}
|
1551
|
+
ubrk_close(boundary);
|
1552
|
+
if( init_pos == -1) rb_raise(rb_eArgError, "Char index %d out of bounds %d", char_start, total_chars);
|
1553
|
+
if( end_pos == -1) end_pos = ICU_LEN(str); /* reached end of string */
|
1554
|
+
out = icu_ustr_new(ICU_PTR(str)+init_pos, end_pos - init_pos);
|
1555
|
+
return out;
|
1556
|
+
}
|
1557
|
+
|
1558
|
+
VALUE
|
1559
|
+
icu_ustr_chars(str, loc)
|
1560
|
+
VALUE str;
|
1561
|
+
char *loc;
|
1562
|
+
{
|
1563
|
+
UErrorCode error = 0;
|
1564
|
+
int32_t end, start;
|
1565
|
+
VALUE out;
|
1566
|
+
UBreakIterator *boundary;
|
1567
|
+
if(UNORM_YES != unorm_quickCheck(ICU_PTR(str), ICU_LEN(str), UNORM_NFC, &error) )
|
1568
|
+
str = icu_ustr_normalize_C(str);
|
1569
|
+
|
1570
|
+
boundary =
|
1571
|
+
ubrk_open(UBRK_CHARACTER, loc, ICU_PTR(str), ICU_LEN(str), &error);
|
1572
|
+
if (U_FAILURE(error))
|
1573
|
+
rb_raise(rb_eArgError, "Error %s", u_errorName(error));
|
1574
|
+
|
1575
|
+
out = rb_ary_new();
|
1576
|
+
start = ubrk_first(boundary);
|
1577
|
+
for (end = ubrk_next(boundary); end != UBRK_DONE;
|
1578
|
+
start = end, end = ubrk_next(boundary)) {
|
1579
|
+
rb_ary_push(out, icu_ustr_new(ICU_PTR(str) + start, end - start));
|
1580
|
+
}
|
1581
|
+
ubrk_close(boundary);
|
1582
|
+
return out;
|
1583
|
+
}
|
1584
|
+
|
1585
|
+
/**
|
1586
|
+
* call-seq:
|
1587
|
+
* str.chars(locale = "") => array of character
|
1588
|
+
*
|
1589
|
+
* Returns array of character graphemes, locale dependent.
|
1590
|
+
* String is transformed to NFC before split.
|
1591
|
+
* */
|
1592
|
+
VALUE
|
1593
|
+
icu_ustr_chars_m(argc, argv, str)
|
1594
|
+
int argc;
|
1595
|
+
VALUE *argv;
|
1596
|
+
VALUE str;
|
1597
|
+
{
|
1598
|
+
VALUE locale;
|
1599
|
+
if (rb_scan_args(argc, argv, "01", &locale) == 1) {
|
1600
|
+
Check_Type(locale, T_STRING);
|
1601
|
+
return icu_ustr_chars(str, RSTRING(locale)->ptr);
|
1602
|
+
} else {
|
1603
|
+
return icu_ustr_chars(str, "");
|
1604
|
+
}
|
1605
|
+
}
|
1606
|
+
|
1607
|
+
/**
|
1608
|
+
* call-seq:
|
1609
|
+
* str.split(pattern, [limit]) => anArray
|
1610
|
+
*
|
1611
|
+
* Divides <i>str</i> into substrings based on a delimiter, returning an array
|
1612
|
+
* of these substrings. <i>str</i> is divided where the
|
1613
|
+
* pattern matches.
|
1614
|
+
*
|
1615
|
+
* NOTE: split(//) or split("") is not supported.
|
1616
|
+
* To get array of chars use #chars or #codepoints methods
|
1617
|
+
*
|
1618
|
+
* If the <i>limit</i> parameter is omitted, trailing null fields are
|
1619
|
+
* suppressed. If <i>limit</i> is a positive number, at most that number of
|
1620
|
+
* fields will be returned (if <i>limit</i> is <code>1</code>, the entire
|
1621
|
+
* string is returned as the only entry in an array). If negative, there is no
|
1622
|
+
* limit to the number of fields returned, and trailing null fields are not
|
1623
|
+
* suppressed.
|
1624
|
+
*
|
1625
|
+
*/
|
1626
|
+
|
1627
|
+
VALUE
|
1628
|
+
icu_ustr_split_m(argc, argv, str)
|
1629
|
+
int argc;
|
1630
|
+
VALUE *argv;
|
1631
|
+
VALUE str;
|
1632
|
+
{
|
1633
|
+
VALUE spat;
|
1634
|
+
VALUE limit;
|
1635
|
+
int lim = 0;
|
1636
|
+
VALUE result;
|
1637
|
+
|
1638
|
+
if (rb_scan_args(argc, argv, "11", &spat, &limit) == 2) {
|
1639
|
+
lim = NUM2INT(limit);
|
1640
|
+
if (lim <= 0)
|
1641
|
+
limit = Qnil;
|
1642
|
+
}
|
1643
|
+
if (CLASS_OF(spat) == rb_cURegexp) {
|
1644
|
+
result = icu_reg_split(spat, str, limit);
|
1645
|
+
} else {
|
1646
|
+
if (CLASS_OF(spat) == rb_cUString) {
|
1647
|
+
result = icu_reg_split(icu_reg_comp(spat), str, limit);
|
1648
|
+
} else {
|
1649
|
+
rb_raise(rb_eArgError, "Expected UString or URegexp, got %s",
|
1650
|
+
rb_class2name(CLASS_OF(spat)));
|
1651
|
+
}
|
1652
|
+
}
|
1653
|
+
if (NIL_P(limit) && lim == 0) {
|
1654
|
+
while (RARRAY(result)->len > 0 &&
|
1655
|
+
ICU_LEN( (RARRAY(result)->ptr[RARRAY(result)->len - 1])) == 0)
|
1656
|
+
rb_ary_pop(result);
|
1657
|
+
}
|
1658
|
+
|
1659
|
+
return result;
|
1660
|
+
}
|
1661
|
+
|
1662
|
+
/**
|
1663
|
+
* call-seq:
|
1664
|
+
* str.inspect => String
|
1665
|
+
*
|
1666
|
+
* Shows codepoints in form of \uxxxx. For debug purposes.
|
1667
|
+
*/
|
1668
|
+
VALUE
|
1669
|
+
icu_ustr_inspect(str)
|
1670
|
+
VALUE str;
|
1671
|
+
{
|
1672
|
+
VALUE buf = rb_str_new2("");
|
1673
|
+
char temp[] = "\\u0010FFFF ";
|
1674
|
+
int32_t i,
|
1675
|
+
n,
|
1676
|
+
k,
|
1677
|
+
c;
|
1678
|
+
UChar *s = ICU_PTR(str);
|
1679
|
+
n = ICU_LEN(str);
|
1680
|
+
i = 0;
|
1681
|
+
while (i < n) {
|
1682
|
+
U16_NEXT(s, i, n, c); /* care surrogate */
|
1683
|
+
if(c >= 0x10000)
|
1684
|
+
k = sprintf(temp, "\\u%08X", c);
|
1685
|
+
else
|
1686
|
+
k = sprintf(temp, "\\u%04X", c);
|
1687
|
+
rb_str_cat(buf, temp, k);
|
1688
|
+
}
|
1689
|
+
return buf;
|
1690
|
+
}
|
1691
|
+
|
1692
|
+
/**
|
1693
|
+
* call-seq:
|
1694
|
+
* str.codepoints => array of fixnums
|
1695
|
+
*
|
1696
|
+
* Returns array of codepoints as fixnums.
|
1697
|
+
*/
|
1698
|
+
VALUE
|
1699
|
+
icu_ustr_points(str)
|
1700
|
+
VALUE str;
|
1701
|
+
{
|
1702
|
+
VALUE buf = rb_ary_new();
|
1703
|
+
int32_t i,
|
1704
|
+
n,
|
1705
|
+
c;
|
1706
|
+
UChar *s = ICU_PTR(str);
|
1707
|
+
n = ICU_LEN(str);
|
1708
|
+
i = 0;
|
1709
|
+
while (i < n) {
|
1710
|
+
U16_NEXT(s, i, n, c); /* care surrogates */
|
1711
|
+
rb_ary_push(buf, LONG2NUM(c));
|
1712
|
+
}
|
1713
|
+
return buf;
|
1714
|
+
}
|
1715
|
+
|
1716
|
+
|
1717
|
+
/**
|
1718
|
+
* call-seq:
|
1719
|
+
* str.inspect_names => String
|
1720
|
+
*
|
1721
|
+
* Dumps names of codepoints in this UString (debug).
|
1722
|
+
*/
|
1723
|
+
VALUE
|
1724
|
+
icu_ustr_inspect_names(str)
|
1725
|
+
VALUE str;
|
1726
|
+
{
|
1727
|
+
VALUE buf = rb_str_new2("");
|
1728
|
+
char temp[301];
|
1729
|
+
UErrorCode error;
|
1730
|
+
int32_t i,
|
1731
|
+
n,
|
1732
|
+
c,
|
1733
|
+
l;
|
1734
|
+
UChar *s = ICU_PTR(str);
|
1735
|
+
n = ICU_LEN(str);
|
1736
|
+
i = 0;
|
1737
|
+
while (i < n) {
|
1738
|
+
U16_NEXT(s, i, n, c) sprintf(temp, "<U%06X>", c); /* care surrogates */
|
1739
|
+
rb_str_cat(buf, temp, 9);
|
1740
|
+
error = 0;
|
1741
|
+
l = u_charName(c, U_UNICODE_CHAR_NAME, temp, 300, &error);
|
1742
|
+
rb_str_cat(buf, temp, l);
|
1743
|
+
rb_str_cat(buf, "\n", 1);
|
1744
|
+
}
|
1745
|
+
return buf;
|
1746
|
+
}
|
1747
|
+
|
1748
|
+
VALUE
|
1749
|
+
icu_ustr_subpat(str, re, nth)
|
1750
|
+
VALUE str,
|
1751
|
+
re;
|
1752
|
+
int nth;
|
1753
|
+
{
|
1754
|
+
if (icu_reg_search(re, str, 0, 0) >= 0) {
|
1755
|
+
return icu_reg_nth_match(re, nth);
|
1756
|
+
}
|
1757
|
+
return Qnil;
|
1758
|
+
}
|
1759
|
+
|
1760
|
+
/* beg len are code unit indexes*/
|
1761
|
+
VALUE
|
1762
|
+
icu_ustr_substr(str, beg, len)
|
1763
|
+
VALUE str;
|
1764
|
+
long beg,
|
1765
|
+
len;
|
1766
|
+
{
|
1767
|
+
int32_t str_size;
|
1768
|
+
str_size = ICU_LEN(str);
|
1769
|
+
if (len < 0) return Qnil;
|
1770
|
+
|
1771
|
+
if (beg > str_size) return Qnil;
|
1772
|
+
if (beg < 0) {
|
1773
|
+
beg += str_size;
|
1774
|
+
if (beg < 0) return Qnil;
|
1775
|
+
}
|
1776
|
+
if (beg + len > str_size) {
|
1777
|
+
len = str_size - beg;
|
1778
|
+
}
|
1779
|
+
if (len < 0) {
|
1780
|
+
len = 0;
|
1781
|
+
}
|
1782
|
+
if( len == 0) return icu_ustr_new(0, 0);
|
1783
|
+
/* adjust to codepoint boundaries */
|
1784
|
+
U16_SET_CP_START(ICU_PTR(str), 0, beg);
|
1785
|
+
U16_SET_CP_LIMIT(ICU_PTR(str), 0, len, ICU_LEN(str));
|
1786
|
+
return icu_ustr_new(ICU_PTR(str) + beg, len);
|
1787
|
+
}
|
1788
|
+
|
1789
|
+
VALUE
|
1790
|
+
icu_ustr_aref(str, indx)
|
1791
|
+
VALUE str;
|
1792
|
+
VALUE indx;
|
1793
|
+
{
|
1794
|
+
long idx;
|
1795
|
+
int32_t cp_len = ICU_LEN(str);
|
1796
|
+
|
1797
|
+
switch (TYPE(indx)) {
|
1798
|
+
case T_FIXNUM:
|
1799
|
+
idx = FIX2LONG(indx);
|
1800
|
+
|
1801
|
+
num_index:
|
1802
|
+
if (idx < 0) {
|
1803
|
+
idx = cp_len + idx;
|
1804
|
+
}
|
1805
|
+
if (idx < 0 || cp_len <= idx) {
|
1806
|
+
return Qnil;
|
1807
|
+
}
|
1808
|
+
return icu_ustr_substr(str, idx, 1);
|
1809
|
+
|
1810
|
+
case T_DATA:
|
1811
|
+
if (CLASS_OF(indx) == rb_cURegexp)
|
1812
|
+
return icu_ustr_subpat(str, indx, 0);
|
1813
|
+
if (CLASS_OF(indx) == rb_cUString) {
|
1814
|
+
if (icu_ustr_index(str, indx, 0) != -1)
|
1815
|
+
return icu_ustr_dup(indx);
|
1816
|
+
return Qnil;
|
1817
|
+
}
|
1818
|
+
|
1819
|
+
default:
|
1820
|
+
/*
|
1821
|
+
* check if indx is Range
|
1822
|
+
*/
|
1823
|
+
{
|
1824
|
+
long beg,
|
1825
|
+
len;
|
1826
|
+
switch (rb_range_beg_len(indx, &beg, &len, cp_len, 0)) {
|
1827
|
+
case Qfalse:
|
1828
|
+
break;
|
1829
|
+
case Qnil:
|
1830
|
+
return Qnil;
|
1831
|
+
default:
|
1832
|
+
return icu_ustr_substr(str, beg, len);
|
1833
|
+
}
|
1834
|
+
}
|
1835
|
+
idx = NUM2LONG(indx);
|
1836
|
+
goto num_index;
|
1837
|
+
}
|
1838
|
+
return Qnil; /* not reached */
|
1839
|
+
}
|
1840
|
+
|
1841
|
+
/**
|
1842
|
+
* call-seq:
|
1843
|
+
* str[fixnum] => new_str or nil
|
1844
|
+
* str[fixnum, fixnum] => new_str or nil
|
1845
|
+
* str[range] => new_str or nil
|
1846
|
+
* str[regexp] => new_str or nil
|
1847
|
+
* str[regexp, fixnum] => new_str or nil
|
1848
|
+
* str[other_str] => new_str or nil
|
1849
|
+
* str.slice(fixnum) => new_str or nil
|
1850
|
+
* str.slice(fixnum, fixnum) => new_str or nil
|
1851
|
+
* str.slice(range) => new_str or nil
|
1852
|
+
* str.slice(regexp) => new_str or nil
|
1853
|
+
* str.slice(regexp, fixnum) => new_str or nil
|
1854
|
+
* str.slice(other_str) => new_str or nil
|
1855
|
+
*
|
1856
|
+
* Element Reference---If passed a single <code>Fixnum</code>, returns
|
1857
|
+
* substring with the character at that position. If passed two <code>Fixnum</code>
|
1858
|
+
* objects, returns a substring starting at the offset given by the first, and
|
1859
|
+
* a length given by the second. If given a range, a substring containing
|
1860
|
+
* characters at offsets given by the range is returned. In all three cases, if
|
1861
|
+
* an offset is negative, it is counted from the end of <i>str</i>. Returns
|
1862
|
+
* <code>nil</code> if the initial offset falls outside the string, the length
|
1863
|
+
* is negative, or the beginning of the range is greater than the end.
|
1864
|
+
*
|
1865
|
+
* If a <code>URegexp</code> is supplied, the matching portion of <i>str</i> is
|
1866
|
+
* returned. If a numeric parameter follows the regular expression, that
|
1867
|
+
* component of the <code>UMatch</code> is returned instead. If a
|
1868
|
+
* <code>UString</code> is given, that string is returned if it occurs in
|
1869
|
+
* <i>str</i>. In both cases, <code>nil</code> is returned if there is no
|
1870
|
+
* match.
|
1871
|
+
*
|
1872
|
+
* a = "hello there".u
|
1873
|
+
* a[1] #=> 'e'
|
1874
|
+
* a[1,3] #=> "ell"
|
1875
|
+
* a[1..3] #=> "ell"
|
1876
|
+
* a[-3,2] #=> "er"
|
1877
|
+
* a[-4..-2] #=> "her"
|
1878
|
+
* a[12..-1] #=> nil
|
1879
|
+
* a[-2..-4] #=> ""
|
1880
|
+
* a[/[aeiou](.)\1/.U] #=> "ell"
|
1881
|
+
* a[/[aeiou](.)\1/.U, 0] #=> "ell"
|
1882
|
+
* a[/[aeiou](.)\1/.U, 1] #=> "l"
|
1883
|
+
* a[/[aeiou](.)\1/.U, 2] #=> nil
|
1884
|
+
* a["lo".u] #=> "lo"
|
1885
|
+
* a["bye".u] #=> nil
|
1886
|
+
*/
|
1887
|
+
|
1888
|
+
VALUE
|
1889
|
+
icu_ustr_aref_m(argc, argv, str)
|
1890
|
+
int argc;
|
1891
|
+
VALUE *argv;
|
1892
|
+
VALUE str;
|
1893
|
+
{
|
1894
|
+
if (argc == 2) {
|
1895
|
+
if (CLASS_OF(argv[0]) == rb_cURegexp) {
|
1896
|
+
return icu_ustr_subpat(str, argv[0], NUM2INT(argv[1]));
|
1897
|
+
}
|
1898
|
+
return icu_ustr_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
|
1899
|
+
}
|
1900
|
+
if (argc != 1) {
|
1901
|
+
rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)",
|
1902
|
+
argc);
|
1903
|
+
}
|
1904
|
+
return icu_ustr_aref(str, argv[0]);
|
1905
|
+
}
|
1906
|
+
|
1907
|
+
/**
|
1908
|
+
* call-seq:
|
1909
|
+
* str.sub!(pattern, replacement) => str or nil
|
1910
|
+
* str.sub!(pattern) {|match| block } => str or nil
|
1911
|
+
*
|
1912
|
+
* Performs the substitutions of <code>UString#sub</code> in place,
|
1913
|
+
* returning <i>str</i>, or <code>nil</code> if no substitutions were
|
1914
|
+
* performed.
|
1915
|
+
*/
|
1916
|
+
|
1917
|
+
VALUE
|
1918
|
+
icu_ustr_sub_bang(argc, argv, str)
|
1919
|
+
int argc;
|
1920
|
+
VALUE *argv;
|
1921
|
+
VALUE str;
|
1922
|
+
{
|
1923
|
+
return ustr_gsub(argc, argv, str, 1, 1 );
|
1924
|
+
}
|
1925
|
+
|
1926
|
+
|
1927
|
+
/**
|
1928
|
+
* call-seq:
|
1929
|
+
* str.sub(pattern, replacement) => new_str
|
1930
|
+
* str.sub(pattern) {|match| block } => new_str
|
1931
|
+
*
|
1932
|
+
* Returns a copy of <i>str</i> with the <em>first</em> occurrence of
|
1933
|
+
* <i>pattern</i> replaced with either <i>replacement</i> or the value of the
|
1934
|
+
* block. The <i>pattern</i> will typically be a <code>URegexp</code>; if it is
|
1935
|
+
* a <code>UString</code> then no regular expression metacharacters will be
|
1936
|
+
* interpreted (that is <code>/\d/.U</code> will match a digit, but
|
1937
|
+
* <code>'\d'</code> will match a backslash followed by a 'd').
|
1938
|
+
*
|
1939
|
+
* The sequences <code>$1</code>, <code>$2</code>, etc., may be used.
|
1940
|
+
*
|
1941
|
+
* In the block form, the current UMatch object is passed in as a parameter.
|
1942
|
+
* The value returned by the block will be substituted for the match on each call.
|
1943
|
+
*
|
1944
|
+
* "hello".u.sub(/[aeiou]/.U, '*'.u) #=> "h*llo"
|
1945
|
+
* "hello".u.sub(/([aeiou])/.U, '<$1>'.u) #=> "h<e>llo"
|
1946
|
+
*/
|
1947
|
+
|
1948
|
+
VALUE
|
1949
|
+
icu_ustr_sub(argc, argv, str)
|
1950
|
+
int argc;
|
1951
|
+
VALUE *argv;
|
1952
|
+
VALUE str;
|
1953
|
+
{
|
1954
|
+
str = icu_ustr_dup(str);
|
1955
|
+
icu_ustr_sub_bang(argc, argv, str);
|
1956
|
+
return str;
|
1957
|
+
}
|
1958
|
+
|
1959
|
+
/**
|
1960
|
+
* replace in string from +beg+ length +len+ (in code units)
|
1961
|
+
*/
|
1962
|
+
static void
|
1963
|
+
icu_ustr_splice(str, beg, len, val)
|
1964
|
+
VALUE str;
|
1965
|
+
long beg,
|
1966
|
+
len;
|
1967
|
+
VALUE val;
|
1968
|
+
{
|
1969
|
+
long char_len;
|
1970
|
+
Check_Class(val, rb_cUString);
|
1971
|
+
if (val == str) {
|
1972
|
+
val = icu_ustr_dup(str);
|
1973
|
+
}
|
1974
|
+
if (len < 0)
|
1975
|
+
rb_raise(rb_eIndexError, "negative length %ld", len);
|
1976
|
+
char_len = ICU_LEN(str);
|
1977
|
+
|
1978
|
+
if (char_len < beg) {
|
1979
|
+
out_of_range:
|
1980
|
+
rb_raise(rb_eIndexError, "index %ld out of string", beg);
|
1981
|
+
}
|
1982
|
+
if (beg < 0) {
|
1983
|
+
if (-beg > char_len) {
|
1984
|
+
goto out_of_range;
|
1985
|
+
}
|
1986
|
+
beg += char_len;
|
1987
|
+
}
|
1988
|
+
if (char_len < beg + len) {
|
1989
|
+
len = char_len - beg;
|
1990
|
+
}
|
1991
|
+
/* adjust to codepoint boundaries */
|
1992
|
+
U16_SET_CP_START(ICU_PTR(str), 0, beg);
|
1993
|
+
U16_SET_CP_LIMIT(ICU_PTR(str), 0, len, ICU_LEN(str));
|
1994
|
+
|
1995
|
+
ustr_splice_units(USTRING(str), beg, len, ICU_PTR(val), ICU_LEN(val));
|
1996
|
+
OBJ_INFECT(str, val);
|
1997
|
+
}
|
1998
|
+
|
1999
|
+
|
2000
|
+
/**
|
2001
|
+
* call-seq:
|
2002
|
+
* str.insert(index, other_str) => str
|
2003
|
+
*
|
2004
|
+
* Inserts <i>other_str</i> before the character at the given
|
2005
|
+
* <i>index</i>, modifying <i>str</i>. Negative indices count from the
|
2006
|
+
* end of the string, and insert <em>after</em> the given character.
|
2007
|
+
* The intent is insert <i>other_str</i> so that it starts at the given
|
2008
|
+
* <i>index</i>.
|
2009
|
+
*
|
2010
|
+
* "abcd".u.insert(0, 'X'.u) #=> "Xabcd"
|
2011
|
+
* "abcd".u.insert(3, 'X'.u) #=> "abcXd"
|
2012
|
+
* "abcd".u.insert(4, 'X'.u) #=> "abcdX"
|
2013
|
+
* "abcd".u.insert(-3, 'X'.u) #=> "abXcd"
|
2014
|
+
* "abcd".u.insert(-1, 'X'.u) #=> "abcdX"
|
2015
|
+
*/
|
2016
|
+
|
2017
|
+
VALUE
|
2018
|
+
icu_ustr_insert(str, idx, str2)
|
2019
|
+
VALUE str,
|
2020
|
+
idx,
|
2021
|
+
str2;
|
2022
|
+
{
|
2023
|
+
long pos = NUM2LONG(idx);
|
2024
|
+
icu_check_frozen(str);
|
2025
|
+
|
2026
|
+
if (pos == -1) {
|
2027
|
+
pos = NUM2LONG(icu_ustr_length(str));
|
2028
|
+
} else if (pos < 0) {
|
2029
|
+
pos++;
|
2030
|
+
}
|
2031
|
+
|
2032
|
+
icu_ustr_splice(str, pos, 0, str2);
|
2033
|
+
return str;
|
2034
|
+
}
|
2035
|
+
|
2036
|
+
/**
|
2037
|
+
* call-seq:
|
2038
|
+
* str.include? other_str => true or false
|
2039
|
+
*
|
2040
|
+
* Returns <code>true</code> if <i>str</i> contains the given string
|
2041
|
+
*
|
2042
|
+
* "hello".u.include? "lo".u #=> true
|
2043
|
+
* "hello".u.include? "ol".u #=> false
|
2044
|
+
*/
|
2045
|
+
|
2046
|
+
VALUE
|
2047
|
+
icu_ustr_include(str, arg)
|
2048
|
+
VALUE str,
|
2049
|
+
arg;
|
2050
|
+
{
|
2051
|
+
long i;
|
2052
|
+
i = icu_ustr_index(str, arg, 0);
|
2053
|
+
if (i == -1)
|
2054
|
+
return Qfalse;
|
2055
|
+
return Qtrue;
|
2056
|
+
}
|
2057
|
+
|
2058
|
+
static void
|
2059
|
+
icu_ustr_subpat_set(str, re, nth, val)
|
2060
|
+
VALUE str,
|
2061
|
+
re;
|
2062
|
+
int nth;
|
2063
|
+
VALUE val;
|
2064
|
+
{
|
2065
|
+
long start,
|
2066
|
+
end,
|
2067
|
+
len;
|
2068
|
+
VALUE matched;
|
2069
|
+
|
2070
|
+
if (icu_reg_search(re, str, 0, 0) < 0) {
|
2071
|
+
rb_raise(rb_eIndexError, "regexp not matched");
|
2072
|
+
}
|
2073
|
+
matched = icu_reg_range(re, nth, &start, &end);
|
2074
|
+
if (NIL_P(matched)) {
|
2075
|
+
rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
|
2076
|
+
}
|
2077
|
+
len = end - start;
|
2078
|
+
/* adjust to codepoint boundaries */
|
2079
|
+
U16_SET_CP_START(ICU_PTR(str), 0, start);
|
2080
|
+
U16_SET_CP_LIMIT(ICU_PTR(str), 0, len, ICU_LEN(str));
|
2081
|
+
|
2082
|
+
ustr_splice_units(USTRING(str), start, len, ICU_PTR(val), ICU_LEN(val));
|
2083
|
+
}
|
2084
|
+
|
2085
|
+
VALUE
|
2086
|
+
icu_ustr_aset(str, indx, val)
|
2087
|
+
VALUE str;
|
2088
|
+
VALUE indx,
|
2089
|
+
val;
|
2090
|
+
{
|
2091
|
+
long idx,
|
2092
|
+
beg;
|
2093
|
+
long char_len = ICU_LEN(str);
|
2094
|
+
|
2095
|
+
switch (TYPE(indx)) {
|
2096
|
+
case T_FIXNUM:
|
2097
|
+
num_index:
|
2098
|
+
idx = FIX2LONG(indx);
|
2099
|
+
if (char_len <= idx) {
|
2100
|
+
out_of_range:
|
2101
|
+
rb_raise(rb_eIndexError, "index %ld out of string", idx);
|
2102
|
+
}
|
2103
|
+
if (idx < 0) {
|
2104
|
+
if (-idx > char_len)
|
2105
|
+
goto out_of_range;
|
2106
|
+
idx += char_len;
|
2107
|
+
}
|
2108
|
+
icu_ustr_splice(str, idx, 1, val);
|
2109
|
+
return val;
|
2110
|
+
|
2111
|
+
case T_DATA:
|
2112
|
+
if (CLASS_OF(indx) == rb_cURegexp) {
|
2113
|
+
icu_ustr_subpat_set(str, indx, 0, val);
|
2114
|
+
return val;
|
2115
|
+
}
|
2116
|
+
if (CLASS_OF(indx) == rb_cUString) {
|
2117
|
+
beg = icu_ustr_index(str, indx, 0);
|
2118
|
+
if (beg < 0) {
|
2119
|
+
rb_raise(rb_eIndexError, "string not matched");
|
2120
|
+
}
|
2121
|
+
ustr_splice_units(USTRING(str), beg, ICU_LEN(indx), ICU_PTR(val), ICU_LEN(val));
|
2122
|
+
return val;
|
2123
|
+
}
|
2124
|
+
default:
|
2125
|
+
/*
|
2126
|
+
* check if indx is Range
|
2127
|
+
*/
|
2128
|
+
{
|
2129
|
+
long beg,
|
2130
|
+
len;
|
2131
|
+
if (rb_range_beg_len(indx, &beg, &len, char_len, 2)) {
|
2132
|
+
icu_ustr_splice(str, beg, len, val);
|
2133
|
+
return val;
|
2134
|
+
}
|
2135
|
+
}
|
2136
|
+
idx = NUM2LONG(indx);
|
2137
|
+
goto num_index;
|
2138
|
+
}
|
2139
|
+
}
|
2140
|
+
|
2141
|
+
|
2142
|
+
/**
|
2143
|
+
* call-seq:
|
2144
|
+
* str[fixnum] = new_str
|
2145
|
+
* str[fixnum, fixnum] = new_str
|
2146
|
+
* str[range] = new_str
|
2147
|
+
* str[regexp] = new_str
|
2148
|
+
* str[regexp, fixnum] = new_str
|
2149
|
+
* str[other_str] = new_str
|
2150
|
+
*
|
2151
|
+
* Element Assignment---Replaces some or all of the content of <i>str</i>. The
|
2152
|
+
* portion of the string affected is determined using the same criteria as
|
2153
|
+
* <code>UString#[]</code>. If the replacement string is not the same length as
|
2154
|
+
* the text it is replacing, the string will be adjusted accordingly. If the
|
2155
|
+
* regular expression or string used as the index doesn't match a position
|
2156
|
+
* in the string, <code>IndexError</code> is raised. If the regular expression
|
2157
|
+
* form is used, the optional second <code>Fixnum</code> allows you to specify
|
2158
|
+
* which portion of the match to replace (effectively using the
|
2159
|
+
* <code>UMatch</code> indexing rules). The forms that take a
|
2160
|
+
* <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
|
2161
|
+
* out of range; the <code>Range</code> form will raise a
|
2162
|
+
* <code>RangeError</code>, and the <code>URegexp</code> and <code>UString</code>
|
2163
|
+
* forms will silently ignore the assignment.
|
2164
|
+
*/
|
2165
|
+
|
2166
|
+
VALUE
|
2167
|
+
icu_ustr_aset_m(argc, argv, str)
|
2168
|
+
int argc;
|
2169
|
+
VALUE *argv;
|
2170
|
+
VALUE str;
|
2171
|
+
{
|
2172
|
+
icu_check_frozen(str);
|
2173
|
+
if (argc == 3) {
|
2174
|
+
if (CLASS_OF(argv[0]) == rb_cURegexp) {
|
2175
|
+
icu_ustr_subpat_set(str, argv[0], NUM2INT(argv[1]), argv[2]);
|
2176
|
+
} else {
|
2177
|
+
icu_ustr_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]),
|
2178
|
+
argv[2]);
|
2179
|
+
}
|
2180
|
+
return argv[2];
|
2181
|
+
}
|
2182
|
+
if (argc != 2) {
|
2183
|
+
rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)",
|
2184
|
+
argc);
|
2185
|
+
}
|
2186
|
+
return icu_ustr_aset(str, argv[0], argv[1]);
|
2187
|
+
}
|
2188
|
+
|
2189
|
+
/**
|
2190
|
+
* call-seq:
|
2191
|
+
* str.slice!(fixnum) => new_str or nil
|
2192
|
+
* str.slice!(fixnum, fixnum) => new_str or nil
|
2193
|
+
* str.slice!(range) => new_str or nil
|
2194
|
+
* str.slice!(regexp) => new_str or nil
|
2195
|
+
* str.slice!(other_str) => new_str or nil
|
2196
|
+
*
|
2197
|
+
* Deletes the specified portion from <i>str</i>, and returns the portion
|
2198
|
+
* deleted. The forms that take a <code>Fixnum</code> will raise an
|
2199
|
+
* <code>IndexError</code> if the value is out of range; the <code>Range</code>
|
2200
|
+
* form will raise a <code>RangeError</code>, and the <code>URegexp</code> and
|
2201
|
+
* <code>UString</code> forms will silently ignore the assignment.
|
2202
|
+
*
|
2203
|
+
* string = "this is a string".u
|
2204
|
+
* string.slice!(2) #=> 105
|
2205
|
+
* string.slice!(3..6) #=> " is "
|
2206
|
+
* string.slice!(/s.*t/.U) #=> "sa st"
|
2207
|
+
* string.slice!("r".u) #=> "r"
|
2208
|
+
* string #=> "thing"
|
2209
|
+
*/
|
2210
|
+
|
2211
|
+
VALUE
|
2212
|
+
icu_ustr_slice_bang(argc, argv, str)
|
2213
|
+
int argc;
|
2214
|
+
VALUE *argv;
|
2215
|
+
VALUE str;
|
2216
|
+
{
|
2217
|
+
VALUE result;
|
2218
|
+
VALUE buf[3];
|
2219
|
+
int i;
|
2220
|
+
icu_check_frozen(str);
|
2221
|
+
if (argc < 1 || 2 < argc) {
|
2222
|
+
rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)",
|
2223
|
+
argc);
|
2224
|
+
}
|
2225
|
+
for (i = 0; i < argc; i++) {
|
2226
|
+
buf[i] = argv[i];
|
2227
|
+
}
|
2228
|
+
buf[i] = icu_ustr_new(0, 0);
|
2229
|
+
result = icu_ustr_aref_m(argc, buf, str);
|
2230
|
+
if (!NIL_P(result)) {
|
2231
|
+
icu_ustr_aset_m(argc + 1, buf, str);
|
2232
|
+
}
|
2233
|
+
return result;
|
2234
|
+
}
|
2235
|
+
|
2236
|
+
VALUE
|
2237
|
+
ustr_gsub(argc, argv, str, bang, once)
|
2238
|
+
int argc;
|
2239
|
+
VALUE *argv;
|
2240
|
+
VALUE str;
|
2241
|
+
int bang;
|
2242
|
+
int once;
|
2243
|
+
{
|
2244
|
+
VALUE pat,
|
2245
|
+
repl;
|
2246
|
+
long beg,
|
2247
|
+
end,
|
2248
|
+
prev_end;
|
2249
|
+
int tainted = 0,
|
2250
|
+
iter = 0;
|
2251
|
+
VALUE buf, curr_repl, umatch, block_res;
|
2252
|
+
if (argc == 1 && rb_block_given_p()) {
|
2253
|
+
iter = 1;
|
2254
|
+
} else if (argc == 2) {
|
2255
|
+
repl = argv[1];
|
2256
|
+
Check_Class(repl, rb_cUString);
|
2257
|
+
if (OBJ_TAINTED(repl))
|
2258
|
+
tainted = 1;
|
2259
|
+
} else {
|
2260
|
+
rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)",
|
2261
|
+
argc);
|
2262
|
+
}
|
2263
|
+
|
2264
|
+
pat = get_pat(argv[0], 1);
|
2265
|
+
beg = icu_reg_search(pat, str, 0, 0);
|
2266
|
+
|
2267
|
+
if (beg < 0) {
|
2268
|
+
/* no match */
|
2269
|
+
if (bang)
|
2270
|
+
return Qnil;
|
2271
|
+
return icu_ustr_dup(str);
|
2272
|
+
}
|
2273
|
+
end = 0;
|
2274
|
+
icu_check_frozen(str);
|
2275
|
+
USTRING(str)->busy = 1;
|
2276
|
+
buf = icu_ustr_new(0, 0);
|
2277
|
+
pat = icu_reg_clone(pat);
|
2278
|
+
if(rb_block_given_p()) iter = 1;
|
2279
|
+
do {
|
2280
|
+
|
2281
|
+
prev_end = end;
|
2282
|
+
icu_reg_range(pat, 0, &beg, &end);
|
2283
|
+
icu_ustr_concat(buf, icu_reg_get_prematch(pat, prev_end));
|
2284
|
+
if ( iter ) {
|
2285
|
+
UChar * ptr = ICU_PTR(str);
|
2286
|
+
long o_len = ICU_LEN(str);
|
2287
|
+
umatch = icu_umatch_new(pat);
|
2288
|
+
block_res = rb_yield(umatch);
|
2289
|
+
if (CLASS_OF(block_res) == rb_cUString)
|
2290
|
+
curr_repl = block_res;
|
2291
|
+
else if (CLASS_OF(block_res) == rb_cUMatch)
|
2292
|
+
curr_repl = icu_umatch_aref(block_res, INT2FIX(0));
|
2293
|
+
else
|
2294
|
+
curr_repl =
|
2295
|
+
icu_from_rstr(0, NULL, rb_obj_as_string(block_res));
|
2296
|
+
ustr_mod_check(str, ptr, o_len);
|
2297
|
+
} else {
|
2298
|
+
curr_repl = icu_reg_get_replacement(pat, repl, prev_end);
|
2299
|
+
}
|
2300
|
+
icu_ustr_concat(buf, curr_repl);
|
2301
|
+
}
|
2302
|
+
while (icu_reg_find_next(pat) && !once);
|
2303
|
+
icu_ustr_concat(buf, icu_reg_get_tail(pat, end));
|
2304
|
+
USTRING(str)->busy = 0;
|
2305
|
+
if (bang) {
|
2306
|
+
icu_ustr_replace(str, buf);
|
2307
|
+
return str;
|
2308
|
+
} else {
|
2309
|
+
return buf;
|
2310
|
+
}
|
2311
|
+
}
|
2312
|
+
|
2313
|
+
/**
|
2314
|
+
* call-seq:
|
2315
|
+
* str.gsub!(pattern, replacement) => str or nil
|
2316
|
+
* str.gsub!(pattern) {|match| block } => str or nil
|
2317
|
+
*
|
2318
|
+
* Performs the substitutions of <code>UString#gsub</code> in place, returning
|
2319
|
+
* <i>str</i>, or <code>nil</code> if no substitutions were performed.
|
2320
|
+
*/
|
2321
|
+
|
2322
|
+
VALUE
|
2323
|
+
icu_ustr_gsub_bang(argc, argv, str)
|
2324
|
+
int argc;
|
2325
|
+
VALUE *argv;
|
2326
|
+
VALUE str;
|
2327
|
+
{
|
2328
|
+
icu_check_frozen(str);
|
2329
|
+
return ustr_gsub(argc, argv, str, 1, 0);
|
2330
|
+
}
|
2331
|
+
|
2332
|
+
|
2333
|
+
/**
|
2334
|
+
* call-seq:
|
2335
|
+
* str.gsub(pattern, replacement) => new_str
|
2336
|
+
* str.gsub(pattern) {|match| block } => new_str
|
2337
|
+
*
|
2338
|
+
* Returns a copy of <i>str</i> with <em>all</em> occurrences of <i>pattern</i>
|
2339
|
+
* replaced with either <i>replacement</i> or the value of the block. The
|
2340
|
+
* <i>pattern</i> will typically be a <code>URegexp</code>; if it is a
|
2341
|
+
* <code>UString</code> then no regular expression metacharacters will be
|
2342
|
+
* interpreted (that is <code>/\d/</code> will match a digit, but
|
2343
|
+
* <code>'\d'</code> will match a backslash followed by a 'd').
|
2344
|
+
*
|
2345
|
+
* If a string is used as the replacement, the sequences <code>$1</code>, <code>$2</code>, and so on
|
2346
|
+
* may be used to interpolate successive groups in the match.
|
2347
|
+
*
|
2348
|
+
* In the block form, the current UMatch object is passed in as a parameter. The value
|
2349
|
+
* returned by the block will be substituted for the match on each call.
|
2350
|
+
*
|
2351
|
+
* "hello".gsub(/[aeiou]/.U, '*') #=> "h*ll*"
|
2352
|
+
* "hello".gsub(/([aeiou])/.U, '<$1>') #=> "h<e>ll<o>"
|
2353
|
+
*/
|
2354
|
+
|
2355
|
+
VALUE
|
2356
|
+
icu_ustr_gsub(argc, argv, str)
|
2357
|
+
int argc;
|
2358
|
+
VALUE *argv;
|
2359
|
+
VALUE str;
|
2360
|
+
{
|
2361
|
+
return ustr_gsub(argc, argv, str, 0, 0);
|
2362
|
+
}
|
2363
|
+
|
2364
|
+
|
2365
|
+
/*-------------*/
|
2366
|
+
/* parsing */
|
2367
|
+
extern VALUE icu_date_parse(UChar * str, int32_t str_len, char * locale, UChar * val, int32_t len);
|
2368
|
+
|
2369
|
+
/**
|
2370
|
+
* call-seq:
|
2371
|
+
* str.parse_date( locale, value)
|
2372
|
+
*
|
2373
|
+
* Parses given value, using +str+ as format pattern with respect to +locale+.
|
2374
|
+
*
|
2375
|
+
* "HH:mm:ss E dd/MM/yyyy".u.parse_date("en", "20:15:01 Fri 13/01/2006".u) # => Time.local(2006,"jan",13,20,15,1)
|
2376
|
+
*
|
2377
|
+
*/
|
2378
|
+
|
2379
|
+
VALUE
|
2380
|
+
icu_ustr_parse_date( str, locale, val)
|
2381
|
+
VALUE str, locale, val;
|
2382
|
+
{
|
2383
|
+
Check_Type(locale, T_STRING);
|
2384
|
+
Check_Class(val, rb_cUString);
|
2385
|
+
return icu_date_parse(ICU_PTR(str), ICU_LEN(str), RSTRING(locale)->ptr, ICU_PTR(val), ICU_LEN(val));
|
2386
|
+
}
|
2387
|
+
|
2388
|
+
/**
|
2389
|
+
* call-seq:
|
2390
|
+
* str.to_f( locale = "",[format_pattern]) => aFloat
|
2391
|
+
*
|
2392
|
+
* Parses string as double value, with respect to +locale+ and format pattern,
|
2393
|
+
* if they are provided.
|
2394
|
+
*
|
2395
|
+
* "456".u.to_f # => 456.0
|
2396
|
+
* "123,001".u.to_f("ru") # => 123.001
|
2397
|
+
* "123,001".u.to_f("en") # => 123001.0
|
2398
|
+
* "Got 123,001".u.to_f("en", "Got ###,###".u) # => 123001
|
2399
|
+
*/
|
2400
|
+
|
2401
|
+
VALUE
|
2402
|
+
icu_ustr_parse_double( int argc, VALUE * argv, VALUE str)
|
2403
|
+
{
|
2404
|
+
UParseError error;
|
2405
|
+
UErrorCode status = U_ZERO_ERROR;
|
2406
|
+
UNumberFormat * format = NULL;
|
2407
|
+
VALUE loc, pattern;
|
2408
|
+
char * locale;
|
2409
|
+
double value;
|
2410
|
+
int32_t pos, n;
|
2411
|
+
|
2412
|
+
n = rb_scan_args(argc, argv, "02", &loc, &pattern) ;
|
2413
|
+
if( n == 2) {
|
2414
|
+
Check_Class(pattern, rb_cUString);
|
2415
|
+
} else pattern = Qnil;
|
2416
|
+
|
2417
|
+
if (n > 0) {
|
2418
|
+
Check_Type(loc, T_STRING);
|
2419
|
+
locale = RSTRING(loc)->ptr;
|
2420
|
+
} else locale = NULL;
|
2421
|
+
|
2422
|
+
if( pattern != Qnil ) {
|
2423
|
+
format = unum_open(UNUM_PATTERN_DECIMAL, ICU_PTR(pattern), ICU_LEN(pattern), locale,
|
2424
|
+
&error, &status);
|
2425
|
+
} else {
|
2426
|
+
format = unum_open(UNUM_DECIMAL, NULL, 0, locale,&error, &status);
|
2427
|
+
}
|
2428
|
+
if (U_FAILURE(status) ) rb_raise(rb_eArgError, "can't open format %s", u_errorName(status));
|
2429
|
+
pos = 0;
|
2430
|
+
value = unum_parseDouble(format, ICU_PTR(str), ICU_LEN(str), &pos, &status);
|
2431
|
+
unum_close(format);
|
2432
|
+
if (U_FAILURE(status) ) rb_raise(rb_eArgError, "can't parse %s at %d", u_errorName(status), pos);
|
2433
|
+
return rb_float_new(value);
|
2434
|
+
}
|
2435
|
+
|
2436
|
+
/**
|
2437
|
+
* call-seq:
|
2438
|
+
* UString::strcoll(str1, str2 ) => Fixnum
|
2439
|
+
* UString::strcoll(str1, str2 , locale) => Fixnum
|
2440
|
+
* UString::strcoll(str1, str2 , locale, strength) => Fixnum
|
2441
|
+
*
|
2442
|
+
* Performs locale-sensitive string comparison.
|
2443
|
+
* Special values for locales can be passed in - if +nil+ is passed for the locale,
|
2444
|
+
* the default locale collation rules will be used. If empty string ("") or "root" are
|
2445
|
+
* passed, UCA rules will be used.
|
2446
|
+
*
|
2447
|
+
* Strength must be a fixnum that set collation strength:
|
2448
|
+
* -1 is default, 0 - primary, 1 - secondary, 2 - tertiary.
|
2449
|
+
* E.g., pass 0 to ignore case and accents, 1 - to ignore case only.
|
2450
|
+
**/
|
2451
|
+
VALUE
|
2452
|
+
icu_ustr_coll(argc, argv, self)
|
2453
|
+
int argc;
|
2454
|
+
VALUE *argv;
|
2455
|
+
VALUE self;
|
2456
|
+
{
|
2457
|
+
UErrorCode status = 0 ;
|
2458
|
+
UCollator * collator = 0;
|
2459
|
+
int result;
|
2460
|
+
VALUE ret = Qnil;
|
2461
|
+
VALUE str1, str2, loc, strength = Qnil;
|
2462
|
+
char * locale = NULL;
|
2463
|
+
int n ;
|
2464
|
+
n = rb_scan_args(argc, argv, "22", &str1, &str2, &loc, &strength);
|
2465
|
+
if ( n == 3) {
|
2466
|
+
if( loc != Qnil) {
|
2467
|
+
Check_Type(loc, T_STRING);
|
2468
|
+
locale = RSTRING(loc)->ptr;
|
2469
|
+
}
|
2470
|
+
}
|
2471
|
+
Check_Class(str1, rb_cUString);
|
2472
|
+
Check_Class(str2, rb_cUString);
|
2473
|
+
collator = ucol_open(locale, &status);
|
2474
|
+
if( U_FAILURE(status) )
|
2475
|
+
{
|
2476
|
+
rb_raise(rb_eArgError, u_errorName(status));
|
2477
|
+
}
|
2478
|
+
if( n == 4 ){
|
2479
|
+
Check_Type(strength, T_FIXNUM);
|
2480
|
+
ucol_setStrength(collator, NUM2INT(strength));
|
2481
|
+
}
|
2482
|
+
result = ucol_strcoll(collator, ICU_PTR(str1), ICU_LEN(str1), ICU_PTR(str2), ICU_LEN(str2));
|
2483
|
+
|
2484
|
+
switch(result){
|
2485
|
+
case UCOL_EQUAL: ret = INT2FIX(0);break;
|
2486
|
+
case UCOL_GREATER: ret = INT2FIX(1);break;
|
2487
|
+
case UCOL_LESS: ret = INT2FIX(-1);break;
|
2488
|
+
}
|
2489
|
+
ucol_close(collator);
|
2490
|
+
return ret;
|
2491
|
+
}
|
2492
|
+
|
2493
|
+
/**
|
2494
|
+
* call-seq:
|
2495
|
+
* UString::list_coll => anArray
|
2496
|
+
*
|
2497
|
+
* Returns array of available collator locales, to be used in UString#strcoll
|
2498
|
+
* */
|
2499
|
+
VALUE icu_ustr_list_coll(str)
|
2500
|
+
VALUE str;
|
2501
|
+
{
|
2502
|
+
int32_t i, n =ucol_countAvailable();
|
2503
|
+
VALUE ret = rb_ary_new();
|
2504
|
+
for( i = 0; i<n; i++) {
|
2505
|
+
rb_ary_push(ret, rb_str_new2(ucol_getAvailable(i)));
|
2506
|
+
}
|
2507
|
+
return ret;
|
2508
|
+
}
|
2509
|
+
|
2510
|
+
/**
|
2511
|
+
* call-seq:
|
2512
|
+
* UString::list_locales => anArray
|
2513
|
+
*
|
2514
|
+
* Returns array of available locales.
|
2515
|
+
* */
|
2516
|
+
VALUE icu_ustr_list_locales(str)
|
2517
|
+
VALUE str;
|
2518
|
+
{
|
2519
|
+
int32_t i, n =uloc_countAvailable();
|
2520
|
+
VALUE ret = rb_ary_new();
|
2521
|
+
for( i = 0; i<n; i++) {
|
2522
|
+
rb_ary_push(ret, rb_str_new2(uloc_getAvailable(i)));
|
2523
|
+
}
|
2524
|
+
return ret;
|
2525
|
+
}
|
2526
|
+
/**
|
2527
|
+
* call-seq:
|
2528
|
+
* UString::list_translits => anArray
|
2529
|
+
*
|
2530
|
+
* Returns array of available translits.
|
2531
|
+
* */
|
2532
|
+
VALUE icu_ustr_list_translits(str)
|
2533
|
+
VALUE str;
|
2534
|
+
{
|
2535
|
+
UErrorCode status = U_ZERO_ERROR;
|
2536
|
+
UEnumeration * ids ;
|
2537
|
+
VALUE ret ;
|
2538
|
+
UChar * name;
|
2539
|
+
int32_t len;
|
2540
|
+
ids = utrans_openIDs (&status);
|
2541
|
+
ICU_RAISE(status);
|
2542
|
+
ret = rb_ary_new();
|
2543
|
+
while( (name = (UChar*)uenum_unext(ids, &len, &status))) {
|
2544
|
+
rb_ary_push(ret, icu_ustr_new(name, len));
|
2545
|
+
}
|
2546
|
+
uenum_close(ids);
|
2547
|
+
return ret;
|
2548
|
+
|
2549
|
+
}
|
2550
|
+
/**
|
2551
|
+
* call-seq:
|
2552
|
+
* str.search(pattern, options = {})
|
2553
|
+
*
|
2554
|
+
* Searches for match in string. Returns array of +Range+
|
2555
|
+
* corresponding to position where pattern is matched.
|
2556
|
+
*
|
2557
|
+
* Valid options are:
|
2558
|
+
* :locale -- locale, +String+, value e.g. "en", "ru_RU"
|
2559
|
+
* :ignore_case -- whether to ignore case, valid values are +true+ or +false+, default to +false+
|
2560
|
+
* :ignore_case_accents -- sets collator options to strength +0+ - primary difference, e.g. ignore case and accents,
|
2561
|
+
* overrides :ignore_case: option, default to +false+,
|
2562
|
+
* :loosely -- same as :ignore_case_accents
|
2563
|
+
* :limit -- Fixnum limit of match positions to return.
|
2564
|
+
* :whole_words -- whether to match whole words only
|
2565
|
+
* :canonical -- use canonical equivalence
|
2566
|
+
*
|
2567
|
+
*
|
2568
|
+
* a = "A quick brown fox jumped over the lazy fox dancing foxtrote".u
|
2569
|
+
* a.search("fox".u) # => [14..16, 39..41, 51..53]
|
2570
|
+
* a.search("FoX".u) # => []
|
2571
|
+
* a.search("FoX".u, :ignore_case => true) # => [14..16, 39..41, 51..53]
|
2572
|
+
* a.search("FoX".u, :ignore_case => true, :whole_words => true) # => [14..16, 39..41]
|
2573
|
+
* a.search("FoX".u, :ignore_case => true, :whole_words => true, :limit => 1) # => [14..16]
|
2574
|
+
*
|
2575
|
+
* b = "Iñtërnâtiônàlizætiøn îs cọmpłèx".u.upcase # => IÑTËRNÂTIÔNÀLIZÆTIØN ÎS CỌMPŁÈX
|
2576
|
+
* b.search("nâtiôn".u, :locale => "en") # => []
|
2577
|
+
* b.search("nation".u) # => []
|
2578
|
+
* b.search("nation".u, :locale => "en", :ignore_case_accents => true) # => [5..10]
|
2579
|
+
* b.search("nâtiôn".u, :locale => "en", :ignore_case => true) # => [5..10]
|
2580
|
+
* b.search("zaeti".u, :locale => "en" ) # => []
|
2581
|
+
* b.search("zaeti".u, :locale => "en", :ignore_case => true) # => []
|
2582
|
+
* b.search("zaeti".u, :locale => "en", :ignore_case_accents => true) # => [14..17]
|
2583
|
+
*
|
2584
|
+
* v = [?a, 0x0325, 0x0300].to_u # => ḁ̀
|
2585
|
+
* v.search([?a, 0x300].to_u, :canonical => true) # => [0..2]
|
2586
|
+
* v.search([?a, 0x300].to_u) # => []
|
2587
|
+
**/
|
2588
|
+
|
2589
|
+
VALUE icu_ustr_search(argc, argv, str)
|
2590
|
+
int argc;
|
2591
|
+
VALUE *argv;
|
2592
|
+
VALUE str;
|
2593
|
+
|
2594
|
+
{
|
2595
|
+
UErrorCode status = U_ZERO_ERROR;
|
2596
|
+
UStringSearch * search = 0 ;
|
2597
|
+
VALUE pat, locale , limit, options;
|
2598
|
+
int lim = -1, count = 0 ;
|
2599
|
+
int32_t start, len;
|
2600
|
+
VALUE ret = rb_ary_new();
|
2601
|
+
UCollator * collator = 0;
|
2602
|
+
UBreakIterator * brkit = 0;
|
2603
|
+
char * loc = 0;
|
2604
|
+
if ( rb_scan_args(argc, argv, "11", &pat, &options) == 2 ) {
|
2605
|
+
Check_Type(options, T_HASH);
|
2606
|
+
} else {
|
2607
|
+
options = Qnil;
|
2608
|
+
}
|
2609
|
+
|
2610
|
+
Check_Class(pat, rb_cUString);
|
2611
|
+
locale = options == Qnil ? Qnil : rb_hash_aref(options, ID2SYM(rb_intern("locale")));
|
2612
|
+
|
2613
|
+
if( locale != Qnil ) {
|
2614
|
+
Check_Type(locale, T_STRING);
|
2615
|
+
loc = RSTRING(locale) -> ptr;
|
2616
|
+
}
|
2617
|
+
limit = options == Qnil ? Qnil : rb_hash_aref(options, ID2SYM(rb_intern("limit")));
|
2618
|
+
|
2619
|
+
if(TYPE(limit) == T_FIXNUM) {
|
2620
|
+
lim = FIX2INT(limit);
|
2621
|
+
if(lim <= 0) {
|
2622
|
+
rb_raise(rb_eTypeError, "Limit must be positive or nil, got: %d", lim);
|
2623
|
+
}
|
2624
|
+
}
|
2625
|
+
else
|
2626
|
+
if (limit!=Qnil)
|
2627
|
+
rb_raise(rb_eArgError, "Limit must be Fixnum, got %s", rb_class2name(CLASS_OF(limit)));
|
2628
|
+
|
2629
|
+
collator = ucol_open(loc, &status);
|
2630
|
+
ucol_setStrength(collator, -1);
|
2631
|
+
|
2632
|
+
if( options != Qnil && Qtrue == rb_hash_aref( options, ID2SYM(rb_intern("whole_words"))) )
|
2633
|
+
brkit = ubrk_open(UBRK_WORD, loc, ICU_PTR(str), ICU_LEN(str), &status);
|
2634
|
+
|
2635
|
+
if( options != Qnil && Qtrue == rb_hash_aref( options, ID2SYM(rb_intern("ignore_case"))) )
|
2636
|
+
ucol_setStrength(collator, UCOL_SECONDARY);
|
2637
|
+
|
2638
|
+
if( options != Qnil &&
|
2639
|
+
( Qtrue == rb_hash_aref( options, ID2SYM(rb_intern("ignore_case_accents")) )
|
2640
|
+
|| Qtrue == rb_hash_aref( options, ID2SYM(rb_intern("loosely")) )
|
2641
|
+
)
|
2642
|
+
)
|
2643
|
+
ucol_setStrength(collator, UCOL_PRIMARY );
|
2644
|
+
|
2645
|
+
|
2646
|
+
search = usearch_openFromCollator(ICU_PTR(pat), ICU_LEN(pat),
|
2647
|
+
ICU_PTR(str), ICU_LEN(str),
|
2648
|
+
collator, brkit, &status);
|
2649
|
+
|
2650
|
+
if( options != Qnil && Qtrue == rb_hash_aref( options, ID2SYM(rb_intern("canonical"))) )
|
2651
|
+
usearch_setAttribute(search, USEARCH_CANONICAL_MATCH, USEARCH_ON, &status);
|
2652
|
+
|
2653
|
+
if( U_FAILURE(status) ) goto failure;
|
2654
|
+
|
2655
|
+
status = U_ZERO_ERROR;
|
2656
|
+
if( usearch_first(search, &status) == USEARCH_DONE) {
|
2657
|
+
usearch_close(search);
|
2658
|
+
ucol_close(collator);
|
2659
|
+
ubrk_close(brkit);
|
2660
|
+
return ret;
|
2661
|
+
}
|
2662
|
+
|
2663
|
+
do {
|
2664
|
+
if( U_FAILURE(status) ) goto failure;
|
2665
|
+
|
2666
|
+
start = usearch_getMatchedStart(search);
|
2667
|
+
len = usearch_getMatchedLength(search);
|
2668
|
+
rb_ary_push(ret, rb_range_new(LONG2NUM(start), LONG2NUM(start+len-1), 0));
|
2669
|
+
|
2670
|
+
status = U_ZERO_ERROR;
|
2671
|
+
count += 1;
|
2672
|
+
if (lim > 0 && count >= lim) break;
|
2673
|
+
} while (USEARCH_DONE != usearch_next(search, &status));
|
2674
|
+
usearch_close( search);
|
2675
|
+
ucol_close(collator);
|
2676
|
+
ubrk_close(brkit);
|
2677
|
+
return ret;
|
2678
|
+
|
2679
|
+
failure:
|
2680
|
+
usearch_close( search);
|
2681
|
+
ucol_close(collator);
|
2682
|
+
ubrk_close(brkit);
|
2683
|
+
|
2684
|
+
rb_raise(rb_eArgError, u_errorName(status));
|
2685
|
+
return Qnil;
|
2686
|
+
}
|
2687
|
+
/**
|
2688
|
+
* call-seq:
|
2689
|
+
* str.conv_unit_range(unit_range) => code_point_range
|
2690
|
+
*
|
2691
|
+
* Converts <b>code unit</b> range to <b>code point</b> range.
|
2692
|
+
* If your chars don't use multiple UTF-16 code units, range will be the same.
|
2693
|
+
*/
|
2694
|
+
VALUE icu_ustr_convert_unit_range(str, range)
|
2695
|
+
VALUE str, range;
|
2696
|
+
{
|
2697
|
+
long cu_start, cu_len, cur_pos, cp_len ;
|
2698
|
+
if( rb_range_beg_len(range, &cu_start, &cu_len, ICU_LEN(str), 0) != Qtrue)
|
2699
|
+
return Qnil;
|
2700
|
+
|
2701
|
+
cur_pos = u_countChar32( ICU_PTR(str), cu_start );
|
2702
|
+
if( cu_start+cu_len > ICU_LEN(str)) --cu_len;
|
2703
|
+
cp_len = u_countChar32( ICU_PTR(str) + cu_start , cu_len);
|
2704
|
+
return rb_range_new(LONG2NUM(cur_pos), LONG2NUM(cur_pos + cp_len-1), 0);
|
2705
|
+
}
|
2706
|
+
/**
|
2707
|
+
* call-seq:
|
2708
|
+
* str.conv_point_range(point_range) => code_unit_range
|
2709
|
+
*
|
2710
|
+
* Converts <b>code point</b> range to <b>code unit</b> range.
|
2711
|
+
* (inversion of #conv_unit_range)
|
2712
|
+
* If your chars don't use multiple UTF-16 code units, range will be the same.
|
2713
|
+
*/
|
2714
|
+
VALUE icu_ustr_convert_point_range(str, range)
|
2715
|
+
VALUE str, range;
|
2716
|
+
{
|
2717
|
+
long cp_start, cu_start, cu_end, cp_len, str_cp_len;
|
2718
|
+
str_cp_len = u_countChar32( ICU_PTR(str), ICU_LEN(str));
|
2719
|
+
if( Qtrue != rb_range_beg_len(range, &cp_start, &cp_len, str_cp_len, 0) ) return Qnil;
|
2720
|
+
|
2721
|
+
cu_start = 0;
|
2722
|
+
U16_FWD_N(ICU_PTR(str), cu_start, ICU_LEN(str), cp_start); /* care sur */
|
2723
|
+
cu_end = cu_start;
|
2724
|
+
U16_FWD_N(ICU_PTR(str), cu_end, ICU_LEN(str), cp_len); /* care sur */
|
2725
|
+
|
2726
|
+
return rb_range_new(LONG2NUM(cu_start), LONG2NUM(cu_end-1), 0);
|
2727
|
+
}
|
2728
|
+
/**
|
2729
|
+
* call-seq:
|
2730
|
+
* str.unit_count
|
2731
|
+
*
|
2732
|
+
* returns number of code units in string.
|
2733
|
+
*
|
2734
|
+
*/
|
2735
|
+
VALUE icu_ustr_unit_count(VALUE str){
    /* ICU_LEN(str) converted to a Ruby integer. */
    VALUE units = LONG2NUM(ICU_LEN(str));

    return units;
}
|
2738
|
+
/**
|
2739
|
+
* call-seq:
|
2740
|
+
* str.point_count
|
2741
|
+
*
|
2742
|
+
* returns number of code points in string.
|
2743
|
+
*
|
2744
|
+
*/
|
2745
|
+
VALUE icu_ustr_point_count(VALUE str){
    /* u_countChar32 over the whole buffer, converted to a Ruby integer. */
    VALUE points = LONG2NUM(u_countChar32(ICU_PTR(str), ICU_LEN(str)));

    return points;
}
|
2748
|
+
|
2749
|
+
UChar icu_uchar_at(int32_t offset, void * context)
|
2750
|
+
{
|
2751
|
+
return ((UChar*)context)[offset];
|
2752
|
+
}
|
2753
|
+
/**
|
2754
|
+
* call-seq:
|
2755
|
+
* str.unescape => new_str
|
2756
|
+
*
|
2757
|
+
* Unescape a string of characters.
|
2758
|
+
*
|
2759
|
+
* The following escape sequences are recognized:
|
2760
|
+
* \uhhhh 4 hex digits; h in [0-9A-Fa-f]
|
2761
|
+
* \Uhhhhhhhh 8 hex digits
|
2762
|
+
* \xhh 1-2 hex digits \x{h...} 1-8 hex digits
|
2763
|
+
* \ooo 1-3 octal digits; o in [0-7]
|
2764
|
+
* \cX control-X; X is masked with 0x1F
|
2765
|
+
*
|
2766
|
+
* as well as the standard ANSI C escapes:
|
2767
|
+
* \a => U+0007, \b => U+0008, \t => U+0009, \n => U+000A, \v => U+000B, \f => U+000C, \r => U+000D, \e => U+001B, \" => U+0022, \' => U+0027, \? => U+003F, \\ => U+005C
|
2768
|
+
*
|
2769
|
+
* If escape sequence is invalid, it is ignored.
|
2770
|
+
*
|
2771
|
+
* "\\u044D\\u043A\\u0440\\u0430\\u043D\\u0438\\u0440\\u043E\\u0432\\u0430\\u043D\\u0438\\u0435".u.unescape => "экранирование"
|
2772
|
+
*
|
2773
|
+
**/
|
2774
|
+
|
2775
|
+
VALUE icu_ustr_unescape(str)
|
2776
|
+
VALUE str;
|
2777
|
+
{
|
2778
|
+
UChar32 c32;
|
2779
|
+
int32_t offset, leng, i, segment_start;
|
2780
|
+
UChar * ptr;
|
2781
|
+
UChar buf[3];
|
2782
|
+
VALUE ret;
|
2783
|
+
offset = 0;
|
2784
|
+
segment_start = 0;
|
2785
|
+
leng = ICU_LEN(str);
|
2786
|
+
ptr = ICU_PTR(str);
|
2787
|
+
ret = icu_ustr_new(0, 0);
|
2788
|
+
while(offset < leng) {
|
2789
|
+
if( ptr[offset] == '\\' ) {
|
2790
|
+
ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, ptr+segment_start, offset-segment_start);
|
2791
|
+
++offset;
|
2792
|
+
c32 = u_unescapeAt(icu_uchar_at, &offset, leng, ICU_PTR(str));
|
2793
|
+
// append this char
|
2794
|
+
if( 0xFFFFFFFF == c32) continue;
|
2795
|
+
i = 0;
|
2796
|
+
U16_APPEND_UNSAFE(buf, i, c32);
|
2797
|
+
ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, buf, U16_LENGTH(c32));
|
2798
|
+
segment_start = offset;
|
2799
|
+
} else {
|
2800
|
+
++offset;
|
2801
|
+
}
|
2802
|
+
}
|
2803
|
+
if( segment_start < offset)
|
2804
|
+
ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, ptr+segment_start, offset-segment_start);
|
2805
|
+
|
2806
|
+
return ret;
|
2807
|
+
}
|
2808
|
+
|
2809
|
+
|
2810
|
+
|
2811
|
+
/* transliteration */
|
2812
|
+
extern VALUE icu_transliterate(UChar * str, int32_t str_len, UChar * id, int32_t id_len, UChar * rules, int32_t rule_len);
|
2813
|
+
/**
|
2814
|
+
* call-seq:
|
2815
|
+
* str.translit(id, [rules])
|
2816
|
+
*
|
2817
|
+
* Performs {transliteration}[http://icu.sourceforge.net/userguide/Transformations.html],
|
2818
|
+
* of this string, using given transform +id+ and +rules+
|
2819
|
+
*
|
2820
|
+
* "yukihiro matsumoto".u.translit("Latin-Hiragana".u) # => ゆきひろ まつもと
|
2821
|
+
* "hello".u.translit("null".u, ":: upper();".u) # => HELLO
|
2822
|
+
**/
|
2823
|
+
VALUE icu_ustr_translit(argc, argv, str)
|
2824
|
+
int argc;
|
2825
|
+
VALUE * argv ;
|
2826
|
+
VALUE str;
|
2827
|
+
{
|
2828
|
+
VALUE id, rules ;
|
2829
|
+
if(rb_scan_args(argc, argv, "11", &id, &rules) == 2) {
|
2830
|
+
Check_Class(rules, rb_cUString);
|
2831
|
+
} else rules = Qnil;
|
2832
|
+
|
2833
|
+
Check_Class(str, rb_cUString);
|
2834
|
+
Check_Class(id, rb_cUString);
|
2835
|
+
if( rules == Qnil) {
|
2836
|
+
return icu_transliterate(ICU_PTR(str), ICU_LEN(str), ICU_PTR(id), ICU_LEN(id), NULL, 0);
|
2837
|
+
} else {
|
2838
|
+
return icu_transliterate(ICU_PTR(str), ICU_LEN(str), ICU_PTR(id), ICU_LEN(id),
|
2839
|
+
ICU_PTR(rules), ICU_LEN(rules));
|
2840
|
+
}
|
2841
|
+
}
|
2842
|
+
void
|
2843
|
+
initialize_ustring(void)
|
2844
|
+
{
|
2845
|
+
UErrorCode status = U_ZERO_ERROR;
|
2846
|
+
u_init(&status);
|
2847
|
+
if( U_FAILURE(status) ){
|
2848
|
+
rb_raise(rb_eRuntimeError, "Can't initialize : %s", u_errorName(status));
|
2849
|
+
}
|
2850
|
+
s_UCA_collator = ucol_open("", &status);
|
2851
|
+
if( U_FAILURE(status) ){
|
2852
|
+
rb_raise(rb_eRuntimeError, "Can't initialize : %s", u_errorName(status));
|
2853
|
+
}
|
2854
|
+
s_case_UCA_collator = ucol_open("", &status);
|
2855
|
+
if( U_FAILURE(status) ){
|
2856
|
+
rb_raise(rb_eRuntimeError, "Can't initialize : %s", u_errorName(status));
|
2857
|
+
}
|
2858
|
+
ucol_setStrength(s_case_UCA_collator, UCOL_SECONDARY);
|
2859
|
+
|
2860
|
+
/*
|
2861
|
+
|
2862
|
+
Document-class: UString
|
2863
|
+
|
2864
|
+
UString is a string class that stores Unicode characters directly and provides
|
2865
|
+
similar functionality as the Ruby String class.
|
2866
|
+
|
2867
|
+
An UString string consists of 16-bit Unicode code units. A Unicode character
|
2868
|
+
may be stored with either one code unit which is the most common case or with a matched
|
2869
|
+
pair of special code units ("surrogates").
|
2870
|
+
|
2871
|
+
For single-character handling, a Unicode character code point is a value in the
|
2872
|
+
range 0..0x10ffff.
|
2873
|
+
|
2874
|
+
Indexes and offsets into and lengths of strings always count code units, not code points.
|
2875
|
+
This is the same as with multi-byte char* strings in traditional string handling.
|
2876
|
+
Operations on partial strings typically do not test for code point boundaries.
|
2877
|
+
|
2878
|
+
In order to use the collation, text boundary analysis, formatting and other ICU APIs,
|
2879
|
+
Unicode strings must be used. In order to get Unicode strings from your native codepage,
|
2880
|
+
you can use the conversion API.
|
2881
|
+
|
2882
|
+
UString class is also point for access to several ICU services, instead of
|
2883
|
+
mirroring ICU class hierarchy.
|
2884
|
+
|
2885
|
+
==== Methods by category:
|
2886
|
+
|
2887
|
+
- concat and modify: + , * , << , #concat , #replace
|
2888
|
+
|
2889
|
+
- element reference, insert, replace: [] , #slice , []= , #slice! , #insert , #char_span
|
2890
|
+
|
2891
|
+
- comparisons: <=> , == , #casecmp , #strcoll
|
2892
|
+
|
2893
|
+
- size and positions: #length , #point_count , #clear , #empty? , #conv_unit_range , #conv_point_range
|
2894
|
+
|
2895
|
+
- index/search methods: #index , #rindex , #include? , #search
|
2896
|
+
|
2897
|
+
- regexps, matching and replacing: =~ , #match , #scan , #split , #sub , #sub! , #gsub , #gsub!
|
2898
|
+
|
2899
|
+
- conversion String/UString: #to_s, Kernel#u, String#to_u
|
2900
|
+
|
2901
|
+
- iterators: #each_line_break , #each_word , #each_char , #each_sentence
|
2902
|
+
|
2903
|
+
- split to chars/codepoints: #chars , #codepoints , Array#to_u
|
2904
|
+
|
2905
|
+
- character case: #upcase , #upcase! , #downcase , #downcase!
|
2906
|
+
|
2907
|
+
- stripping spaces: #strip , #lstrip , #rstrip , #strip! , #lstrip! , #rstrip!
|
2908
|
+
|
2909
|
+
- formatting and parsing: #format , #parse_date , #to_f
|
2910
|
+
|
2911
|
+
- UNICODE normalization: #norm_C , #norm_D , #norm_KC , #norm_KD , #norm_FCD
|
2912
|
+
|
2913
|
+
- utilities: #unescape , #hash , #inspect , #inspect_names , #translit
|
2914
|
+
|
2915
|
+
- ICU avalable info: #list_coll , #list_locales , #list_translits
|
2916
|
+
*/
|
2917
|
+
rb_cUString = rb_define_class("UString", rb_cObject);
|
2918
|
+
rb_include_module(rb_cUString, rb_mComparable);
|
2919
|
+
|
2920
|
+
/* initializations */
|
2921
|
+
rb_define_alloc_func(rb_cUString, icu_ustr_alloc);
|
2922
|
+
rb_define_method(rb_cUString, "initialize", icu_ustr_init, -1);
|
2923
|
+
rb_define_method(rb_cUString, "initialize_copy", icu_ustr_replace, 1);
|
2924
|
+
rb_define_method(rb_cUString, "replace", icu_ustr_replace, 1);
|
2925
|
+
|
2926
|
+
/* comparisons */
|
2927
|
+
rb_define_method(rb_cUString, "<=>", icu_ustr_cmp_m, 1);
|
2928
|
+
rb_define_method(rb_cUString, "==", icu_ustr_equal, 1);
|
2929
|
+
rb_define_method(rb_cUString, "casecmp", icu_ustr_casecmp, 1);
|
2930
|
+
rb_define_singleton_method(rb_cUString, "strcoll", icu_ustr_coll, -1);
|
2931
|
+
|
2932
|
+
/* ICU avalable info */
|
2933
|
+
rb_define_singleton_method(rb_cUString, "list_coll", icu_ustr_list_coll, 0);
|
2934
|
+
rb_define_singleton_method(rb_cUString, "list_locales", icu_ustr_list_locales, 0);
|
2935
|
+
rb_define_singleton_method(rb_cUString, "list_translits", icu_ustr_list_translits, 0);
|
2936
|
+
|
2937
|
+
/* hash code */
|
2938
|
+
rb_define_method(rb_cUString, "hash", icu_ustr_hash_m, 0);
|
2939
|
+
|
2940
|
+
/* inspect */
|
2941
|
+
rb_define_method(rb_cUString, "inspect", icu_ustr_inspect, 0);
|
2942
|
+
rb_define_method(rb_cUString, "inspect_names", icu_ustr_inspect_names, 0);
|
2943
|
+
|
2944
|
+
/* size */
|
2945
|
+
rb_define_method(rb_cUString, "length", icu_ustr_length, 0);
|
2946
|
+
rb_define_alias (rb_cUString, "size", "length");
|
2947
|
+
rb_define_method(rb_cUString, "unit_count", icu_ustr_unit_count, 0);
|
2948
|
+
rb_define_method(rb_cUString, "point_count", icu_ustr_point_count, 0);
|
2949
|
+
rb_define_method(rb_cUString, "clear", icu_ustr_clear, 0);
|
2950
|
+
rb_define_method(rb_cUString, "empty?", icu_ustr_empty, 0);
|
2951
|
+
|
2952
|
+
/* UNICODE normalization */
|
2953
|
+
rb_define_method(rb_cUString, "norm_C", icu_ustr_normalize_C, 0);
|
2954
|
+
rb_define_method(rb_cUString, "norm_D", icu_ustr_normalize_D, 0);
|
2955
|
+
rb_define_method(rb_cUString, "norm_KC", icu_ustr_normalize_KC, 0);
|
2956
|
+
rb_define_method(rb_cUString, "norm_KD", icu_ustr_normalize_KD, 0);
|
2957
|
+
rb_define_method(rb_cUString, "norm_FCD", icu_ustr_normalize_FCD, 0);
|
2958
|
+
|
2959
|
+
/* iterators */
|
2960
|
+
rb_define_method(rb_cUString, "each_line_break", icu_ustr_each_line, -1);
|
2961
|
+
rb_define_method(rb_cUString, "each_word", icu_ustr_each_word, -1);
|
2962
|
+
rb_define_method(rb_cUString, "each_char", icu_ustr_each_char, -1);
|
2963
|
+
rb_define_method(rb_cUString, "each_sentence", icu_ustr_each_sentence, -1);
|
2964
|
+
rb_define_alias(rb_cUString, "each", "each_line_break");
|
2965
|
+
|
2966
|
+
/* split to chars/codepoints */
|
2967
|
+
rb_define_method(rb_cUString, "chars", icu_ustr_chars_m, -1);
|
2968
|
+
rb_define_method(rb_cUString, "char_span", icu_ustr_char_span, -1);
|
2969
|
+
rb_define_method(rb_cUString, "codepoints", icu_ustr_points, 0);
|
2970
|
+
|
2971
|
+
/* concat operations */
|
2972
|
+
rb_define_method(rb_cUString, "+", icu_ustr_plus, 1);
|
2973
|
+
rb_define_method(rb_cUString, "*", icu_ustr_times, 1);
|
2974
|
+
rb_define_method(rb_cUString, "concat", icu_ustr_concat, 1);
|
2975
|
+
rb_define_alias( rb_cUString, "<<", "concat");
|
2976
|
+
|
2977
|
+
/* character case */
|
2978
|
+
rb_define_method(rb_cUString, "upcase", icu_ustr_upcase, -1);
|
2979
|
+
rb_define_method(rb_cUString, "upcase!", icu_ustr_upcase_bang, -1);
|
2980
|
+
rb_define_method(rb_cUString, "downcase", icu_ustr_downcase, -1);
|
2981
|
+
rb_define_method(rb_cUString, "downcase!", icu_ustr_downcase_bang, -1);
|
2982
|
+
rb_define_method(rb_cUString, "foldcase", icu_ustr_foldcase, 0);
|
2983
|
+
|
2984
|
+
/* stripping spaces */
|
2985
|
+
rb_define_method(rb_cUString, "strip", icu_ustr_strip, 0);
|
2986
|
+
rb_define_method(rb_cUString, "lstrip", icu_ustr_lstrip, 0);
|
2987
|
+
rb_define_method(rb_cUString, "rstrip", icu_ustr_rstrip, 0);
|
2988
|
+
|
2989
|
+
rb_define_method(rb_cUString, "strip!", icu_ustr_strip_bang, 0);
|
2990
|
+
rb_define_method(rb_cUString, "lstrip!", icu_ustr_lstrip_bang, 0);
|
2991
|
+
rb_define_method(rb_cUString, "rstrip!", icu_ustr_rstrip_bang, 0);
|
2992
|
+
|
2993
|
+
/* index/search methods */
|
2994
|
+
rb_define_method(rb_cUString, "index", icu_ustr_index_m, -1);
|
2995
|
+
rb_define_method(rb_cUString, "rindex", icu_ustr_rindex_m, -1);
|
2996
|
+
rb_define_method(rb_cUString, "include?", icu_ustr_include, 1);
|
2997
|
+
rb_define_method(rb_cUString, "search", icu_ustr_search, -1);
|
2998
|
+
|
2999
|
+
/* element reference */
|
3000
|
+
rb_define_method(rb_cUString, "[]", icu_ustr_aref_m, -1);
|
3001
|
+
rb_define_alias(rb_cUString, "slice", "[]");
|
3002
|
+
|
3003
|
+
/* codeunit/codepoint conversion */
|
3004
|
+
rb_define_method(rb_cUString, "conv_unit_range", icu_ustr_convert_unit_range, 1);
|
3005
|
+
rb_define_method(rb_cUString, "conv_point_range", icu_ustr_convert_point_range, 1);
|
3006
|
+
|
3007
|
+
/* insert/replace */
|
3008
|
+
rb_define_method(rb_cUString, "[]=", icu_ustr_aset_m, -1);
|
3009
|
+
rb_define_method(rb_cUString, "slice!", icu_ustr_slice_bang, -1);
|
3010
|
+
rb_define_method(rb_cUString, "insert", icu_ustr_insert, 2);
|
3011
|
+
|
3012
|
+
/* conversion to String from UString */
|
3013
|
+
rb_define_method(rb_cUString, "to_u", icu_ustr_to_ustr, -1);
|
3014
|
+
rb_define_method(rb_cUString, "to_s", icu_ustr_to_rstr, -1);
|
3015
|
+
rb_define_alias(rb_cUString, "to_str", "to_s");
|
3016
|
+
|
3017
|
+
/* formatting messages */
|
3018
|
+
rb_define_method(rb_cUString, "format", icu_ustr_format, -2);
|
3019
|
+
rb_define_alias( rb_cUString, "fmt", "format");
|
3020
|
+
|
3021
|
+
/* parsing */
|
3022
|
+
rb_define_method(rb_cUString, "parse_date", icu_ustr_parse_date, 2);
|
3023
|
+
rb_define_method(rb_cUString, "to_f", icu_ustr_parse_double, -1);
|
3024
|
+
|
3025
|
+
/* transliteration */
|
3026
|
+
rb_define_method(rb_cUString, "translit", icu_ustr_translit, -1);
|
3027
|
+
|
3028
|
+
/* unescaping */
|
3029
|
+
rb_define_method(rb_cUString, "unescape", icu_ustr_unescape, 0);
|
3030
|
+
|
3031
|
+
/* regexp matching and replacing */
|
3032
|
+
rb_define_method(rb_cUString, "=~", icu_ustr_match, 1);
|
3033
|
+
rb_define_method(rb_cUString, "match", icu_ustr_match_m, 1);
|
3034
|
+
rb_define_method(rb_cUString, "scan", icu_ustr_scan, 1);
|
3035
|
+
rb_define_method(rb_cUString, "split", icu_ustr_split_m, -1);
|
3036
|
+
rb_define_method(rb_cUString, "sub", icu_ustr_sub, -1);
|
3037
|
+
rb_define_method(rb_cUString, "sub!", icu_ustr_sub_bang, -1);
|
3038
|
+
rb_define_method(rb_cUString, "gsub", icu_ustr_gsub, -1);
|
3039
|
+
rb_define_method(rb_cUString, "gsub!", icu_ustr_gsub_bang, -1);
|
3040
|
+
|
3041
|
+
}
|
3042
|
+
|