icu4r_19 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,27 @@
1
+ extern void icu_regex_free (ICURegexp *ptr);
2
+ extern VALUE icu_reg_s_alloc (VALUE klass);
3
+ extern VALUE icu_reg_initialize_m (int argc, VALUE *argv, VALUE self);
4
+ extern VALUE icu_reg_new (UChar *s, long len, int options) ;
5
+ extern VALUE icu_reg_clone (VALUE obj);
6
+ extern VALUE icu_reg_comp (VALUE str);
7
+ extern VALUE icu_reg_from_rb_reg (VALUE re);
8
+ extern VALUE icu_reg_to_u (VALUE self);
9
+ extern VALUE icu_reg_split (VALUE self, VALUE str, VALUE limit);
10
+ extern VALUE icu_reg_nth_match (VALUE re, long nth);
11
+ extern VALUE icu_reg_range (VALUE re, int nth, long *start, long *end);
12
+ extern VALUE icu_reg_match (VALUE re, VALUE str);
13
+ extern VALUE icu_reg_eqq (VALUE re, VALUE str);
14
+ extern int icu_reg_find_next (VALUE pat);
15
+ extern VALUE icu_reg_get_replacement (VALUE pat, VALUE repl_text, long prev_end);
16
+ extern VALUE icu_reg_get_prematch (VALUE pat, long prev_end);
17
+ extern VALUE icu_reg_get_tail (VALUE pat, long prev_end);
18
+ extern VALUE icu_reg_from_rb_str (int argc, VALUE *argv, VALUE obj);
19
+ extern VALUE icu_umatch_range (VALUE match, VALUE index);
20
+ extern VALUE icu_umatch_size (VALUE match);
21
+ extern VALUE icu_umatch_init (VALUE self, VALUE re);
22
+ extern VALUE icu_umatch_aref (VALUE match, VALUE idx);
23
+ extern VALUE icu_umatch_new (VALUE re);
24
+ extern long icu_group_count(VALUE re);
25
+ extern long icu_reg_search(VALUE re, VALUE str, int pos, int reverse);
26
+
27
+ extern void initialize_uregexp (void);
@@ -0,0 +1,3039 @@
1
+ /**
2
+ * ustring.c - ICU based Unicode string support.
3
+ *
4
+ * $Id: ustring.c,v 1.20 2006/01/23 14:26:45 meadow Exp $
5
+ *
6
+ * Copyright (c) 2006 Nikolai Lugovoi
7
+ *
8
+ * This code is based on original ruby String class source (string.c):
9
+ *
10
+ * * string.c -
11
+ * *
12
+ * * Copyright (C) 1993-2003 Yukihiro Matsumoto
13
+ * * Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
14
+ * * Copyright (C) 2000 Information-technology Promotion Agency, Japan
15
+ * *
16
+ **/
17
+
18
+ #include "icu_common.h"
19
+ VALUE icu_ustr_replace(VALUE str, VALUE str2);
20
+ VALUE ustr_gsub(int argc, VALUE * argv, VALUE str, int bang, int once);
21
+ extern VALUE icu_from_rstr(int argc, VALUE * argv, VALUE str);
22
+
23
+ VALUE rb_cURegexp;
24
+ VALUE rb_cUString;
25
+ VALUE rb_cUMatch;
26
+ VALUE rb_cUResourceBundle;
27
+ VALUE rb_cULocale;
28
+ VALUE rb_cUCalendar;
29
+ VALUE rb_cUConverter;
30
+ VALUE rb_cUCollator;
31
+
32
+ #include "uregex.h"
33
+
34
+
35
+ /* to be used in <=>, casecmp */
36
+ static UCollator * s_UCA_collator, * s_case_UCA_collator;
37
+
38
+ static void
39
+ free_ustr(str)
40
+ ICUString *str;
41
+ {
42
+ if (str->ptr)
43
+ free(str->ptr);
44
+ str->ptr = 0;
45
+ free(str);
46
+ }
47
+ inline void icu_check_frozen(int check_busy, VALUE str)
48
+ {
49
+ rb_check_frozen(str);
50
+ if(check_busy && USTRING(str)->busy > 0 ) rb_raise(rb_eRuntimeError, "String is busy. Can't modify");
51
+ }
52
+ #define START_BUF_LEN 16
53
+ /**
54
+ * Allocate ICUString struct with given +capa+ capacity,
55
+ * if mode == 1 and UChar != 0 - copy len UChars from src,
56
+ * else set pointer to src.
57
+ */
58
+ #define ICU_COPY 1
59
+ #define ICU_SET 0
60
+ VALUE icu_ustr_alloc_and_wrap(UChar * src, long len, long capa, int mode)
61
+ {
62
+ ICUString *n_str = ALLOC_N(ICUString, 1);
63
+ size_t alloc_capa;
64
+ if( mode == ICU_COPY ) {
65
+ alloc_capa = START_BUF_LEN > capa ? START_BUF_LEN : capa;
66
+ if(alloc_capa<=len) alloc_capa = len + 1;
67
+ n_str->ptr = ALLOC_N(UChar, alloc_capa);
68
+ n_str->capa = alloc_capa;
69
+ n_str->len = len;
70
+ if( src ) {
71
+ u_memcpy(n_str->ptr, src, len);
72
+ n_str->ptr[len] = 0;
73
+ }
74
+ } else {
75
+ n_str->ptr = src;
76
+ n_str->len = len;
77
+ n_str->capa = capa;
78
+ }
79
+ if(n_str->capa <= n_str->len) rb_raise(rb_eRuntimeError, "Capacity is not large then len, sentinel can't be set!");
80
+ n_str->busy = 0;
81
+ n_str->ptr[n_str->len] = 0;
82
+ return Data_Wrap_Struct(rb_cUString, 0, free_ustr, n_str);
83
+ }
84
+ VALUE
85
+ icu_ustr_alloc(klass)
86
+ VALUE klass;
87
+ {
88
+ return icu_ustr_alloc_and_wrap(NULL, 0, 0, ICU_COPY);
89
+ }
90
+ void ustr_capa_resize(ICUString * str, long new_capa)
91
+ {
92
+ if (new_capa != str->capa) {
93
+ if (str->capa < new_capa || (str->capa - new_capa > 1024)) {
94
+ if(new_capa < START_BUF_LEN) new_capa = START_BUF_LEN;
95
+ REALLOC_N(str->ptr, UChar, new_capa);
96
+ str->capa = new_capa;
97
+ }
98
+ }
99
+ }
100
+ /* delete +del_len+ units from string and insert replacement */
101
+ void ustr_splice_units(ICUString * str, long start, long del_len, const UChar * replacement, long repl_len)
102
+ {
103
+ long new_len;
104
+ UChar * temp = 0 ;
105
+ if( str->busy ) {
106
+ rb_warn("Attempt to modify busy string. Ignored");
107
+ return;
108
+ }
109
+ if( repl_len < 0) return;
110
+ if( del_len == 0 && repl_len == 0) return;
111
+ new_len = str->len - del_len + repl_len;
112
+ if (replacement == str->ptr ) {
113
+ temp = ALLOC_N(UChar, repl_len);
114
+ u_memcpy(temp, replacement, repl_len);
115
+ replacement = temp;
116
+ }
117
+ if ( repl_len >= del_len) ustr_capa_resize(str, new_len+1);
118
+ /* move tail */
119
+ if(str->len - (start+del_len) > 0) {
120
+ u_memmove(str->ptr + start+repl_len, str->ptr + start+del_len, str->len-(start+del_len) );
121
+ }
122
+ /* copy string */
123
+ if( repl_len > 0) u_memcpy(str->ptr+start, replacement, repl_len);
124
+ if ( repl_len < del_len) ustr_capa_resize(str, new_len+1);
125
+ str->len = new_len;
126
+ str->ptr[new_len] = 0;
127
+ if(temp) {
128
+ free(temp);
129
+ }
130
+ }
131
+ static inline void
132
+ ustr_mod_check(VALUE s, UChar *p, long len)
133
+ {
134
+ if (ICU_PTR(s) != p || ICU_LEN(s) != len){
135
+ rb_raise(rb_eRuntimeError, "string modified");
136
+ }
137
+ }
138
+ VALUE
139
+ ustr_new(klass, ptr, len)
140
+ VALUE klass;
141
+ UChar *ptr;
142
+ long len;
143
+ {
144
+ if (len < 0) {
145
+ rb_raise(rb_eArgError, "negative string size (or size too big)");
146
+ }
147
+ return icu_ustr_alloc_and_wrap(ptr, len, len+1, ICU_COPY);
148
+ }
149
+
150
+ VALUE
151
+ icu_ustr_new(ptr, len)
152
+ const UChar *ptr;
153
+ long len;
154
+ {
155
+ return ustr_new(rb_cUString, ptr, len);
156
+ }
157
+ VALUE
158
+ icu_ustr_new_set(ptr, len, capa)
159
+ UChar *ptr;
160
+ long len;
161
+ long capa;
162
+ {
163
+ return icu_ustr_alloc_and_wrap(ptr, len, capa, ICU_SET);
164
+ }
165
+ VALUE
166
+ icu_ustr_new2(ptr)
167
+ const UChar *ptr;
168
+ {
169
+ if (!ptr) {
170
+ rb_raise(rb_eArgError, "NULL pointer given");
171
+ }
172
+ return icu_ustr_new(ptr, u_strlen(ptr));
173
+ }
174
+
175
+ inline VALUE
176
+ icu_ustr_new_capa(UChar * ptr, long len, long capa)
177
+ {
178
+ return icu_ustr_alloc_and_wrap(ptr, len, capa, ICU_COPY);
179
+ }
180
+
181
+ /* ------------ */
182
+
183
+ /**
184
+ * call-seq:
185
+ * UString.new(str="".u) => new_str
186
+ *
187
+ * Returns a new string object containing a copy of <i>str</i>.
188
+ */
189
+
190
+ VALUE
191
+ icu_ustr_init(argc, argv, str)
192
+ int argc;
193
+ VALUE *argv;
194
+ VALUE str;
195
+ {
196
+ VALUE orig;
197
+
198
+ if (rb_scan_args(argc, argv, "01", &orig) == 1)
199
+ {
200
+ icu_ustr_replace(str, orig);
201
+ }
202
+ return str;
203
+ }
204
+
205
+ /**
206
+ * call-seq:
207
+ * str.length => integer
208
+ *
209
+ * Returns the length of <i>str</i>.
210
+ */
211
+ VALUE
212
+ icu_ustr_length(str)
213
+ VALUE str;
214
+ {
215
+ return LONG2NUM(ICU_LEN(str));
216
+ }
217
+
218
+ /**
219
+ * call-seq:
220
+ * str.empty? => true or false
221
+ *
222
+ * Returns <code>true</code> if <i>str</i> has a length of zero.
223
+ *
224
+ * "hello".u.empty? #=> false
225
+ * "".u.empty? #=> true
226
+ */
227
+
228
+ VALUE
229
+ icu_ustr_empty(str)
230
+ VALUE str;
231
+ {
232
+ return 0 == ICU_LEN(str) ? Qtrue : Qfalse;
233
+ }
234
+
235
+ VALUE
236
+ icu_ustr_resize(str, len)
237
+ VALUE str;
238
+ long len;
239
+ {
240
+ if (len < 0) {
241
+ rb_raise(rb_eArgError, "negative string size (or size too big)");
242
+ }
243
+ ustr_capa_resize(USTRING(str), len);
244
+ ICU_LEN(str) = len;
245
+ ICU_PTR(str)[len] = 0; /* sentinel */
246
+ return str;
247
+ }
248
+
249
+
250
+ /**
251
+ * call-seq:
252
+ * str.replace(other_str) => str
253
+ *
254
+ * Replaces the contents and taintedness of <i>str</i> with the corresponding
255
+ * values in <i>other_str</i>.
256
+ *
257
+ * s = "hello".u #=> "hello"
258
+ * s.replace "world".u #=> "world"
259
+ */
260
+ VALUE
261
+ icu_ustr_replace(str, str2)
262
+ VALUE str,
263
+ str2;
264
+ {
265
+ if (str == str2)
266
+ return str;
267
+ icu_check_frozen(1, str);
268
+ Check_Class(str2, rb_cUString);
269
+ ustr_splice_units(USTRING(str), 0, ICU_LEN(str), ICU_PTR(str2), ICU_LEN(str2));
270
+ OBJ_INFECT(str, str2);
271
+ return str;
272
+ }
273
+
274
+ /**
275
+ * call-seq:
276
+ * string.clear -> string
277
+ *
278
+ * Makes string empty.
279
+ *
280
+ * a = "abcde".u
281
+ * a.clear #=> ""
282
+ */
283
+
284
+ VALUE
285
+ icu_ustr_clear(str)
286
+ VALUE str;
287
+ {
288
+ icu_check_frozen(1, str);
289
+ icu_ustr_resize(str, 0);
290
+ return str;
291
+ }
292
+
293
+ int icu_collator_cmp (UCollator * collator, VALUE str1, VALUE str2)
294
+ {
295
+ int ret = 0, result ;
296
+ result = ucol_strcoll(collator, ICU_PTR(str1), ICU_LEN(str1), ICU_PTR(str2), ICU_LEN(str2));
297
+ switch(result){
298
+ case UCOL_EQUAL: ret = 0;break;
299
+ case UCOL_GREATER: ret = 1;break;
300
+ case UCOL_LESS: ret = -1;break;
301
+ }
302
+ return ret;
303
+ }
304
+
305
+ int
306
+ icu_ustr_cmp(str1, str2)
307
+ VALUE str1,
308
+ str2;
309
+ {
310
+ return icu_collator_cmp(s_UCA_collator, str1, str2);
311
+ }
312
+
313
+ /**
314
+ * call-seq:
315
+ * str == obj => true or false
316
+ *
317
+ * Equality---If <i>obj</i> is not a <code>UString</code>, returns
318
+ * <code>false</code>. Otherwise, returns <code>true</code> if
319
+ * strings are of the same length and content
320
+ *
321
+ */
322
+
323
+ VALUE
324
+ icu_ustr_equal(str1, str2)
325
+ VALUE str1,
326
+ str2;
327
+ {
328
+ if (str1 == str2)
329
+ return Qtrue;
330
+ if (CLASS_OF(str2) != rb_cUString) {
331
+ return Qfalse;
332
+ }
333
+ if (ICU_LEN(str1) == ICU_LEN(str2) &&
334
+ u_strncmp(ICU_PTR(str1), ICU_PTR(str2), ICU_LEN(str1) ) == 0) {
335
+ return Qtrue;
336
+ }
337
+ return Qfalse;
338
+ }
339
+
340
+ /**
341
+ * call-seq:
342
+ * str <=> other_str => -1, 0, +1
343
+ *
344
+ * Comparison---Returns -1 if <i>other_str</i> is less than, 0 if
345
+ * <i>other_str</i> is equal to, and +1 if <i>other_str</i> is greater than
346
+ * <i>str</i>.
347
+ *
348
+ * <code><=></code> is the basis for the methods <code><</code>,
349
+ * <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
350
+ * included from module <code>Comparable</code>. The method
351
+ * <code>String#==</code> does not use <code>Comparable#==</code>.
352
+ *
353
+ * This method uses UCA rules, see also #strcoll for locale-specific string collation.
354
+ *
355
+ * "abcdef".u <=> "abcde".u #=> 1
356
+ * "abcdef".u <=> "abcdef".u #=> 0
357
+ * "abcdef".u <=> "abcdefg".u #=> -1
358
+ * "abcdef".u <=> "ABCDEF".u #=> -1
359
+ */
360
+
361
+ VALUE
362
+ icu_ustr_cmp_m(str1, str2)
363
+ VALUE str1,
364
+ str2;
365
+ {
366
+ long result;
367
+
368
+ if (CLASS_OF(str2) != rb_cUString) {
369
+ return Qnil;
370
+ } else {
371
+ result = icu_ustr_cmp(str1, str2);
372
+ }
373
+ return LONG2NUM(result);
374
+ }
375
+
376
+ /**
377
+ * call-seq:
378
+ * str.casecmp(other_str) => -1, 0, +1
379
+ *
380
+ * Case-insensitive version of <code>UString#<=></code> .
381
+ * This method uses UCA collator with secondary strength, see #strcoll
382
+ *
383
+ *
384
+ * "abcdef".u.casecmp("abcde".u) #=> 1
385
+ * "aBcDeF".u.casecmp("abcdef".u) #=> 0
386
+ * "abcdef".u.casecmp("abcdefg".u) #=> -1
387
+ * "abcdef".u.casecmp("ABCDEF".u) #=> 0
388
+ */
389
+
390
+ VALUE
391
+ icu_ustr_casecmp(str1, str2)
392
+ VALUE str1,
393
+ str2;
394
+ {
395
+ Check_Class(str2, rb_cUString);
396
+ return INT2FIX(icu_collator_cmp(s_case_UCA_collator, str1, str2));
397
+ }
398
+
399
+ /**
400
+ * call-seq:
401
+ * str + other_str => new_str
402
+ *
403
+ * Concatenation---Returns a new <code>UString</code> containing
404
+ * <i>other_str</i> concatenated to <i>str</i>.
405
+ *
406
+ * "Hello from ".u + "main".u #=> "Hello from main"
407
+ */
408
+
409
+ VALUE
410
+ icu_ustr_plus(str1, str2)
411
+ VALUE str1,
412
+ str2;
413
+ {
414
+ VALUE str3;
415
+ Check_Class(str2, rb_cUString);
416
+
417
+ str3 = icu_ustr_new_capa(ICU_PTR(str1), ICU_LEN(str1), ICU_LEN(str1) + ICU_LEN(str2));
418
+ ustr_splice_units(USTRING(str3), ICU_LEN(str3), 0, ICU_PTR(str2), ICU_LEN(str2));
419
+ if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
420
+ OBJ_TAINT(str3);
421
+ return str3;
422
+ }
423
+
424
+ /**
425
+ * call-seq:
426
+ * str * integer => new_str
427
+ *
428
+ * Copy---Returns a new <code>UString</code> containing <i>integer</i> copies of
429
+ * the receiver.
430
+ *
431
+ * "Ho! ".u * 3 #=> "Ho! Ho! Ho! ".u
432
+ */
433
+
434
+ VALUE
435
+ icu_ustr_times(str, times)
436
+ VALUE str,
437
+ times;
438
+ {
439
+ VALUE str2;
440
+ long i,
441
+ len;
442
+ Check_Type(times, T_FIXNUM);
443
+ len = NUM2LONG(times);
444
+ if (len < 0) {
445
+ rb_raise(rb_eArgError, "negative argument");
446
+ }
447
+ if (len && LONG_MAX / len < ICU_LEN(str)) {
448
+ rb_raise(rb_eArgError, "argument too big");
449
+ }
450
+
451
+ str2 = icu_ustr_new_capa(0, 0, len *= ICU_LEN(str));
452
+ for (i = 0; i < len; i += ICU_LEN(str)) {
453
+ ustr_splice_units(USTRING(str2), i, 0, ICU_PTR(str), ICU_LEN(str));
454
+ }
455
+ ICU_PTR(str2)[ICU_LEN(str2)] = 0;
456
+
457
+ OBJ_INFECT(str2, str);
458
+
459
+ return str2;
460
+ }
461
+
462
+
463
+ /**
464
+ * call-seq:
465
+ * str << other_str => str
466
+ * str.concat(other_str) => str
467
+ *
468
+ * Append---Concatenates the given string object to <i>str</i>.
469
+ *
470
+ * a = "hello ".u
471
+ * a << "world".u #=> "hello world"
472
+ */
473
+
474
+ VALUE
475
+ icu_ustr_concat(str1, str2)
476
+ VALUE str1,
477
+ str2;
478
+ {
479
+ icu_check_frozen(1, str1);
480
+ Check_Class(str2, rb_cUString);
481
+ if (ICU_LEN(str2) > 0) {
482
+ ustr_splice_units(USTRING(str1), ICU_LEN(str1), 0, ICU_PTR(str2), ICU_LEN(str2));
483
+ OBJ_INFECT(str1, str2);
484
+ }
485
+ return str1;
486
+ }
487
+
488
+ int
489
+ icu_ustr_hash(str)
490
+ VALUE str;
491
+ {
492
+ register long len = ICU_LEN(str) * (sizeof(UChar));
493
+ register char *p = (char*)ICU_PTR(str);
494
+ register int key = 0;
495
+
496
+ while (len--) {
497
+ key += *p++;
498
+ key += (key << 10);
499
+ key ^= (key >> 6);
500
+ }
501
+ key += (key << 3);
502
+ key ^= (key >> 11);
503
+ key += (key << 15);
504
+ return key;
505
+ }
506
+
507
+ /**
508
+ * call-seq:
509
+ * str.hash => fixnum
510
+ *
511
+ * Return a hash based on the string's length and content.
512
+ */
513
+
514
+ VALUE
515
+ icu_ustr_hash_m(str)
516
+ VALUE str;
517
+ {
518
+ int key = icu_ustr_hash(str);
519
+ return INT2FIX(key);
520
+ }
521
+
522
+ VALUE
523
+ icu_ustr_dup(str)
524
+ VALUE str;
525
+ {
526
+ VALUE dup = icu_ustr_new(ICU_PTR(str), ICU_LEN(str));
527
+ return dup;
528
+ }
529
+
530
+ /**
531
+ * call-seq:
532
+ * str.upcase!(locale = "") => str or nil
533
+ *
534
+ * Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
535
+ * were made. This method is locale-sensitive.
536
+ */
537
+
538
+ VALUE
539
+ icu_ustr_upcase_bang(argc, argv, str)
540
+ int argc;
541
+ VALUE * argv;
542
+ VALUE str;
543
+
544
+ {
545
+ UErrorCode error = 0;
546
+ UChar *buf = 0;
547
+ long len ;
548
+ VALUE loc;
549
+ char * locale = NULL;
550
+ icu_check_frozen(1, str);
551
+ buf = ALLOC_N(UChar, ICU_LEN(str) + 1);
552
+ if (rb_scan_args(argc, argv, "01", &loc) == 1) {
553
+ if( loc != Qnil) {
554
+ Check_Type(loc, T_STRING);
555
+ locale = RSTRING_PTR(loc);
556
+ }
557
+ }
558
+
559
+ len = u_strToUpper(buf, ICU_LEN(str), ICU_PTR(str), ICU_LEN(str), locale, &error);
560
+ if (U_BUFFER_OVERFLOW_ERROR == error) {
561
+ REALLOC_N(buf, UChar, len + 1);
562
+ error = 0;
563
+ len =
564
+ u_strToUpper(buf, len, ICU_PTR(str), ICU_LEN(str), locale, &error);
565
+ }
566
+ if (0 == u_strncmp(buf, ICU_PTR(str), len))
567
+ return Qnil;
568
+ free(ICU_PTR(str));
569
+ ICU_PTR(str) = buf;
570
+ ICU_LEN(str) = len;
571
+ return str;
572
+ }
573
+
574
+
575
+ /**
576
+ * call-seq:
577
+ * str.upcase(locale = "") => new_str
578
+ *
579
+ * Returns a copy of <i>str</i> with all lowercase letters replaced with their
580
+ * uppercase counterparts. The operation is locale sensitive.
581
+ *
582
+ * "hEllO".u.upcase #=> "HELLO"
583
+ */
584
+
585
+ VALUE
586
+ icu_ustr_upcase(argc, argv, str)
587
+ int argc;
588
+ VALUE * argv;
589
+ VALUE str;
590
+
591
+ {
592
+ str = icu_ustr_dup(str);
593
+ icu_ustr_upcase_bang(argc, argv, str);
594
+ return str;
595
+ }
596
+
597
+
598
+ /**
599
+ * call-seq:
600
+ * str.downcase!(locale = "") => str or nil
601
+ *
602
+ * Downcases the contents of <i>str</i>, returning <code>nil</code> if no
603
+ * changes were made.
604
+ */
605
+
606
+ VALUE
607
+ icu_ustr_downcase_bang(argc, argv, str)
608
+ int argc;
609
+ VALUE * argv;
610
+ VALUE str;
611
+ {
612
+ UErrorCode error = 0;
613
+ UChar *buf;
614
+ long len ;
615
+ VALUE loc;
616
+ char * locale = NULL;
617
+ buf = ALLOC_N(UChar, ICU_LEN(str) + 1);
618
+ icu_check_frozen(1, str);
619
+ if (rb_scan_args(argc, argv, "01", &loc) == 1) {
620
+ if( loc != Qnil) {
621
+ Check_Type(loc, T_STRING);
622
+ locale = RSTRING_PTR(loc);
623
+ }
624
+ }
625
+ len =
626
+ u_strToLower(buf, ICU_LEN(str), ICU_PTR(str), ICU_LEN(str), locale,
627
+ &error);
628
+ if (U_BUFFER_OVERFLOW_ERROR == error) {
629
+ REALLOC_N(buf, UChar, len + 1);
630
+ error = 0;
631
+ len =
632
+ u_strToLower(buf, len , ICU_PTR(str), ICU_LEN(str), locale,
633
+ &error);
634
+ }
635
+ if (0 == u_strncmp(buf, ICU_PTR(str), len))
636
+ return Qnil;
637
+ free(ICU_PTR(str));
638
+ ICU_PTR(str) = buf;
639
+ ICU_LEN(str) = len;
640
+ return str;
641
+ }
642
+
643
+ /**
644
+ * call-seq:
645
+ * str.downcase(locale = "") => new_str
646
+ *
647
+ * Returns a copy of <i>str</i> with all uppercase letters replaced with their
648
+ * lowercase counterparts. The operation is locale sensitive.
649
+ *
650
+ * "hEllO".u.downcase #=> "hello"
651
+ */
652
+
653
+ VALUE
654
+ icu_ustr_downcase(argc, argv, str)
655
+ int argc;
656
+ VALUE * argv;
657
+ VALUE str;
658
+ {
659
+ str = icu_ustr_dup(str);
660
+ icu_ustr_downcase_bang(argc, argv, str);
661
+ return str;
662
+ }
663
+
664
+ /**
665
+ * call-seq:
666
+ * str.foldcase
667
+ *
668
+ * Case-fold the characters in a string.
669
+ * Case-folding is locale-independent and not context-sensitive.
670
+ *
671
+ */
672
+ VALUE
673
+ icu_ustr_foldcase(str)
674
+ VALUE str;
675
+ {
676
+ UErrorCode error = 0;
677
+ UChar *buf;
678
+ long len, capa ;
679
+ capa = ICU_LEN(str) + 1;
680
+ buf = ALLOC_N(UChar, capa);
681
+ len = u_strFoldCase(buf, capa-1, ICU_PTR(str), ICU_LEN(str), U_FOLD_CASE_DEFAULT, &error);
682
+ if (U_BUFFER_OVERFLOW_ERROR == error) {
683
+ capa = len + 1;
684
+ REALLOC_N(buf, UChar, len + 1);
685
+ error = 0;
686
+ len = u_strFoldCase(buf, capa, ICU_PTR(str), ICU_LEN(str), U_FOLD_CASE_DEFAULT, &error);
687
+ }
688
+ return icu_ustr_new_set(buf, len, capa) ;
689
+ }
690
+
691
+ static long
692
+ icu_ustr_index(str, sub, offset)
693
+ VALUE str,
694
+ sub;
695
+ long offset;
696
+ {
697
+ long pos;
698
+ UChar *found;
699
+ if (offset < 0) {
700
+ offset += ICU_LEN(str);
701
+ if (offset < 0)
702
+ return -1;
703
+ }
704
+ if (ICU_LEN(str) - offset < ICU_LEN(sub))
705
+ return -1;
706
+ if (ICU_LEN(sub) == 0)
707
+ return offset;
708
+ found =
709
+ u_strFindFirst(ICU_PTR(str) + offset, ICU_LEN(str) - offset,
710
+ ICU_PTR(sub), ICU_LEN(sub));
711
+ if (NULL == found)
712
+ return -1;
713
+ pos = found - (ICU_PTR(str) + offset);
714
+ return pos + offset;
715
+ }
716
+
717
+ /**
718
+ * call-seq:
719
+ * str.index(substring [, offset]) => fixnum or nil
720
+ * str.index(regexp [, offset]) => fixnum or nil
721
+ *
722
+ * Returns the index of the first occurrence of the given <i>substring</i>,
723
+ * or pattern (<i>regexp</i>) in <i>str</i>. Returns
724
+ * <code>nil</code> if not found. If the second parameter is present, it
725
+ * specifies the position in the string to begin the search.
726
+ *
727
+ * "hello".u.index('e'.u) #=> 1
728
+ * "hello".u.index('lo'.u) #=> 3
729
+ * "hello".u.index('a'.u) #=> nil
730
+ * "hello".u.index(/[aeiou]/.U, -3) #=> 4
731
+ */
732
+
733
+ VALUE
734
+ icu_ustr_index_m(argc, argv, str)
735
+ int argc;
736
+ VALUE *argv;
737
+ VALUE str;
738
+ {
739
+ VALUE sub;
740
+ VALUE initpos;
741
+ long pos ;
742
+ int processed = 0;
743
+
744
+ if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
745
+ pos = NUM2LONG(initpos);
746
+ } else {
747
+ pos = 0;
748
+ }
749
+ if (pos < 0) {
750
+ pos += ICU_LEN(str);
751
+ }
752
+
753
+ if( CLASS_OF(sub) == rb_cUString) {
754
+ pos = icu_ustr_index(str, sub, pos);
755
+ processed = 1;
756
+ }
757
+ if( CLASS_OF(sub) == rb_cURegexp) {
758
+ pos = icu_reg_search(sub, str, pos, 0);
759
+ processed = 1;
760
+ }
761
+ if(! processed ) {
762
+ rb_raise(rb_eTypeError, "Wrong Type, expected UString or URegexp, got %s", rb_class2name(CLASS_OF(sub)));
763
+ }
764
+
765
+ if (pos == -1)
766
+ return Qnil;
767
+ return LONG2NUM(pos);
768
+ }
769
+
770
+ static long
771
+ icu_ustr_rindex(str, sub, pos)
772
+ VALUE str,
773
+ sub;
774
+ long pos;
775
+ {
776
+ long len = ICU_LEN(sub);
777
+ UChar *found;
778
+
779
+ /*
780
+ * substring longer than string
781
+ */
782
+ if (ICU_LEN(str) < len)
783
+ return -1;
784
+ if (ICU_LEN(str) - pos < len) {
785
+ pos = ICU_LEN(str) - len;
786
+ }
787
+ found = u_strFindLast(ICU_PTR(str), pos, ICU_PTR(sub), ICU_LEN(sub));
788
+ if (NULL == found)
789
+ return -1;
790
+ pos = found - (ICU_PTR(str));
791
+ return pos;
792
+ }
793
+
794
+
795
+ /**
796
+ * call-seq:
797
+ * str.rindex(substring [, fixnum]) => fixnum or nil
798
+ * str.rindex(regexp [, fixnum]) => fixnum or nil
799
+ *
800
+ * Returns the index of the last occurrence of the given <i>substring</i>,
801
+ * or pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
802
+ * found. If the second parameter is present, it specifies the position in the
803
+ * string to end the search---characters beyond this point will not be considered.
804
+ *
805
+ * "hello".u.rindex('e') #=> 1
806
+ * "hello".u.rindex('l') #=> 3
807
+ * "hello".u.rindex('a') #=> nil
808
+ * "hello".u.rindex(/[aeiou]/.U, -2) #=> 1
809
+ */
810
+
811
+ VALUE
812
+ icu_ustr_rindex_m(argc, argv, str)
813
+ int argc;
814
+ VALUE *argv;
815
+ VALUE str;
816
+ {
817
+ VALUE sub;
818
+ VALUE position;
819
+ long pos;
820
+
821
+ if (rb_scan_args(argc, argv, "11", &sub, &position) == 2) {
822
+ pos = NUM2LONG(position);
823
+ if (pos < 0) {
824
+ pos += ICU_LEN(str);
825
+ if (pos < 0) {
826
+ return Qnil;
827
+ }
828
+ }
829
+ if (pos > ICU_LEN(str))
830
+ pos = ICU_LEN(str);
831
+ } else {
832
+ pos = ICU_LEN(str);
833
+ }
834
+
835
+ switch (TYPE(sub)) {
836
+ case T_DATA:
837
+ if (CLASS_OF(sub) == rb_cUString) {
838
+ pos = icu_ustr_rindex(str, sub, pos);
839
+ if (pos >= 0)
840
+ return LONG2NUM(pos);
841
+ break;
842
+ }
843
+ if (CLASS_OF(sub) == rb_cURegexp) {
844
+ pos = icu_reg_search(sub, str, pos, 1);
845
+ if (pos >= 0)
846
+ return LONG2NUM(pos);
847
+ break;
848
+ }
849
+
850
+ default:
851
+ rb_raise(rb_eTypeError, "type mismatch: %s given",
852
+ rb_obj_classname(sub));
853
+ }
854
+ return Qnil;
855
+ }
856
+
857
+ /**
858
+ * call-seq:
859
+ * str.lstrip! => self or nil
860
+ *
861
+ * Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
862
+ * change was made. See also <code>UString#rstrip!</code> and
863
+ * <code>UString#strip!</code>, in all these methods whitespace is an
864
+ * Unicode char that has White_Space property.
865
+ *
866
+ * " hello ".u.lstrip #=> "hello "
867
+ * "hello".u.lstrip! #=> nil
868
+ */
869
+
870
+ VALUE
871
+ icu_ustr_lstrip_bang(str)
872
+ VALUE str;
873
+ {
874
+ UChar *s;
875
+ int32_t i,
876
+ n,
877
+ c;
878
+ icu_check_frozen(1, str);
879
+ s = ICU_PTR(str);
880
+ n = ICU_LEN(str);
881
+ if (!s || n == 0)
882
+ return Qnil;
883
+ /*
884
+ * remove spaces at head
885
+ */
886
+ i = 0;
887
+ U16_GET(s, 0, i, n, c); /* care about surrogates */
888
+ while (i < n && u_isUWhiteSpace(c)) {
889
+ U16_NEXT(s, i, n, c); /* care surr */
890
+ }
891
+
892
+ if (i > 0) {
893
+ if(! u_isUWhiteSpace(c)) --i;
894
+ ICU_LEN(str) = n - i;
895
+ u_memmove(ICU_PTR(str), s + i, ICU_LEN(str));
896
+ ICU_PTR(str)[ICU_LEN(str)] = 0;
897
+ return str;
898
+ }
899
+ return Qnil;
900
+ }
901
+
902
+
903
+ /**
904
+ * call-seq:
905
+ * str.lstrip => new_str
906
+ *
907
+ * Returns a copy of <i>str</i> with leading whitespace removed. See also
908
+ * <code>UString#rstrip</code> and <code>UString#strip</code>.
909
+ *
910
+ * " hello ".u.lstrip #=> "hello "
911
+ * "hello".u.lstrip #=> "hello"
912
+ */
913
+
914
+ VALUE
915
+ icu_ustr_lstrip(str)
916
+ VALUE str;
917
+ {
918
+ str = icu_ustr_dup(str);
919
+ icu_ustr_lstrip_bang(str);
920
+ return str;
921
+ }
922
+
923
+
924
+ /**
925
+ * call-seq:
926
+ * str.rstrip! => self or nil
927
+ *
928
+ * Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
929
+ * no change was made. See also <code>UString#lstrip!</code> and
930
+ * <code>UString#strip!</code>.
931
+ *
932
+ * " hello ".u.rstrip #=> " hello"
933
+ * "hello".u.rstrip! #=> nil
934
+ */
935
+
936
+ VALUE
937
+ icu_ustr_rstrip_bang(str)
938
+ VALUE str;
939
+ {
940
+ UChar *s;
941
+ int32_t i,
942
+ n,
943
+ c;
944
+
945
+ icu_check_frozen(1, str);
946
+ s = ICU_PTR(str);
947
+ n = ICU_LEN(str);
948
+
949
+ if (!s || n == 0)
950
+ return Qnil;
951
+ i = n - 1;
952
+
953
+ U16_GET(s, 0, n - 1, n, c); /* care surrogates */
954
+ i = n;
955
+ /*
956
+ * remove trailing spaces
957
+ */
958
+ while (i > 0 && u_isUWhiteSpace(c)) {
959
+ U16_PREV(s, 0, i, c); /* care surrogates */
960
+ }
961
+
962
+ if (i < n) {
963
+ if(! u_isUWhiteSpace(c)) ++i;
964
+ ICU_LEN(str) = i;
965
+ ICU_PTR(str)[i] = 0;
966
+ return str;
967
+ }
968
+ return Qnil;
969
+ }
970
+
971
+
972
+ /**
973
+ * call-seq:
974
+ * str.rstrip => new_str
975
+ *
976
+ * Returns a copy of <i>str</i> with trailing whitespace removed. See also
977
+ * <code>UString#lstrip</code> and <code>UString#strip</code>.
978
+ *
979
+ * " hello ".u.rstrip #=> " hello"
980
+ * "hello".u.rstrip #=> "hello"
981
+ */
982
+
983
+ VALUE
984
+ icu_ustr_rstrip(str)
985
+ VALUE str;
986
+ {
987
+ str = icu_ustr_dup(str);
988
+ icu_ustr_rstrip_bang(str);
989
+ return str;
990
+ }
991
+
992
+
993
+ /**
994
+ * call-seq:
995
+ * str.strip! => str or nil
996
+ *
997
+ * Removes leading and trailing whitespace from <i>str</i>. Returns
998
+ * <code>nil</code> if <i>str</i> was not altered.
999
+ */
1000
+
1001
+ VALUE
1002
+ icu_ustr_strip_bang(str)
1003
+ VALUE str;
1004
+ {
1005
+ VALUE l = icu_ustr_lstrip_bang(str);
1006
+ VALUE r = icu_ustr_rstrip_bang(str);
1007
+
1008
+ if (NIL_P(l) && NIL_P(r))
1009
+ return Qnil;
1010
+ return str;
1011
+ }
1012
+
1013
+
1014
+ /**
1015
+ * call-seq:
1016
+ * str.strip => new_str
1017
+ *
1018
+ * Returns a copy of <i>str</i> with leading and trailing whitespace removed.
1019
+ *
1020
+ * " hello ".u.strip #=> "hello"
1021
+ * "\tgoodbye\r\n".u.strip #=> "goodbye"
1022
+ */
1023
+
1024
+ VALUE
1025
+ icu_ustr_strip(str)
1026
+ VALUE str;
1027
+ {
1028
+ str = icu_ustr_dup(str);
1029
+ icu_ustr_strip_bang(str);
1030
+ return str;
1031
+ }
1032
+
1033
+
1034
+
1035
+ /* ----------------------------------- */
1036
+ VALUE
1037
+ icu_ustr_normalize(str, mode)
1038
+ VALUE str;
1039
+ int32_t mode;
1040
+ {
1041
+ UErrorCode error = U_ZERO_ERROR;
1042
+ long capa = ICU_LEN(str)+20;
1043
+ UChar *buf;
1044
+ long needed;
1045
+ VALUE ret;
1046
+ if (UNORM_YES == unorm_quickCheck(ICU_PTR(str), ICU_LEN(str), mode, &error))
1047
+ return icu_ustr_dup(str);
1048
+
1049
+ buf = ALLOC_N(UChar, capa );
1050
+ do {
1051
+ error = 0;
1052
+ needed =
1053
+ unorm_normalize(ICU_PTR(str), ICU_LEN(str), mode, 0, buf, capa,
1054
+ &error);
1055
+ if (U_SUCCESS(error)) {
1056
+ ret = icu_ustr_new_set(buf, needed, capa);
1057
+ return ret;
1058
+ }
1059
+ if (error == U_BUFFER_OVERFLOW_ERROR) {
1060
+ capa = needed + 1;
1061
+ REALLOC_N(buf, UChar, capa);
1062
+ if (!buf)
1063
+ rb_raise(rb_eRuntimeError, "can't allocate memory");
1064
+ } else
1065
+ rb_raise(rb_eArgError, u_errorName(error));
1066
+ }
1067
+ while (1);
1068
+ }
1069
+
1070
+ /**
1071
+ * UNORM_NFKC Compatibility decomposition followed by canonical
1072
+ * composition.
1073
+ */
1074
+ VALUE
1075
+ icu_ustr_normalize_KC(str)
1076
+ VALUE str;
1077
+ {
1078
+ return icu_ustr_normalize(str, UNORM_NFKC);
1079
+ }
1080
+
1081
+ /**
1082
+ * UNORM_NFKD Compatibility decomposition.
1083
+ */
1084
+ VALUE
1085
+ icu_ustr_normalize_KD(str)
1086
+ VALUE str;
1087
+ {
1088
+ return icu_ustr_normalize(str, UNORM_NFKD);
1089
+ }
1090
+
1091
+ /**
1092
+ * UNORM_NFD Canonical decomposition.
1093
+ */
1094
+ VALUE
1095
+ icu_ustr_normalize_D(str)
1096
+ VALUE str;
1097
+ {
1098
+ return icu_ustr_normalize(str, UNORM_NFD);
1099
+ }
1100
+
1101
+ /**
1102
+ * UNORM_FCD
1103
+ */
1104
+ VALUE
1105
+ icu_ustr_normalize_FCD(VALUE str)
1106
+ {
1107
+ return icu_ustr_normalize(str, UNORM_FCD);
1108
+ }
1109
+
1110
+ /**
1111
+ * UNORM_NFC Canonical decomposition followed by canonical composition.
1112
+ */
1113
+ VALUE
1114
+ icu_ustr_normalize_C(str)
1115
+ VALUE str;
1116
+ {
1117
+ return icu_ustr_normalize(str, UNORM_NFC);
1118
+ }
1119
+ VALUE my_ubrk_close(UBreakIterator ** boundary, VALUE errorinfo)
1120
+ {
1121
+ ubrk_close(*boundary);
1122
+ *boundary = NULL;
1123
+ rb_raise(rb_eRuntimeError, "Unhandled exception: %s", rb_obj_classname(errorinfo));
1124
+ return Qnil;
1125
+ }
1126
+
1127
+ /* UBRK_CHARACTER, UBRK_WORD, UBRK_LINE, UBRK_SENTENCE */
1128
+ VALUE
1129
+ icu_ustr_each_mode(argc, argv, str, mode)
1130
+ int argc;
1131
+ VALUE *argv;
1132
+ VALUE str;
1133
+ int32_t mode;
1134
+ {
1135
+ UErrorCode error = 0;
1136
+ UBreakIterator *boundary;
1137
+ int32_t end, start;
1138
+ VALUE loc ;
1139
+ VALUE temp;
1140
+ char *locale = "";
1141
+ if( rb_scan_args(argc, argv, "01", &loc) == 1) {
1142
+ Check_Type(loc, T_STRING);
1143
+ locale = RSTRING_PTR(loc);
1144
+ }
1145
+ boundary =
1146
+ ubrk_open(mode, locale, ICU_PTR(str), ICU_LEN(str),
1147
+ &error);
1148
+ if (U_FAILURE(error))
1149
+ rb_raise(rb_eArgError, "Error %s", u_errorName(error));
1150
+ start = ubrk_first(boundary);
1151
+ ++(USTRING(str)->busy);
1152
+ for (end = ubrk_next(boundary); end != UBRK_DONE; start = end, end = ubrk_next(boundary)) {
1153
+ temp = icu_ustr_new(ICU_PTR(str) + start, end - start);
1154
+ rb_rescue(rb_yield, (VALUE)temp, my_ubrk_close, (VALUE)&boundary);
1155
+ }
1156
+ --(USTRING(str)->busy);
1157
+ ubrk_close(boundary);
1158
+ return str;
1159
+ }
1160
+
1161
+ /**
1162
+ * call-seq:
1163
+ * str.each_word(locale = "") {|substr| block } => str
1164
+ *
1165
+ * Word boundary analysis is used by search and replace functions, as well as within text editing
1166
+ * applications that allow the user to select words with a double click. Word selection provides
1167
+ * correct interpretation of punctuation marks within and following words. Characters that are not
1168
+ * part of a word, such as symbols or punctuation marks, have word-breaks on both sides.
1169
+ *
1170
+ */
1171
+ VALUE
1172
+ icu_ustr_each_word(argc, argv, str)
1173
+ int argc;
1174
+ VALUE *argv;
1175
+ VALUE str;
1176
+
1177
+ {
1178
+ return icu_ustr_each_mode(argc, argv, str, UBRK_WORD);
1179
+ }
1180
+
1181
+ /**
1182
+ * call-seq:
1183
+ * str.each_char(locale = "") {|substr| block } => str
1184
+ *
1185
+ * Character boundary analysis allows users to interact with characters as they expect to,
1186
+ * for example, when moving the cursor through a text string. Character boundary analysis provides
1187
+ * correct navigation of through character strings, regardless of how the character is stored.
1188
+ * For example, an accented character might be stored as a base character and a diacritical mark.
1189
+ * What users consider to be a character can differ between languages.
1190
+ *
1191
+ */
1192
+ VALUE
1193
+ icu_ustr_each_char(argc, argv, str)
1194
+ int argc;
1195
+ VALUE *argv;
1196
+ VALUE str;
1197
+
1198
+ {
1199
+ return icu_ustr_each_mode(argc, argv, str, UBRK_CHARACTER);
1200
+ }
1201
+
1202
+ /**
1203
+ * call-seq:
1204
+ * str.each_line_break(locale = "") {|substr| block } => str
1205
+ *
1206
+ * Line boundary analysis determines where a text string can be broken when line-wrapping.
1207
+ * The mechanism correctly handles punctuation and hyphenated words.
1208
+ *
1209
+ */
1210
+ VALUE
1211
+ icu_ustr_each_line(argc, argv, str)
1212
+ int argc;
1213
+ VALUE *argv;
1214
+ VALUE str;
1215
+
1216
+ {
1217
+ return icu_ustr_each_mode(argc, argv, str, UBRK_LINE);
1218
+ }
1219
+
1220
+ /**
1221
+ * call-seq:
1222
+ * str.each_sentence(locale = "") {|substr| block } => str
1223
+ *
1224
+ * Sentence boundary analysis allows selection with correct interpretation of periods
1225
+ * within numbers and abbreviations, and trailing punctuation marks such as quotation marks and parentheses.
1226
+ *
1227
+ */
1228
+ VALUE
1229
+ icu_ustr_each_sentence(argc, argv, str)
1230
+ int argc;
1231
+ VALUE *argv;
1232
+ VALUE str;
1233
+ {
1234
+ return icu_ustr_each_mode(argc, argv, str, UBRK_SENTENCE);
1235
+ }
1236
+
1237
+ /**
1238
+ * call-seq:
1239
+ * str.to_u(encoding = 'utf8') => UString
1240
+ *
1241
+ * Returns self.
1242
+ */
1243
+ VALUE
1244
+ icu_ustr_to_ustr(argc, argv, str)
1245
+ int argc;
1246
+ VALUE *argv;
1247
+ VALUE str;
1248
+ {
1249
+ return str;
1250
+ }
1251
+
1252
+ /**
1253
+ * call-seq:
1254
+ * str.to_s(encoding = 'utf8') => String
1255
+ *
1256
+ * Converts to Ruby String (byte-oriented) value in given encoding.
1257
+ * When no encoding is given, assumes UTF-8.
1258
+ */
1259
+ VALUE
1260
+ icu_ustr_to_rstr(argc, argv, str)
1261
+ int argc;
1262
+ VALUE *argv,
1263
+ str;
1264
+ {
1265
+ VALUE enc;
1266
+ char *encoding = 0; /* default */
1267
+ UErrorCode error = 0;
1268
+ UConverter *conv ;
1269
+ int enclen, needed = 0;
1270
+ char * buf;
1271
+ VALUE s;
1272
+ if (rb_scan_args(argc, argv, "01", &enc) == 1) {
1273
+ Check_Type(enc, T_STRING);
1274
+ encoding = RSTRING_PTR(enc);
1275
+ }
1276
+
1277
+ enclen = ICU_LEN(str) + 1;
1278
+ buf = ALLOC_N(char, enclen);
1279
+
1280
+ if( !encoding || !strncmp(encoding, "utf8", 4)){
1281
+ u_strToUTF8( buf, enclen, &needed, ICU_PTR(str), ICU_LEN(str), &error);
1282
+ if (U_BUFFER_OVERFLOW_ERROR == error) {
1283
+ REALLOC_N(buf, char, needed + 1);
1284
+ error = 0;
1285
+ u_strToUTF8( buf, needed, &needed, ICU_PTR(str), ICU_LEN(str), &error);
1286
+ }
1287
+ if( U_FAILURE(error) ){
1288
+ free(buf);
1289
+ rb_raise(rb_eArgError, u_errorName(error));
1290
+ }
1291
+ s = rb_str_new(buf, needed);
1292
+
1293
+ } else {
1294
+ conv = ucnv_open(encoding, &error);
1295
+ if (U_FAILURE(error)) {
1296
+ ucnv_close(conv);
1297
+ free(buf);
1298
+ rb_raise(rb_eArgError, u_errorName(error));
1299
+ }
1300
+ enclen =
1301
+ ucnv_fromUChars(conv, buf, enclen, ICU_PTR(str), ICU_LEN(str),
1302
+ &error);
1303
+ if (U_BUFFER_OVERFLOW_ERROR == error) {
1304
+ REALLOC_N(buf, char, enclen + 1);
1305
+ error = 0;
1306
+ ucnv_fromUChars(conv, buf, enclen, ICU_PTR(str), ICU_LEN(str),
1307
+ &error);
1308
+ }
1309
+ if( U_FAILURE(error) ){
1310
+ free(buf);
1311
+ rb_raise(rb_eArgError, u_errorName(error));
1312
+ }
1313
+ s = rb_str_new(buf, enclen);
1314
+ ucnv_close(conv);
1315
+ }
1316
+ free(buf);
1317
+ return s;
1318
+ }
1319
+
1320
+ /* -------------- */
1321
+ extern VALUE icu_format(UChar * pattern, int32_t len, VALUE args,
1322
+ int32_t arg_len, char *locale);
1323
+ /**
1324
+ * call-seq:
1325
+ * str.format(locale, [*args])
1326
+ *
1327
+ * Powerful locale-sensitive message formatting. see [./docs/FORMATTING]
1328
+ *
1329
+ * Valid argument types are: +Fixnum+, +UString+, +Float+, +Time+ .
1330
+ *
1331
+ * */
1332
+ VALUE
1333
+ icu_ustr_format(str, args)
1334
+ VALUE str,
1335
+ args;
1336
+ {
1337
+ VALUE loc;
1338
+ Check_Type(args, T_ARRAY);
1339
+ loc = rb_ary_shift(args);
1340
+ Check_Type(loc, T_STRING);
1341
+ return icu_format(ICU_PTR(str), ICU_LEN(str), args, RARRAY_LEN(args),
1342
+ RSTRING_PTR(loc));
1343
+ }
1344
+
1345
+ /* ------ UString regexp related functions ---- */
1346
+
1347
+ /**
1348
+ * call-seq:
1349
+ * str =~ uregexp => UMatch or nil
1350
+ * str =~ other_str => integer or nil
1351
+ *
1352
+ * Match---If <code>URegexp</code> is given, use it as a pattern to
1353
+ * match against <i>uregexp</i> and return UMatch or +nil+.
1354
+ *
1355
+ * If <code>UString</code> is given, returns index of it
1356
+ * (similar to <code>UString#index</code>).
1357
+ *
1358
+ * Otherwise returns +nil+
1359
+ *
1360
+ * "cat o' 9 tails".u =~ '\d' #=> nil
1361
+ * "cat o' 9 tails".u =~ /\d/.U #=> #<UMatch:0xf6fb7d5c @cg=[<U000039>]>
1362
+ * "cat o' 9 tails".u =~ 9 #=> false
1363
+ * "cat o' 9 tails".u =~ '9'.u #=> 7
1364
+ */
1365
+
1366
+ VALUE
1367
+ icu_ustr_match(x, y)
1368
+ VALUE x,
1369
+ y;
1370
+ {
1371
+ long pos ;
1372
+ if (TYPE(y) == T_REGEXP){
1373
+ rb_raise(rb_eTypeError, "Wrong type: can't match against Regexp. Use URegexp instead");
1374
+ }
1375
+ if (CLASS_OF(y) == rb_cURegexp) {
1376
+ return icu_reg_match(y, x);
1377
+ } else if (CLASS_OF(y) == rb_cUString) {
1378
+ pos = icu_ustr_index(x, y, 0);
1379
+ if (pos == -1) return Qnil;
1380
+ else return LONG2NUM(pos);
1381
+ } else {
1382
+ return Qnil;
1383
+ }
1384
+ }
1385
+
1386
+ VALUE
1387
+ get_pat(pat, quote)
1388
+ VALUE pat;
1389
+ int quote;
1390
+ {
1391
+ if (CLASS_OF(pat) == rb_cURegexp)
1392
+ return pat;
1393
+
1394
+ if (CLASS_OF(pat) == rb_cUString)
1395
+ return icu_reg_comp(pat);
1396
+ Check_Class(pat, rb_cURegexp);
1397
+ return Qnil;
1398
+ }
1399
+
1400
+
1401
+ /**
1402
+ * call-seq:
1403
+ * str.match(pattern) => matchdata or nil
1404
+ *
1405
+ * Converts <i>pattern</i> to a <code>URegexp</code> (if it isn't already one),
1406
+ * then invokes its <code>match</code> method on <i>str</i>.
1407
+ *
1408
+ * 'hello'.u.match('(.)\1'.u) #=> #<UMatch:0x401b3d30>
1409
+ * 'hello'.u.match('(.)\1'.u)[0] #=> "ll"
1410
+ * 'hello'.u.match(/(.)\1/.U)[0] #=> "ll"
1411
+ * 'hello'.u.match('xx') #=> nil
1412
+ */
1413
+
1414
+ VALUE
1415
+ icu_ustr_match_m(str, re)
1416
+ VALUE str,
1417
+ re;
1418
+ {
1419
+ return rb_funcall(get_pat(re, 0), rb_intern("match"), 1, str);
1420
+ }
1421
+
1422
+ VALUE
1423
+ ustr_scan_once(str, pat, start)
1424
+ VALUE str,
1425
+ pat;
1426
+ long *start;
1427
+ {
1428
+ VALUE result;
1429
+ long i;
1430
+ long beg,
1431
+ end, num_regs;
1432
+
1433
+ if (icu_reg_search(pat, str, *start, 0) >= 0) {
1434
+ icu_reg_range(pat, 0, &beg, &end);
1435
+ if (beg == end) {
1436
+ *start = end + 1;
1437
+ } else {
1438
+ *start = end;
1439
+ }
1440
+ num_regs = icu_group_count(pat);
1441
+ if (num_regs <= 1) {
1442
+ return icu_reg_nth_match(pat, 0);
1443
+ }
1444
+ result = rb_ary_new2(num_regs);
1445
+ for (i = 1; i <= num_regs; i++) {
1446
+ rb_ary_store(result, i - 1, icu_reg_nth_match(pat, i));
1447
+ }
1448
+
1449
+ return result;
1450
+ }
1451
+ return Qnil;
1452
+ }
1453
+
1454
+
1455
+ /**
1456
+ * call-seq:
1457
+ * str.scan(pattern) => array
1458
+ * str.scan(pattern) {|match, ...| block } => str
1459
+ *
1460
+ * Both forms iterate through <i>str</i>, matching the pattern (which may be a
1461
+ * <code>URegexp</code> or a <code>UString</code>). For each match, a result is
1462
+ * generated and either added to the result array or passed to the block. If
1463
+ * the pattern contains no groups, each individual result consists of the
1464
+ * matched string. If the pattern contains groups, each
1465
+ * individual result is itself an array containing one entry per group.
1466
+ *
1467
+ * a = "cruel world".u
1468
+ * a.scan(/\w+/.U) #=> ["cruel", "world"]
1469
+ * a.scan(/.../.U) #=> ["cru", "el ", "wor"]
1470
+ * a.scan(/(...)/.U) #=> [["cru"], ["el "], ["wor"]]
1471
+ * a.scan(/(..)(..)/.U) #=> [["cr", "ue"], ["l ", "wo"]]
1472
+ *
1473
+ * And the block form:
1474
+ *
1475
+ * a.scan(/\w+/.U) {|w| print "<<#{w}>> " }
1476
+ * print "\n"
1477
+ * a.scan(/(.)(.)/.U) {|a,b| print b, a }
1478
+ * print "\n"
1479
+ *
1480
+ * <em>produces:</em>
1481
+ *
1482
+ * <<cruel>> <<world>>
1483
+ * rceu lowlr
1484
+ */
1485
+
1486
+ VALUE
1487
+ icu_ustr_scan(str, pat)
1488
+ VALUE str,
1489
+ pat;
1490
+ {
1491
+ VALUE result;
1492
+ long start = 0;
1493
+
1494
+ pat = get_pat(pat, 1);
1495
+ if (!rb_block_given_p()) {
1496
+ VALUE ary = rb_ary_new();
1497
+
1498
+ while (!NIL_P(result = ustr_scan_once(str, pat, &start))) {
1499
+ rb_ary_push(ary, result);
1500
+ }
1501
+ return ary;
1502
+ }
1503
+ ++(USTRING(str)->busy);
1504
+ while (!NIL_P(result = ustr_scan_once(str, pat, &start))) {
1505
+ rb_yield(result);
1506
+ }
1507
+ --(USTRING(str)->busy);
1508
+ return str;
1509
+ }
1510
+ /**
1511
+ * call-seq:
1512
+ * str.char_span(start[, len, [locale]])
1513
+ *
1514
+ * Returns substring starting at <code>start</code>-th char, with <code>len</code> chars length.
1515
+ * Here "char" means "grapheme cluster", so start index and len are measured in terms of "graphemes"
1516
+ * locale parameter is optional.
1517
+ * Negative len can be supplied to receive to end of string.
1518
+ *
1519
+ * String is transformed to NFC before extract.
1520
+ */
1521
+ VALUE
1522
+ icu_ustr_char_span(int argc, VALUE * argv, VALUE str)
1523
+ {
1524
+ UErrorCode error = 0;
1525
+ int32_t end, start, char_start = 0, char_len = -1, total_chars = 0;
1526
+ int32_t init_pos = -1, end_pos = -1, n;
1527
+ char *loc = NULL;
1528
+ VALUE cs, clen, locl, out;
1529
+ UBreakIterator *boundary;
1530
+
1531
+ n = rb_scan_args(argc, argv, "12", &cs, &clen, &locl);
1532
+ Check_Type(cs, T_FIXNUM);
1533
+ char_start = FIX2INT(cs);
1534
+ if(char_start < 0) rb_raise(rb_eArgError, "Negative offset aren't allowed!");
1535
+
1536
+ if( n > 1) {
1537
+ Check_Type(clen, T_FIXNUM);
1538
+ char_len = FIX2INT(clen);
1539
+ if(char_len <= 0) char_len = -1;
1540
+ }
1541
+ if( n > 2) {
1542
+ Check_Type(locl, T_STRING);
1543
+ loc = RSTRING_PTR(locl);
1544
+ }
1545
+ if(UNORM_YES != unorm_quickCheck(ICU_PTR(str), ICU_LEN(str), UNORM_NFC, &error) )
1546
+ str = icu_ustr_normalize_C(str);
1547
+
1548
+ boundary =
1549
+ ubrk_open(UBRK_CHARACTER, loc, ICU_PTR(str), ICU_LEN(str), &error);
1550
+ if (U_FAILURE(error))
1551
+ rb_raise(rb_eArgError, "Error %s", u_errorName(error));
1552
+
1553
+ start = ubrk_first(boundary);
1554
+ for (end = ubrk_next(boundary); end != UBRK_DONE;
1555
+ start = end, end = ubrk_next(boundary)) {
1556
+ if( total_chars == char_start ) init_pos = start;
1557
+ total_chars ++;
1558
+ if( char_len>0 && total_chars == char_start+char_len) end_pos = end;
1559
+ }
1560
+ ubrk_close(boundary);
1561
+ if( init_pos == -1) rb_raise(rb_eArgError, "Char index %d out of bounds %d", char_start, total_chars);
1562
+ if( end_pos == -1) end_pos = ICU_LEN(str); /* reached end of string */
1563
+ out = icu_ustr_new(ICU_PTR(str)+init_pos, end_pos - init_pos);
1564
+ return out;
1565
+ }
1566
+
1567
+ VALUE
1568
+ icu_ustr_chars(str, loc)
1569
+ VALUE str;
1570
+ char *loc;
1571
+ {
1572
+ UErrorCode error = 0;
1573
+ int32_t end, start;
1574
+ VALUE out;
1575
+ UBreakIterator *boundary;
1576
+ if(UNORM_YES != unorm_quickCheck(ICU_PTR(str), ICU_LEN(str), UNORM_NFC, &error) )
1577
+ str = icu_ustr_normalize_C(str);
1578
+
1579
+ boundary =
1580
+ ubrk_open(UBRK_CHARACTER, loc, ICU_PTR(str), ICU_LEN(str), &error);
1581
+ if (U_FAILURE(error))
1582
+ rb_raise(rb_eArgError, "Error %s", u_errorName(error));
1583
+
1584
+ out = rb_ary_new();
1585
+ start = ubrk_first(boundary);
1586
+ for (end = ubrk_next(boundary); end != UBRK_DONE;
1587
+ start = end, end = ubrk_next(boundary)) {
1588
+ rb_ary_push(out, icu_ustr_new(ICU_PTR(str) + start, end - start));
1589
+ }
1590
+ ubrk_close(boundary);
1591
+ return out;
1592
+ }
1593
+
1594
+ /**
1595
+ * call-seq:
1596
+ * str.chars(locale = "") => array of character
1597
+ *
1598
+ * Returns array of character graphemes, locale dependent.
1599
+ * String is transformed to NFC before split.
1600
+ * */
1601
+ VALUE
1602
+ icu_ustr_chars_m(argc, argv, str)
1603
+ int argc;
1604
+ VALUE *argv;
1605
+ VALUE str;
1606
+ {
1607
+ VALUE locale;
1608
+ if (rb_scan_args(argc, argv, "01", &locale) == 1) {
1609
+ Check_Type(locale, T_STRING);
1610
+ return icu_ustr_chars(str, RSTRING_PTR(locale));
1611
+ } else {
1612
+ return icu_ustr_chars(str, "");
1613
+ }
1614
+ }
1615
+
1616
+ /**
1617
+ * call-seq:
1618
+ * str.split(pattern, [limit]) => anArray
1619
+ *
1620
+ * Divides <i>str</i> into substrings based on a delimiter, returning an array
1621
+ * of these substrings. <i>str</i> is divided where the
1622
+ * pattern matches.
1623
+ *
1624
+ * NOTE: split(//) or split("") is not supported.
1625
+ * To get array of chars use #chars or #codepoints methods
1626
+ *
1627
+ * If the <i>limit</i> parameter is omitted, trailing null fields are
1628
+ * suppressed. If <i>limit</i> is a positive number, at most that number of
1629
+ * fields will be returned (if <i>limit</i> is <code>1</code>, the entire
1630
+ * string is returned as the only entry in an array). If negative, there is no
1631
+ * limit to the number of fields returned, and trailing null fields are not
1632
+ * suppressed.
1633
+ *
1634
+ * NOTE: there's a difference in ICU regexp split and Ruby Regexp actions:
1635
+ * "a,b,c,,".split(/,/, -1) # => ["a", "b", "c", "", ""]
1636
+ * "a,b,c,,".u.split(ure(","), -1) # => ["a", "b", "c", ""]
1637
+ * it seems to be by design, in icu/source/i18n/uregex.cpp uregex_split():
1638
+ * if (nextOutputStringStart == inputLen) {
1639
+ * // The delimiter was at the end of the string. We're done.
1640
+ * break;
1641
+ * }
1642
+ */
1643
+
1644
+ VALUE
1645
+ icu_ustr_split_m(argc, argv, str)
1646
+ int argc;
1647
+ VALUE *argv;
1648
+ VALUE str;
1649
+ {
1650
+ VALUE spat;
1651
+ VALUE limit = Qnil;
1652
+ int lim = 0;
1653
+ VALUE result;
1654
+
1655
+ if (rb_scan_args(argc, argv, "11", &spat, &limit) == 2) {
1656
+ lim = NUM2INT(limit);
1657
+ if (lim <= 0)
1658
+ limit = Qnil;
1659
+ }
1660
+ if (CLASS_OF(spat) == rb_cURegexp) {
1661
+ result = icu_reg_split(spat, str, limit);
1662
+ } else {
1663
+ if (CLASS_OF(spat) == rb_cUString) {
1664
+ result = icu_reg_split(icu_reg_comp(spat), str, limit);
1665
+ } else {
1666
+ rb_raise(rb_eArgError, "Expected UString or URegexp, got %s",
1667
+ rb_class2name(CLASS_OF(spat)));
1668
+ }
1669
+ }
1670
+ if (NIL_P(limit) && lim == 0) {
1671
+ while (RARRAY_LEN(result) > 0 &&
1672
+ ICU_LEN( (RARRAY_PTR(result)[RARRAY_LEN(result) - 1])) == 0)
1673
+ rb_ary_pop(result);
1674
+ }
1675
+
1676
+ return result;
1677
+ }
1678
+
1679
+ /**
1680
+ * call-seq:
1681
+ * str.inspect => String
1682
+ *
1683
+ * Shows codepoints in form of \uxxxx. For debug purposes.
1684
+ */
1685
+ VALUE
1686
+ icu_ustr_inspect(str)
1687
+ VALUE str;
1688
+ {
1689
+ VALUE buf = rb_str_new2("");
1690
+ char temp[] = "\\u0010FFFF ";
1691
+ int32_t i,
1692
+ n,
1693
+ k,
1694
+ c;
1695
+ UChar *s = ICU_PTR(str);
1696
+ n = ICU_LEN(str);
1697
+ i = 0;
1698
+ while (i < n) {
1699
+ U16_NEXT(s, i, n, c); /* care surrogate */
1700
+ if(c >= 0x10000)
1701
+ k = sprintf(temp, "\\u%08X", c);
1702
+ else
1703
+ k = sprintf(temp, "\\u%04X", c);
1704
+ rb_str_cat(buf, temp, k);
1705
+ }
1706
+ return buf;
1707
+ }
1708
+
1709
+ /**
1710
+ * call-seq:
1711
+ * str.codepoints => array of fixnums
1712
+ *
1713
+ * Returns array of codepoints as fixnums.
1714
+ */
1715
+ VALUE
1716
+ icu_ustr_points(str)
1717
+ VALUE str;
1718
+ {
1719
+ VALUE buf = rb_ary_new();
1720
+ int32_t i,
1721
+ n,
1722
+ c;
1723
+ UChar *s = ICU_PTR(str);
1724
+ n = ICU_LEN(str);
1725
+ i = 0;
1726
+ while (i < n) {
1727
+ U16_NEXT(s, i, n, c); /* care surrogates */
1728
+ rb_ary_push(buf, LONG2NUM(c));
1729
+ }
1730
+ return buf;
1731
+ }
1732
+
1733
+
1734
+ /**
1735
+ * call-seq:
1736
+ * str.inspect_names => String
1737
+ *
1738
+ * Dumps names of codepoints in this UString (debug).
1739
+ */
1740
+ VALUE
1741
+ icu_ustr_inspect_names(str)
1742
+ VALUE str;
1743
+ {
1744
+ VALUE buf = rb_str_new2("");
1745
+ char temp[301];
1746
+ UErrorCode error;
1747
+ int32_t i,
1748
+ n,
1749
+ c,
1750
+ l;
1751
+ UChar *s = ICU_PTR(str);
1752
+ n = ICU_LEN(str);
1753
+ i = 0;
1754
+ while (i < n) {
1755
+ U16_NEXT(s, i, n, c) sprintf(temp, "<U%06X>", c); /* care surrogates */
1756
+ rb_str_cat(buf, temp, 9);
1757
+ error = 0;
1758
+ l = u_charName(c, U_UNICODE_CHAR_NAME, temp, 300, &error);
1759
+ rb_str_cat(buf, temp, l);
1760
+ rb_str_cat(buf, "\n", 1);
1761
+ }
1762
+ return buf;
1763
+ }
1764
+
1765
+ VALUE
1766
+ icu_ustr_subpat(str, re, nth)
1767
+ VALUE str,
1768
+ re;
1769
+ int nth;
1770
+ {
1771
+ if (icu_reg_search(re, str, 0, 0) >= 0) {
1772
+ return icu_reg_nth_match(re, nth);
1773
+ }
1774
+ return Qnil;
1775
+ }
1776
+
1777
+ /* beg len are code unit indexes*/
1778
+ VALUE
1779
+ icu_ustr_substr(str, beg, len)
1780
+ VALUE str;
1781
+ long beg,
1782
+ len;
1783
+ {
1784
+ int32_t str_size;
1785
+ str_size = ICU_LEN(str);
1786
+ if (len < 0) return Qnil;
1787
+
1788
+ if (beg > str_size) return Qnil;
1789
+ if (beg < 0) {
1790
+ beg += str_size;
1791
+ if (beg < 0) return Qnil;
1792
+ }
1793
+ if (beg + len > str_size) {
1794
+ len = str_size - beg;
1795
+ }
1796
+ if (len < 0) {
1797
+ len = 0;
1798
+ }
1799
+ if( len == 0) return icu_ustr_new(0, 0);
1800
+ /* adjust to codepoint boundaries */
1801
+ U16_SET_CP_START(ICU_PTR(str), 0, beg);
1802
+ U16_SET_CP_LIMIT(ICU_PTR(str), 0, len, ICU_LEN(str));
1803
+ return icu_ustr_new(ICU_PTR(str) + beg, len);
1804
+ }
1805
+
1806
+ VALUE
1807
+ icu_ustr_aref(str, indx)
1808
+ VALUE str;
1809
+ VALUE indx;
1810
+ {
1811
+ long idx;
1812
+ int32_t cp_len = ICU_LEN(str);
1813
+
1814
+ switch (TYPE(indx)) {
1815
+ case T_FIXNUM:
1816
+ idx = FIX2LONG(indx);
1817
+
1818
+ num_index:
1819
+ if (idx < 0) {
1820
+ idx = cp_len + idx;
1821
+ }
1822
+ if (idx < 0 || cp_len <= idx) {
1823
+ return Qnil;
1824
+ }
1825
+ return icu_ustr_substr(str, idx, 1);
1826
+
1827
+ case T_DATA:
1828
+ if (CLASS_OF(indx) == rb_cURegexp)
1829
+ return icu_ustr_subpat(str, indx, 0);
1830
+ if (CLASS_OF(indx) == rb_cUString) {
1831
+ if (icu_ustr_index(str, indx, 0) != -1)
1832
+ return icu_ustr_dup(indx);
1833
+ return Qnil;
1834
+ }
1835
+
1836
+ default:
1837
+ /*
1838
+ * check if indx is Range
1839
+ */
1840
+ {
1841
+ long beg,
1842
+ len;
1843
+ switch (rb_range_beg_len(indx, &beg, &len, cp_len, 0)) {
1844
+ case Qfalse:
1845
+ break;
1846
+ case Qnil:
1847
+ return Qnil;
1848
+ default:
1849
+ return icu_ustr_substr(str, beg, len);
1850
+ }
1851
+ }
1852
+ idx = NUM2LONG(indx);
1853
+ goto num_index;
1854
+ }
1855
+ return Qnil; /* not reached */
1856
+ }
1857
+
1858
+ /**
1859
+ * call-seq:
1860
+ * str[fixnum] => new_str or nil
1861
+ * str[fixnum, fixnum] => new_str or nil
1862
+ * str[range] => new_str or nil
1863
+ * str[regexp] => new_str or nil
1864
+ * str[regexp, fixnum] => new_str or nil
1865
+ * str[other_str] => new_str or nil
1866
+ * str.slice(fixnum) => new_str or nil
1867
+ * str.slice(fixnum, fixnum) => new_str or nil
1868
+ * str.slice(range) => new_str or nil
1869
+ * str.slice(regexp) => new_str or nil
1870
+ * str.slice(regexp, fixnum) => new_str or nil
1871
+ * str.slice(other_str) => new_str or nil
1872
+ *
1873
+ * Element Reference---If passed a single <code>Fixnum</code>, returns
1874
+ * substring with the character at that position. If passed two <code>Fixnum</code>
1875
+ * objects, returns a substring starting at the offset given by the first, and
1876
+ * a length given by the second. If given a range, a substring containing
1877
+ * characters at offsets given by the range is returned. In all three cases, if
1878
+ * an offset is negative, it is counted from the end of <i>str</i>. Returns
1879
+ * <code>nil</code> if the initial offset falls outside the string, the length
1880
+ * is negative, or the beginning of the range is greater than the end.
1881
+ *
1882
+ * If a <code>URegexp</code> is supplied, the matching portion of <i>str</i> is
1883
+ * returned. If a numeric parameter follows the regular expression, that
1884
+ * component of the <code>UMatch</code> is returned instead. If a
1885
+ * <code>UString</code> is given, that string is returned if it occurs in
1886
+ * <i>str</i>. In both cases, <code>nil</code> is returned if there is no
1887
+ * match.
1888
+ *
1889
+ * a = "hello there".u
1890
+ * a[1] #=> 'e'
1891
+ * a[1,3] #=> "ell"
1892
+ * a[1..3] #=> "ell"
1893
+ * a[-3,2] #=> "er"
1894
+ * a[-4..-2] #=> "her"
1895
+ * a[12..-1] #=> nil
1896
+ * a[-2..-4] #=> ""
1897
+ * a[/[aeiou](.)\1/.U] #=> "ell"
1898
+ * a[/[aeiou](.)\1/.U, 0] #=> "ell"
1899
+ * a[/[aeiou](.)\1/.U, 1] #=> "l"
1900
+ * a[/[aeiou](.)\1/.U, 2] #=> nil
1901
+ * a["lo".u] #=> "lo"
1902
+ * a["bye".u] #=> nil
1903
+ */
1904
+
1905
+ VALUE
1906
+ icu_ustr_aref_m(argc, argv, str)
1907
+ int argc;
1908
+ VALUE *argv;
1909
+ VALUE str;
1910
+ {
1911
+ if (argc == 2) {
1912
+ if (CLASS_OF(argv[0]) == rb_cURegexp) {
1913
+ return icu_ustr_subpat(str, argv[0], NUM2INT(argv[1]));
1914
+ }
1915
+ return icu_ustr_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
1916
+ }
1917
+ if (argc != 1) {
1918
+ rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)",
1919
+ argc);
1920
+ }
1921
+ return icu_ustr_aref(str, argv[0]);
1922
+ }
1923
+
1924
+ /**
1925
+ * call-seq:
1926
+ * str.sub!(pattern, replacement) => str or nil
1927
+ * str.sub!(pattern) {|match| block } => str or nil
1928
+ *
1929
+ * Performs the substitutions of <code>UString#sub</code> in place,
1930
+ * returning <i>str</i>, or <code>nil</code> if no substitutions were
1931
+ * performed.
1932
+ */
1933
+
1934
+ VALUE
1935
+ icu_ustr_sub_bang(argc, argv, str)
1936
+ int argc;
1937
+ VALUE *argv;
1938
+ VALUE str;
1939
+ {
1940
+ return ustr_gsub(argc, argv, str, 1, 1 );
1941
+ }
1942
+
1943
+
1944
+ /**
1945
+ * call-seq:
1946
+ * str.sub(pattern, replacement) => new_str
1947
+ * str.sub(pattern) {|match| block } => new_str
1948
+ *
1949
+ * Returns a copy of <i>str</i> with the <em>first</em> occurrence of
1950
+ * <i>pattern</i> replaced with either <i>replacement</i> or the value of the
1951
+ * block. The <i>pattern</i> will typically be a <code>URegexp</code>; if it is
1952
+ * a <code>UString</code> then no regular expression metacharacters will be
1953
+ * interpreted (that is <code>/\d/.U</code> will match a digit, but
1954
+ * <code>'\d'</code> will match a backslash followed by a 'd').
1955
+ *
1956
+ * The sequences <code>$1</code>, <code>$2</code>, etc., may be used.
1957
+ *
1958
+ * In the block form, the current UMatch object is passed in as a parameter.
1959
+ * The value returned by the block will be substituted for the match on each call.
1960
+ *
1961
+ * "hello".u.sub(/[aeiou]/.U, '*'.u) #=> "h*llo"
1962
+ * "hello".u.sub(/([aeiou])/.U, '<$1>'.u) #=> "h<e>llo"
1963
+ */
1964
+
1965
+ VALUE
1966
+ icu_ustr_sub(argc, argv, str)
1967
+ int argc;
1968
+ VALUE *argv;
1969
+ VALUE str;
1970
+ {
1971
+ str = icu_ustr_dup(str);
1972
+ icu_ustr_sub_bang(argc, argv, str);
1973
+ return str;
1974
+ }
1975
+
1976
+ /**
1977
+ * replace in string from +beg+ length +len+ (in code units)
1978
+ */
1979
+ static void
1980
+ icu_ustr_splice(str, beg, len, val)
1981
+ VALUE str;
1982
+ long beg,
1983
+ len;
1984
+ VALUE val;
1985
+ {
1986
+ long char_len;
1987
+ Check_Class(val, rb_cUString);
1988
+ if (val == str) {
1989
+ val = icu_ustr_dup(str);
1990
+ }
1991
+ if (len < 0)
1992
+ rb_raise(rb_eIndexError, "negative length %ld", len);
1993
+ char_len = ICU_LEN(str);
1994
+
1995
+ if (char_len < beg) {
1996
+ out_of_range:
1997
+ rb_raise(rb_eIndexError, "index %ld out of string", beg);
1998
+ }
1999
+ if (beg < 0) {
2000
+ if (-beg > char_len) {
2001
+ goto out_of_range;
2002
+ }
2003
+ beg += char_len;
2004
+ }
2005
+ if (char_len < beg + len) {
2006
+ len = char_len - beg;
2007
+ }
2008
+ /* adjust to codepoint boundaries */
2009
+ U16_SET_CP_START(ICU_PTR(str), 0, beg);
2010
+ U16_SET_CP_LIMIT(ICU_PTR(str), 0, len, ICU_LEN(str));
2011
+
2012
+ ustr_splice_units(USTRING(str), beg, len, ICU_PTR(val), ICU_LEN(val));
2013
+ OBJ_INFECT(str, val);
2014
+ }
2015
+
2016
+
2017
+ /**
2018
+ * call-seq:
2019
+ * str.insert(index, other_str) => str
2020
+ *
2021
+ * Inserts <i>other_str</i> before the character at the given
2022
+ * <i>index</i>, modifying <i>str</i>. Negative indices count from the
2023
+ * end of the string, and insert <em>after</em> the given character.
2024
+ * The intent is insert <i>other_str</i> so that it starts at the given
2025
+ * <i>index</i>.
2026
+ *
2027
+ * "abcd".u.insert(0, 'X'.u) #=> "Xabcd"
2028
+ * "abcd".u.insert(3, 'X'.u) #=> "abcXd"
2029
+ * "abcd".u.insert(4, 'X'.u) #=> "abcdX"
2030
+ * "abcd".u.insert(-3, 'X'.u) #=> "abXcd"
2031
+ * "abcd".u.insert(-1, 'X'.u) #=> "abcdX"
2032
+ */
2033
+
2034
+ VALUE
2035
+ icu_ustr_insert(str, idx, str2)
2036
+ VALUE str,
2037
+ idx,
2038
+ str2;
2039
+ {
2040
+ long pos = NUM2LONG(idx);
2041
+ icu_check_frozen(1, str);
2042
+
2043
+ if (pos == -1) {
2044
+ pos = NUM2LONG(icu_ustr_length(str));
2045
+ } else if (pos < 0) {
2046
+ pos++;
2047
+ }
2048
+
2049
+ icu_ustr_splice(str, pos, 0, str2);
2050
+ return str;
2051
+ }
2052
+
2053
+ /**
2054
+ * call-seq:
2055
+ * str.include? other_str => true or false
2056
+ *
2057
+ * Returns <code>true</code> if <i>str</i> contains the given string
2058
+ *
2059
+ * "hello".u.include? "lo".u #=> true
2060
+ * "hello".u.include? "ol".u #=> false
2061
+ */
2062
+
2063
+ VALUE
2064
+ icu_ustr_include(str, arg)
2065
+ VALUE str,
2066
+ arg;
2067
+ {
2068
+ long i;
2069
+ i = icu_ustr_index(str, arg, 0);
2070
+ if (i == -1)
2071
+ return Qfalse;
2072
+ return Qtrue;
2073
+ }
2074
+
2075
+ static void
2076
+ icu_ustr_subpat_set(str, re, nth, val)
2077
+ VALUE str,
2078
+ re;
2079
+ int nth;
2080
+ VALUE val;
2081
+ {
2082
+ long start,
2083
+ end,
2084
+ len;
2085
+ VALUE matched;
2086
+
2087
+ if (icu_reg_search(re, str, 0, 0) < 0) {
2088
+ rb_raise(rb_eIndexError, "regexp not matched");
2089
+ }
2090
+ matched = icu_reg_range(re, nth, &start, &end);
2091
+ if (NIL_P(matched)) {
2092
+ rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
2093
+ }
2094
+ len = end - start;
2095
+ /* adjust to codepoint boundaries */
2096
+ U16_SET_CP_START(ICU_PTR(str), 0, start);
2097
+ U16_SET_CP_LIMIT(ICU_PTR(str), 0, len, ICU_LEN(str));
2098
+
2099
+ ustr_splice_units(USTRING(str), start, len, ICU_PTR(val), ICU_LEN(val));
2100
+ }
2101
+
2102
+ VALUE
2103
+ icu_ustr_aset(str, indx, val)
2104
+ VALUE str;
2105
+ VALUE indx,
2106
+ val;
2107
+ {
2108
+ long idx,
2109
+ beg;
2110
+ long char_len = ICU_LEN(str);
2111
+
2112
+ switch (TYPE(indx)) {
2113
+ case T_FIXNUM:
2114
+ num_index:
2115
+ idx = FIX2LONG(indx);
2116
+ if (char_len <= idx) {
2117
+ out_of_range:
2118
+ rb_raise(rb_eIndexError, "index %ld out of string", idx);
2119
+ }
2120
+ if (idx < 0) {
2121
+ if (-idx > char_len)
2122
+ goto out_of_range;
2123
+ idx += char_len;
2124
+ }
2125
+ icu_ustr_splice(str, idx, 1, val);
2126
+ return val;
2127
+
2128
+ case T_DATA:
2129
+ if (CLASS_OF(indx) == rb_cURegexp) {
2130
+ icu_ustr_subpat_set(str, indx, 0, val);
2131
+ return val;
2132
+ }
2133
+ if (CLASS_OF(indx) == rb_cUString) {
2134
+ beg = icu_ustr_index(str, indx, 0);
2135
+ if (beg < 0) {
2136
+ rb_raise(rb_eIndexError, "string not matched");
2137
+ }
2138
+ ustr_splice_units(USTRING(str), beg, ICU_LEN(indx), ICU_PTR(val), ICU_LEN(val));
2139
+ return val;
2140
+ }
2141
+ default:
2142
+ /*
2143
+ * check if indx is Range
2144
+ */
2145
+ {
2146
+ long beg,
2147
+ len;
2148
+ if (rb_range_beg_len(indx, &beg, &len, char_len, 2)) {
2149
+ icu_ustr_splice(str, beg, len, val);
2150
+ return val;
2151
+ }
2152
+ }
2153
+ idx = NUM2LONG(indx);
2154
+ goto num_index;
2155
+ }
2156
+ }
2157
+
2158
+
2159
+ /**
2160
+ * call-seq:
2161
+ * str[fixnum] = new_str
2162
+ * str[fixnum, fixnum] = new_str
2163
+ * str[range] = new_str
2164
+ * str[regexp] = new_str
2165
+ * str[regexp, fixnum] = new_str
2166
+ * str[other_str] = new_str
2167
+ *
2168
+ * Element Assignment---Replaces some or all of the content of <i>str</i>. The
2169
+ * portion of the string affected is determined using the same criteria as
2170
+ * <code>UString#[]</code>. If the replacement string is not the same length as
2171
+ * the text it is replacing, the string will be adjusted accordingly. If the
2172
+ * regular expression or string is used as the index doesn't match a position
2173
+ * in the string, <code>IndexError</code> is raised. If the regular expression
2174
+ * form is used, the optional second <code>Fixnum</code> allows you to specify
2175
+ * which portion of the match to replace (effectively using the
2176
+ * <code>UMatch</code> indexing rules. The forms that take a
2177
+ * <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
2178
+ * out of range; the <code>Range</code> form will raise a
2179
+ * <code>RangeError</code>, and the <code>URegexp</code> and <code>UString</code>
2180
+ * forms will silently ignore the assignment.
2181
+ */
2182
+
2183
+ VALUE
2184
+ icu_ustr_aset_m(argc, argv, str)
2185
+ int argc;
2186
+ VALUE *argv;
2187
+ VALUE str;
2188
+ {
2189
+ icu_check_frozen(1, str);
2190
+ if (argc == 3) {
2191
+ if (CLASS_OF(argv[0]) == rb_cURegexp) {
2192
+ icu_ustr_subpat_set(str, argv[0], NUM2INT(argv[1]), argv[2]);
2193
+ } else {
2194
+ icu_ustr_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]),
2195
+ argv[2]);
2196
+ }
2197
+ return argv[2];
2198
+ }
2199
+ if (argc != 2) {
2200
+ rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)",
2201
+ argc);
2202
+ }
2203
+ return icu_ustr_aset(str, argv[0], argv[1]);
2204
+ }
2205
+
2206
+ /**
2207
+ * call-seq:
2208
+ * str.slice!(fixnum) => new_str or nil
2209
+ * str.slice!(fixnum, fixnum) => new_str or nil
2210
+ * str.slice!(range) => new_str or nil
2211
+ * str.slice!(regexp) => new_str or nil
2212
+ * str.slice!(other_str) => new_str or nil
2213
+ *
2214
+ * Deletes the specified portion from <i>str</i>, and returns the portion
2215
+ * deleted. The forms that take a <code>Fixnum</code> will raise an
2216
+ * <code>IndexError</code> if the value is out of range; the <code>Range</code>
2217
+ * form will raise a <code>RangeError</code>, and the <code>URegexp</code> and
2218
+ * <code>UString</code> forms will silently ignore the assignment.
2219
+ *
2220
+ * string = "this is a string".u
2221
+ * string.slice!(2) #=> 105
2222
+ * string.slice!(3..6) #=> " is "
2223
+ * string.slice!(/s.*t/.U) #=> "sa st"
2224
+ * string.slice!("r".u) #=> "r"
2225
+ * string #=> "thing"
2226
+ */
2227
+
2228
+ VALUE
2229
+ icu_ustr_slice_bang(argc, argv, str)
2230
+ int argc;
2231
+ VALUE *argv;
2232
+ VALUE str;
2233
+ {
2234
+ VALUE result;
2235
+ VALUE buf[3];
2236
+ int i;
2237
+ icu_check_frozen(1, str);
2238
+ if (argc < 1 || 2 < argc) {
2239
+ rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)",
2240
+ argc);
2241
+ }
2242
+ for (i = 0; i < argc; i++) {
2243
+ buf[i] = argv[i];
2244
+ }
2245
+ buf[i] = icu_ustr_new(0, 0);
2246
+ result = icu_ustr_aref_m(argc, buf, str);
2247
+ if (!NIL_P(result)) {
2248
+ icu_ustr_aset_m(argc + 1, buf, str);
2249
+ }
2250
+ return result;
2251
+ }
2252
+
2253
+ VALUE
2254
+ ustr_gsub(argc, argv, str, bang, once)
2255
+ int argc;
2256
+ VALUE *argv;
2257
+ VALUE str;
2258
+ int bang;
2259
+ int once;
2260
+ {
2261
+ VALUE pat,
2262
+ repl;
2263
+ long beg,
2264
+ end,
2265
+ prev_end;
2266
+ int tainted = 0,
2267
+ iter = 0;
2268
+ VALUE buf, curr_repl, umatch, block_res;
2269
+ if (argc == 1 && rb_block_given_p()) {
2270
+ iter = 1;
2271
+ } else if (argc == 2) {
2272
+ repl = argv[1];
2273
+ Check_Class(repl, rb_cUString);
2274
+ if (OBJ_TAINTED(repl))
2275
+ tainted = 1;
2276
+ } else {
2277
+ rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)",
2278
+ argc);
2279
+ }
2280
+
2281
+ pat = get_pat(argv[0], 1);
2282
+ beg = icu_reg_search(pat, str, 0, 0);
2283
+
2284
+ if (beg < 0) {
2285
+ /* no match */
2286
+ if (bang)
2287
+ return Qnil;
2288
+ return icu_ustr_dup(str);
2289
+ }
2290
+ end = 0;
2291
+ // icu_check_frozen(1, str);
2292
+ ++(USTRING(str)->busy);
2293
+ buf = icu_ustr_new(0, 0);
2294
+ pat = icu_reg_clone(pat);
2295
+ if(rb_block_given_p()) iter = 1;
2296
+ do {
2297
+
2298
+ prev_end = end;
2299
+ icu_reg_range(pat, 0, &beg, &end);
2300
+ icu_ustr_concat(buf, icu_reg_get_prematch(pat, prev_end));
2301
+ if ( iter ) {
2302
+ UChar * ptr = ICU_PTR(str);
2303
+ long o_len = ICU_LEN(str);
2304
+ umatch = icu_umatch_new(pat);
2305
+ block_res = rb_yield(umatch);
2306
+ if (CLASS_OF(block_res) == rb_cUString)
2307
+ curr_repl = block_res;
2308
+ else if (CLASS_OF(block_res) == rb_cUMatch)
2309
+ curr_repl = icu_umatch_aref(block_res, INT2FIX(0));
2310
+ else
2311
+ curr_repl =
2312
+ icu_from_rstr(0, NULL, rb_obj_as_string(block_res));
2313
+ ustr_mod_check(str, ptr, o_len);
2314
+ } else {
2315
+ curr_repl = icu_reg_get_replacement(pat, repl, prev_end);
2316
+ }
2317
+ icu_ustr_concat(buf, curr_repl);
2318
+ }
2319
+ while (icu_reg_find_next(pat) && !once);
2320
+ icu_ustr_concat(buf, icu_reg_get_tail(pat, end));
2321
+ --(USTRING(str)->busy);
2322
+ if (bang) {
2323
+ icu_ustr_replace(str, buf);
2324
+ return str;
2325
+ } else {
2326
+ return buf;
2327
+ }
2328
+ }
2329
+
2330
+ /**
2331
+ * call-seq:
2332
+ * str.gsub!(pattern, replacement) => str or nil
2333
+ * str.gsub!(pattern) {|match| block } => str or nil
2334
+ *
2335
+ * Performs the substitutions of <code>UString#gsub</code> in place, returning
2336
+ * <i>str</i>, or <code>nil</code> if no substitutions were performed.
2337
+ */
2338
+
2339
+ VALUE
2340
+ icu_ustr_gsub_bang(argc, argv, str)
2341
+ int argc;
2342
+ VALUE *argv;
2343
+ VALUE str;
2344
+ {
2345
+ icu_check_frozen(1, str);
2346
+ return ustr_gsub(argc, argv, str, 1, 0);
2347
+ }
2348
+
2349
+
2350
+ /**
2351
+ * call-seq:
2352
+ * str.gsub(pattern, replacement) => new_str
2353
+ * str.gsub(pattern) {|match| block } => new_str
2354
+ *
2355
+ * Returns a copy of <i>str</i> with <em>all</em> occurrences of <i>pattern</i>
2356
+ * replaced with either <i>replacement</i> or the value of the block. The
2357
+ * <i>pattern</i> will typically be a <code>URegexp</code>; if it is a
2358
+ * <code>UString</code> then no regular expression metacharacters will be
2359
+ * interpreted (that is <code>/\d/</code> will match a digit, but
2360
+ * <code>'\d'</code> will match a backslash followed by a 'd').
2361
+ *
2362
+ * If a string is used as the replacement, the sequences <code>$1</code>, <code>$2</code>, and so on
2363
+ * may be used to interpolate successive groups in the match.
2364
+ *
2365
+ * In the block form, the current UMatch object is passed in as a parameter. The value
2366
+ * returned by the block will be substituted for the match on each call.
2367
+ *
2368
+ * "hello".gsub(/[aeiou]/.U, '*') #=> "h*ll*"
2369
+ * "hello".gsub(/([aeiou])/.U, '<$1>') #=> "h<e>ll<o>"
2370
+ */
2371
+
2372
+ VALUE
2373
+ icu_ustr_gsub(argc, argv, str)
2374
+ int argc;
2375
+ VALUE *argv;
2376
+ VALUE str;
2377
+ {
2378
+ return ustr_gsub(argc, argv, str, 0, 0);
2379
+ }
2380
+
2381
+
2382
+ /*-------------*/
2383
+
2384
+
2385
+ /**
2386
+ * call-seq:
2387
+ * str.to_f( locale = "",[format_pattern]) => aFloat
2388
+ *
2389
+ * Parses string as double value, with respect to +locale+ and format pattern,
2390
+ * if they are provided.
2391
+ *
2392
+ * "456".u.to_f # => 456.0
2393
+ * "123,001".u.to_f("ru") # => 123.001
2394
+ * "123,001".u.to_f("en") # => 123001.0
2395
+ * "Got 123,001".u.to_f("en", "Got ###,###".u) # => 123001
2396
+ */
2397
+
2398
+ VALUE
2399
+ icu_ustr_parse_double( int argc, VALUE * argv, VALUE str)
2400
+ {
2401
+ UParseError error;
2402
+ UErrorCode status = U_ZERO_ERROR;
2403
+ UNumberFormat * format = NULL;
2404
+ VALUE loc, pattern;
2405
+ char * locale;
2406
+ double value;
2407
+ int32_t pos, n;
2408
+
2409
+ n = rb_scan_args(argc, argv, "02", &loc, &pattern) ;
2410
+ if( n == 2) {
2411
+ Check_Class(pattern, rb_cUString);
2412
+ } else pattern = Qnil;
2413
+
2414
+ if (n > 0) {
2415
+ Check_Type(loc, T_STRING);
2416
+ locale = RSTRING_PTR(loc);
2417
+ } else locale = NULL;
2418
+
2419
+ if( pattern != Qnil ) {
2420
+ format = unum_open(UNUM_PATTERN_DECIMAL, ICU_PTR(pattern), ICU_LEN(pattern), locale,
2421
+ &error, &status);
2422
+ } else {
2423
+ format = unum_open(UNUM_DECIMAL, NULL, 0, locale,&error, &status);
2424
+ }
2425
+ if (U_FAILURE(status) ) rb_raise(rb_eArgError, "can't open format %s", u_errorName(status));
2426
+ pos = 0;
2427
+ value = unum_parseDouble(format, ICU_PTR(str), ICU_LEN(str), &pos, &status);
2428
+ unum_close(format);
2429
+ if (U_FAILURE(status) ) rb_raise(rb_eArgError, "can't parse %s at %d", u_errorName(status), pos);
2430
+ return rb_float_new(value);
2431
+ }
2432
+
2433
+ /**
2434
+ * call-seq:
2435
+ * UString::strcoll(str1, str2 ) => Fixnum
2436
+ * UString::strcoll(str1, str2 , locale) => Fixnum
2437
+ * UString::strcoll(str1, str2 , locale, strength) => Fixnum
2438
+ *
2439
+ * Performs locale-sensitive string comparison.
2440
+ * Special values for locales can be passed in - if +nil+ is passed for the locale,
2441
+ * the default locale collation rules will be used. If empty string ("") or "root" are
2442
+ * passed, UCA rules will be used.
2443
+ *
2444
+ * Strength must be a fixnum that set collation strength:
2445
+ * -1 is default, 0 - primary, 1 - secondary, 2 - ternary.
2446
+ * E.g., pass 0 to ignore case and accents, 1 - to ignore case only.
2447
+ **/
2448
+ VALUE
2449
+ icu_ustr_coll(argc, argv, self)
2450
+ int argc;
2451
+ VALUE *argv;
2452
+ VALUE self;
2453
+ {
2454
+ UErrorCode status = 0 ;
2455
+ UCollator * collator = 0;
2456
+ int result;
2457
+ VALUE ret = Qnil;
2458
+ VALUE str1, str2, loc, strength = Qnil;
2459
+ char * locale = NULL;
2460
+ int n ;
2461
+ n = rb_scan_args(argc, argv, "22", &str1, &str2, &loc, &strength);
2462
+ if ( n == 3) {
2463
+ if( loc != Qnil) {
2464
+ Check_Type(loc, T_STRING);
2465
+ locale = RSTRING_PTR(loc);
2466
+ }
2467
+ }
2468
+ Check_Class(str1, rb_cUString);
2469
+ Check_Class(str2, rb_cUString);
2470
+ collator = ucol_open(locale, &status);
2471
+ if( U_FAILURE(status) )
2472
+ {
2473
+ rb_raise(rb_eArgError, u_errorName(status));
2474
+ }
2475
+ if( n == 4 ){
2476
+ Check_Type(strength, T_FIXNUM);
2477
+ ucol_setStrength(collator, NUM2INT(strength));
2478
+ }
2479
+ result = ucol_strcoll(collator, ICU_PTR(str1), ICU_LEN(str1), ICU_PTR(str2), ICU_LEN(str2));
2480
+
2481
+ switch(result){
2482
+ case UCOL_EQUAL: ret = INT2FIX(0);break;
2483
+ case UCOL_GREATER: ret = INT2FIX(1);break;
2484
+ case UCOL_LESS: ret = INT2FIX(-1);break;
2485
+ }
2486
+ ucol_close(collator);
2487
+ return ret;
2488
+ }
2489
+
2490
+ /**
2491
+ * call-seq:
2492
+ * UString::list_coll => anArray
2493
+ *
2494
+ * Returns array of available collator locales, to be used in UString#strcoll
2495
+ * */
2496
+ VALUE icu_ustr_list_coll(str)
2497
+ VALUE str;
2498
+ {
2499
+ int32_t i, n =ucol_countAvailable();
2500
+ VALUE ret = rb_ary_new();
2501
+ for( i = 0; i<n; i++) {
2502
+ rb_ary_push(ret, rb_str_new2(ucol_getAvailable(i)));
2503
+ }
2504
+ return ret;
2505
+ }
2506
+
2507
+ /**
2508
+ * call-seq:
2509
+ * UString::list_locales => anArray
2510
+ *
2511
+ * Returns array of available locales.
2512
+ * */
2513
+ VALUE icu_ustr_list_locales(str)
2514
+ VALUE str;
2515
+ {
2516
+ int32_t i, n =uloc_countAvailable();
2517
+ VALUE ret = rb_ary_new();
2518
+ for( i = 0; i<n; i++) {
2519
+ rb_ary_push(ret, rb_str_new2(uloc_getAvailable(i)));
2520
+ }
2521
+ return ret;
2522
+ }
2523
+ /**
2524
+ * call-seq:
2525
+ * UString::list_translits => anArray
2526
+ *
2527
+ * Returns array of available translits.
2528
+ * */
2529
+ VALUE icu_ustr_list_translits(str)
2530
+ VALUE str;
2531
+ {
2532
+ UErrorCode status = U_ZERO_ERROR;
2533
+ UEnumeration * ids ;
2534
+ VALUE ret ;
2535
+ UChar * name;
2536
+ int32_t len;
2537
+ ids = utrans_openIDs (&status);
2538
+ ICU_RAISE(status);
2539
+ ret = rb_ary_new();
2540
+ while( (name = (UChar*)uenum_unext(ids, &len, &status))) {
2541
+ rb_ary_push(ret, icu_ustr_new(name, len));
2542
+ }
2543
+ uenum_close(ids);
2544
+ return ret;
2545
+
2546
+ }
2547
+ /**
2548
+ * call-seq:
2549
+ * str.search(pattern, options = {})
2550
+ *
2551
+ * Searches for match in string. Returns array of +Range+
2552
+ * corresponding to position where pattern is matched.
2553
+ *
2554
+ * Valid options are:
2555
+ * :locale -- locale, +String+, value e.g. "en", "ru_RU"
2556
+ * :ignore_case -- whether to ignore case, valid values are +true+ or +false+, default to +false+
2557
+ * :ignore_case_accents -- sets collator options to strength +0+ - primary difference, e.g. ignore case and accents,
2558
+ * overrides :ignore_case: option, default to +false+,
2559
+ * :loosely -- same as :ignore_case_accents
2560
+ * :limit -- Fixnum limit of match positions to return.
2561
+ * :whole_words -- whether to match whole words only
2562
+ * :canonical -- use canonical equivalence
2563
+ *
2564
+ *
2565
+ * a = "A quick brown fox jumped over the lazy fox dancing foxtrote".u
2566
+ * a.search("fox".u) # => [14..16, 39..41, 51..53]
2567
+ * a.search("FoX".u) # => []
2568
+ * a.search("FoX".u, :ignore_case => true) # => [14..16, 39..41, 51..53]
2569
+ * a.search("FoX".u, :ignore_case => true, :whole_words => true) # => [14..16, 39..41]
2570
+ * a.search("FoX".u, :ignore_case => true, :whole_words => true, :limit => 1) # => [14..16]
2571
+ *
2572
+ * b = "Iñtërnâtiônàlizætiøn îs cọmpłèx".u.upcase # => IÑTËRNÂTIÔNÀLIZÆTIØN ÎS CỌMPŁÈX
2573
+ * b.search("nâtiôn".u, :locale => "en") # => []
2574
+ * b.search("nation".u) # => []
2575
+ * b.search("nation".u, :locale => "en", :ignore_case_accents => true) # => [5..10]
2576
+ * b.search("nâtiôn".u, :locale => "en", :ignore_case => true) # => [5..10]
2577
+ * b.search("zaeti".u, :locale => "en" ) # => []
2578
+ * b.search("zaeti".u, :locale => "en", :ignore_case => true) # => []
2579
+ * b.search("zaeti".u, :locale => "en", :ignore_case_accents => true) # => [14..17]
2580
+ *
2581
+ * v = [?a, 0x0325, 0x0300].to_u # => ḁ̀
2582
+ * v.search([?a, 0x300].to_u, :canonical => true) # => [0..2]
2583
+ * v.search([?a, 0x300].to_u) # => []
2584
+ **/
2585
+
2586
+ VALUE icu_ustr_search(argc, argv, str)
2587
+ int argc;
2588
+ VALUE *argv;
2589
+ VALUE str;
2590
+
2591
+ {
2592
+ UErrorCode status = U_ZERO_ERROR;
2593
+ UStringSearch * search = 0 ;
2594
+ VALUE pat, locale , limit, options;
2595
+ int lim = -1, count = 0 ;
2596
+ int32_t start, len;
2597
+ VALUE ret = rb_ary_new();
2598
+ UCollator * collator = 0;
2599
+ UBreakIterator * brkit = 0;
2600
+ char * loc = 0;
2601
+ if ( rb_scan_args(argc, argv, "11", &pat, &options) == 2 ) {
2602
+ Check_Type(options, T_HASH);
2603
+ } else {
2604
+ options = Qnil;
2605
+ }
2606
+
2607
+ Check_Class(pat, rb_cUString);
2608
+ locale = options == Qnil ? Qnil : rb_hash_aref(options, ID2SYM(rb_intern("locale")));
2609
+
2610
+ if( locale != Qnil ) {
2611
+ Check_Type(locale, T_STRING);
2612
+ loc = RSTRING_PTR(locale);
2613
+ }
2614
+ limit = options == Qnil ? Qnil : rb_hash_aref(options, ID2SYM(rb_intern("limit")));
2615
+
2616
+ if(TYPE(limit) == T_FIXNUM) {
2617
+ lim = FIX2INT(limit);
2618
+ if(lim <= 0) {
2619
+ rb_raise(rb_eTypeError, "Limit must be positive or nil, got: %d", lim);
2620
+ }
2621
+ }
2622
+ else
2623
+ if (limit!=Qnil)
2624
+ rb_raise(rb_eArgError, "Limit must be Fixnum, got %s", rb_class2name(CLASS_OF(limit)));
2625
+
2626
+ collator = ucol_open(loc, &status);
2627
+ ucol_setStrength(collator, -1);
2628
+
2629
+ if( options != Qnil && Qtrue == rb_hash_aref( options, ID2SYM(rb_intern("whole_words"))) )
2630
+ brkit = ubrk_open(UBRK_WORD, loc, ICU_PTR(str), ICU_LEN(str), &status);
2631
+
2632
+ if( options != Qnil && Qtrue == rb_hash_aref( options, ID2SYM(rb_intern("ignore_case"))) )
2633
+ ucol_setStrength(collator, UCOL_SECONDARY);
2634
+
2635
+ if( options != Qnil &&
2636
+ ( Qtrue == rb_hash_aref( options, ID2SYM(rb_intern("ignore_case_accents")) )
2637
+ || Qtrue == rb_hash_aref( options, ID2SYM(rb_intern("loosely")) )
2638
+ )
2639
+ )
2640
+ ucol_setStrength(collator, UCOL_PRIMARY );
2641
+
2642
+
2643
+ search = usearch_openFromCollator(ICU_PTR(pat), ICU_LEN(pat),
2644
+ ICU_PTR(str), ICU_LEN(str),
2645
+ collator, brkit, &status);
2646
+
2647
+ if( options != Qnil && Qtrue == rb_hash_aref( options, ID2SYM(rb_intern("canonical"))) )
2648
+ usearch_setAttribute(search, USEARCH_CANONICAL_MATCH, USEARCH_ON, &status);
2649
+
2650
+ if( U_FAILURE(status) ) goto failure;
2651
+
2652
+ status = U_ZERO_ERROR;
2653
+ if( usearch_first(search, &status) == USEARCH_DONE) {
2654
+ usearch_close(search);
2655
+ ucol_close(collator);
2656
+ ubrk_close(brkit);
2657
+ return ret;
2658
+ }
2659
+
2660
+ do {
2661
+ if( U_FAILURE(status) ) goto failure;
2662
+
2663
+ start = usearch_getMatchedStart(search);
2664
+ len = usearch_getMatchedLength(search);
2665
+ rb_ary_push(ret, rb_range_new(LONG2NUM(start), LONG2NUM(start+len-1), 0));
2666
+
2667
+ status = U_ZERO_ERROR;
2668
+ count += 1;
2669
+ if (lim > 0 && count >= lim) break;
2670
+ } while (USEARCH_DONE != usearch_next(search, &status));
2671
+ usearch_close( search);
2672
+ ucol_close(collator);
2673
+ ubrk_close(brkit);
2674
+ return ret;
2675
+
2676
+ failure:
2677
+ usearch_close( search);
2678
+ ucol_close(collator);
2679
+ ubrk_close(brkit);
2680
+
2681
+ rb_raise(rb_eArgError, u_errorName(status));
2682
+ return Qnil;
2683
+ }
2684
+ /**
2685
+ * call-seq:
2686
+ * str.conv_unit_range(unit_range) => code_point_range
2687
+ *
2688
+ * Converts <b>code unit</b> range to <b>code point</b> range.
2689
+ * If your chars don't use multiple UTF16 codeunits, range will be the same.
2690
+ */
2691
+ VALUE icu_ustr_convert_unit_range(str, range)
2692
+ VALUE str, range;
2693
+ {
2694
+ long cu_start, cu_len, cur_pos, cp_len ;
2695
+ if( rb_range_beg_len(range, &cu_start, &cu_len, ICU_LEN(str), 0) != Qtrue)
2696
+ return Qnil;
2697
+
2698
+ cur_pos = u_countChar32( ICU_PTR(str), cu_start );
2699
+ if( cu_start+cu_len > ICU_LEN(str)) --cu_len;
2700
+ cp_len = u_countChar32( ICU_PTR(str) + cu_start , cu_len);
2701
+ return rb_range_new(LONG2NUM(cur_pos), LONG2NUM(cur_pos + cp_len-1), 0);
2702
+ }
2703
+ /**
2704
+ * call-seq:
2705
+ * str.conv_point_range(point_range) => code_unit_range
2706
+ *
2707
+ * Converts <b>code point</b> range to <b>code unit</b> range.
2708
+ * (inversion of #conv_unit_range)
2709
+ * If your chars don't use multiple UTF16 codeuints, range will be the same.
2710
+ */
2711
+ VALUE icu_ustr_convert_point_range(str, range)
2712
+ VALUE str, range;
2713
+ {
2714
+ long cp_start, cu_start, cu_end, cp_len, str_cp_len;
2715
+ str_cp_len = u_countChar32( ICU_PTR(str), ICU_LEN(str));
2716
+ if( Qtrue != rb_range_beg_len(range, &cp_start, &cp_len, str_cp_len, 0) ) return Qnil;
2717
+
2718
+ cu_start = 0;
2719
+ U16_FWD_N(ICU_PTR(str), cu_start, ICU_LEN(str), cp_start); /* care sur */
2720
+ cu_end = cu_start;
2721
+ U16_FWD_N(ICU_PTR(str), cu_end, ICU_LEN(str), cp_len); /* care sur */
2722
+
2723
+ return rb_range_new(LONG2NUM(cu_start), LONG2NUM(cu_end-1), 0);
2724
+ }
2725
+ /**
2726
+ * call-seq:
2727
+ * str.unit_count
2728
+ *
2729
+ * returns number of code units in string.
2730
+ *
2731
+ */
2732
+ VALUE icu_ustr_unit_count(VALUE str){
2733
+ return LONG2NUM(ICU_LEN(str));
2734
+ }
2735
+ /**
2736
+ * call-seq:
2737
+ * str.point_count
2738
+ *
2739
+ * returns number of code points in string.
2740
+ *
2741
+ */
2742
+ VALUE icu_ustr_point_count(VALUE str){
2743
+ return LONG2NUM(u_countChar32(ICU_PTR(str), ICU_LEN(str)));
2744
+ }
2745
+
2746
+ UChar icu_uchar_at(int32_t offset, void * context)
2747
+ {
2748
+ return ((UChar*)context)[offset];
2749
+ }
2750
+ /**
2751
+ * call-seq:
2752
+ * str.unescape => new_str
2753
+ *
2754
+ * Unescape a string of characters.
2755
+ *
2756
+ * The following escape sequences are recognized:
2757
+ * \uhhhh 4 hex digits; h in [0-9A-Fa-f]
2758
+ * \Uhhhhhhhh 8 hex digits
2759
+ * \xhh 1-2 hex digits \x{h...} 1-8 hex digits
2760
+ * \ooo 1-3 octal digits; o in [0-7]
2761
+ * \cX control-X; X is masked with 0x1F
2762
+ *
2763
+ * as well as the standard ANSI C escapes:
2764
+ * \a => U+0007, \b => U+0008, \t => U+0009, \n => U+000A, \v => U+000B, \f => U+000C, \r => U+000D, \e => U+001B, \" => U+0022, \' => U+0027, \? => U+003F, \\ => U+005C
2765
+ *
2766
+ * If escape sequence is invalid, it is ignored.
2767
+ *
2768
+ * "\\u044D\\u043A\\u0440\\u0430\\u043D\\u0438\\u0440\\u043E\\u0432\\u0430\\u043D\\u0438\\u0435".u.unescape => "экранирование"
2769
+ *
2770
+ **/
2771
+
2772
+ VALUE icu_ustr_unescape(str)
2773
+ VALUE str;
2774
+ {
2775
+ UChar32 c32;
2776
+ int32_t offset, leng, i, segment_start;
2777
+ UChar * ptr;
2778
+ UChar buf[3];
2779
+ VALUE ret;
2780
+ offset = 0;
2781
+ segment_start = 0;
2782
+ leng = ICU_LEN(str);
2783
+ ptr = ICU_PTR(str);
2784
+ ret = icu_ustr_new(0, 0);
2785
+ while(offset < leng) {
2786
+ if( ptr[offset] == '\\' ) {
2787
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, ptr+segment_start, offset-segment_start);
2788
+ ++offset;
2789
+ c32 = u_unescapeAt(icu_uchar_at, &offset, leng, ICU_PTR(str));
2790
+ // append this char
2791
+ if( 0xFFFFFFFF == c32) continue;
2792
+ i = 0;
2793
+ U16_APPEND_UNSAFE(buf, i, c32);
2794
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, buf, U16_LENGTH(c32));
2795
+ segment_start = offset;
2796
+ } else {
2797
+ ++offset;
2798
+ }
2799
+ }
2800
+ if( segment_start < offset)
2801
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, ptr+segment_start, offset-segment_start);
2802
+
2803
+ return ret;
2804
+ }
2805
+
2806
+
2807
+
2808
+ /* transliteration */
2809
+ extern VALUE icu_transliterate(UChar * str, int32_t str_len, UChar * id, int32_t id_len, UChar * rules, int32_t rule_len);
2810
+ /**
2811
+ * call-seq:
2812
+ * str.translit(id, [rules])
2813
+ *
2814
+ * Performs {transliteration}[http://icu.sourceforge.net/userguide/Transformations.html],
2815
+ * of this string, using given transform +id+ and +rules+
2816
+ *
2817
+ * "yukihiro matsumoto".u.translit("Latin-Hiragana".u) # => ゆきひろ まつもと
2818
+ * "hello".u.translit("null".u, ":: upper();".u) # => HELLO
2819
+ **/
2820
+ VALUE icu_ustr_translit(argc, argv, str)
2821
+ int argc;
2822
+ VALUE * argv ;
2823
+ VALUE str;
2824
+ {
2825
+ VALUE id, rules ;
2826
+ if(rb_scan_args(argc, argv, "11", &id, &rules) == 2) {
2827
+ Check_Class(rules, rb_cUString);
2828
+ } else rules = Qnil;
2829
+
2830
+ Check_Class(str, rb_cUString);
2831
+ Check_Class(id, rb_cUString);
2832
+ if( rules == Qnil) {
2833
+ return icu_transliterate(ICU_PTR(str), ICU_LEN(str), ICU_PTR(id), ICU_LEN(id), NULL, 0);
2834
+ } else {
2835
+ return icu_transliterate(ICU_PTR(str), ICU_LEN(str), ICU_PTR(id), ICU_LEN(id),
2836
+ ICU_PTR(rules), ICU_LEN(rules));
2837
+ }
2838
+ }
2839
+ void
2840
+ initialize_ustring(void)
2841
+ {
2842
+ UErrorCode status = U_ZERO_ERROR;
2843
+ u_init(&status);
2844
+ if( U_FAILURE(status) ){
2845
+ rb_raise(rb_eRuntimeError, "Can't initialize : %s", u_errorName(status));
2846
+ }
2847
+ s_UCA_collator = ucol_open("", &status);
2848
+ if( U_FAILURE(status) ){
2849
+ rb_raise(rb_eRuntimeError, "Can't initialize : %s", u_errorName(status));
2850
+ }
2851
+ s_case_UCA_collator = ucol_open("", &status);
2852
+ if( U_FAILURE(status) ){
2853
+ rb_raise(rb_eRuntimeError, "Can't initialize : %s", u_errorName(status));
2854
+ }
2855
+ ucol_setStrength(s_case_UCA_collator, UCOL_SECONDARY);
2856
+
2857
+ /*
2858
+
2859
+ Document-class: UString
2860
+
2861
+ UString is a string class that stores Unicode characters directly and provides
2862
+ similar functionality as the Ruby String class.
2863
+
2864
+ An UString string consists of 16-bit Unicode code units. A Unicode character
2865
+ may be stored with either one code unit which is the most common case or with a matched
2866
+ pair of special code units ("surrogates").
2867
+
2868
+ For single-character handling, a Unicode character code point is a value in the
2869
+ range 0..0x10ffff.
2870
+
2871
+ Indexes and offsets into and lengths of strings always count code units, not code points.
2872
+ This is the same as with multi-byte char* strings in traditional string handling.
2873
+ Operations on partial strings typically do not test for code point boundaries.
2874
+
2875
+ In order to use the collation, text boundary analysis, formatting and other ICU APIs,
2876
+ Unicode strings must be used. In order to get Unicode strings from your native codepage,
2877
+ you can use the conversion API.
2878
+
2879
+ UString class is also point for access to several ICU services, instead of
2880
+ mirroring ICU class hierarchy.
2881
+
2882
+ ==== Methods by category:
2883
+
2884
+ - concat and modify: + , * , << , #concat , #replace
2885
+
2886
+ - element reference, insert, replace: [] , #slice , []= , #slice! , #insert , #char_span
2887
+
2888
+ - comparisons: <=> , == , #casecmp , #strcoll
2889
+
2890
+ - size and positions: #length , #point_count , #clear , #empty? , #conv_unit_range , #conv_point_range
2891
+
2892
+ - index/search methods: #index , #rindex , #include? , #search
2893
+
2894
+ - regexps, matching and replacing: =~ , #match , #scan , #split , #sub , #sub! , #gsub , #gsub!
2895
+
2896
+ - conversion String/UString: #to_s, Kernel#u, String#to_u
2897
+
2898
+ - iterators: #each_line_break , #each_word , #each_char , #each_sentence
2899
+
2900
+ - split to chars/codepoints: #chars , #codepoints , Array#to_u
2901
+
2902
+ - character case: #upcase , #upcase! , #downcase , #downcase!
2903
+
2904
+ - stripping spaces: #strip , #lstrip , #rstrip , #strip! , #lstrip! , #rstrip!
2905
+
2906
+ - formatting and parsing: #format , #parse_date , #to_f
2907
+
2908
+ - UNICODE normalization: #norm_C , #norm_D , #norm_KC , #norm_KD , #norm_FCD
2909
+
2910
+ - utilities: #unescape , #hash , #inspect , #inspect_names , #translit
2911
+
2912
+ - ICU avalable info: #list_coll , #list_locales , #list_translits
2913
+ */
2914
+ rb_cUString = rb_define_class("UString", rb_cObject);
2915
+ rb_include_module(rb_cUString, rb_mComparable);
2916
+
2917
+ /* initializations */
2918
+ rb_define_alloc_func(rb_cUString, icu_ustr_alloc);
2919
+ rb_define_method(rb_cUString, "initialize", icu_ustr_init, -1);
2920
+ rb_define_method(rb_cUString, "initialize_copy", icu_ustr_replace, 1);
2921
+ rb_define_method(rb_cUString, "replace", icu_ustr_replace, 1);
2922
+
2923
+ /* comparisons */
2924
+ rb_define_method(rb_cUString, "<=>", icu_ustr_cmp_m, 1);
2925
+ rb_define_method(rb_cUString, "==", icu_ustr_equal, 1);
2926
+ rb_define_method(rb_cUString, "eql?", icu_ustr_equal, 1);
2927
+ rb_define_method(rb_cUString, "casecmp", icu_ustr_casecmp, 1);
2928
+ rb_define_singleton_method(rb_cUString, "strcoll", icu_ustr_coll, -1);
2929
+
2930
+ /* ICU avalable info */
2931
+ rb_define_singleton_method(rb_cUString, "list_coll", icu_ustr_list_coll, 0);
2932
+ rb_define_singleton_method(rb_cUString, "list_locales", icu_ustr_list_locales, 0);
2933
+ rb_define_singleton_method(rb_cUString, "list_translits", icu_ustr_list_translits, 0);
2934
+
2935
+ /* hash code */
2936
+ rb_define_method(rb_cUString, "hash", icu_ustr_hash_m, 0);
2937
+
2938
+ /* inspect */
2939
+ rb_define_method(rb_cUString, "inspect", icu_ustr_inspect, 0);
2940
+ rb_define_method(rb_cUString, "inspect_names", icu_ustr_inspect_names, 0);
2941
+
2942
+ /* size */
2943
+ rb_define_method(rb_cUString, "length", icu_ustr_length, 0);
2944
+ rb_define_alias (rb_cUString, "size", "length");
2945
+ rb_define_method(rb_cUString, "unit_count", icu_ustr_unit_count, 0);
2946
+ rb_define_method(rb_cUString, "point_count", icu_ustr_point_count, 0);
2947
+ rb_define_method(rb_cUString, "clear", icu_ustr_clear, 0);
2948
+ rb_define_method(rb_cUString, "empty?", icu_ustr_empty, 0);
2949
+
2950
+ /* UNICODE normalization */
2951
+ rb_define_method(rb_cUString, "norm_C", icu_ustr_normalize_C, 0);
2952
+ rb_define_method(rb_cUString, "norm_D", icu_ustr_normalize_D, 0);
2953
+ rb_define_method(rb_cUString, "norm_KC", icu_ustr_normalize_KC, 0);
2954
+ rb_define_method(rb_cUString, "norm_KD", icu_ustr_normalize_KD, 0);
2955
+ rb_define_method(rb_cUString, "norm_FCD", icu_ustr_normalize_FCD, 0);
2956
+
2957
+ /* iterators */
2958
+ rb_define_method(rb_cUString, "each_line_break", icu_ustr_each_line, -1);
2959
+ rb_define_method(rb_cUString, "each_word", icu_ustr_each_word, -1);
2960
+ rb_define_method(rb_cUString, "each_char", icu_ustr_each_char, -1);
2961
+ rb_define_method(rb_cUString, "each_sentence", icu_ustr_each_sentence, -1);
2962
+ rb_define_alias(rb_cUString, "each", "each_line_break");
2963
+
2964
+ /* split to chars/codepoints */
2965
+ rb_define_method(rb_cUString, "chars", icu_ustr_chars_m, -1);
2966
+ rb_define_method(rb_cUString, "char_span", icu_ustr_char_span, -1);
2967
+ rb_define_method(rb_cUString, "codepoints", icu_ustr_points, 0);
2968
+
2969
+ /* concat operations */
2970
+ rb_define_method(rb_cUString, "+", icu_ustr_plus, 1);
2971
+ rb_define_method(rb_cUString, "*", icu_ustr_times, 1);
2972
+ rb_define_method(rb_cUString, "concat", icu_ustr_concat, 1);
2973
+ rb_define_alias( rb_cUString, "<<", "concat");
2974
+
2975
+ /* character case */
2976
+ rb_define_method(rb_cUString, "upcase", icu_ustr_upcase, -1);
2977
+ rb_define_method(rb_cUString, "upcase!", icu_ustr_upcase_bang, -1);
2978
+ rb_define_method(rb_cUString, "downcase", icu_ustr_downcase, -1);
2979
+ rb_define_method(rb_cUString, "downcase!", icu_ustr_downcase_bang, -1);
2980
+ rb_define_method(rb_cUString, "foldcase", icu_ustr_foldcase, 0);
2981
+
2982
+ /* stripping spaces */
2983
+ rb_define_method(rb_cUString, "strip", icu_ustr_strip, 0);
2984
+ rb_define_method(rb_cUString, "lstrip", icu_ustr_lstrip, 0);
2985
+ rb_define_method(rb_cUString, "rstrip", icu_ustr_rstrip, 0);
2986
+
2987
+ rb_define_method(rb_cUString, "strip!", icu_ustr_strip_bang, 0);
2988
+ rb_define_method(rb_cUString, "lstrip!", icu_ustr_lstrip_bang, 0);
2989
+ rb_define_method(rb_cUString, "rstrip!", icu_ustr_rstrip_bang, 0);
2990
+
2991
+ /* index/search methods */
2992
+ rb_define_method(rb_cUString, "index", icu_ustr_index_m, -1);
2993
+ rb_define_method(rb_cUString, "rindex", icu_ustr_rindex_m, -1);
2994
+ rb_define_method(rb_cUString, "include?", icu_ustr_include, 1);
2995
+ rb_define_method(rb_cUString, "search", icu_ustr_search, -1);
2996
+
2997
+ /* element reference */
2998
+ rb_define_method(rb_cUString, "[]", icu_ustr_aref_m, -1);
2999
+ rb_define_alias(rb_cUString, "slice", "[]");
3000
+
3001
+ /* codeunit/codepoint conversion */
3002
+ rb_define_method(rb_cUString, "conv_unit_range", icu_ustr_convert_unit_range, 1);
3003
+ rb_define_method(rb_cUString, "conv_point_range", icu_ustr_convert_point_range, 1);
3004
+
3005
+ /* insert/replace */
3006
+ rb_define_method(rb_cUString, "[]=", icu_ustr_aset_m, -1);
3007
+ rb_define_method(rb_cUString, "slice!", icu_ustr_slice_bang, -1);
3008
+ rb_define_method(rb_cUString, "insert", icu_ustr_insert, 2);
3009
+
3010
+ /* conversion to String from UString */
3011
+ rb_define_method(rb_cUString, "to_u", icu_ustr_to_ustr, -1);
3012
+ rb_define_method(rb_cUString, "to_s", icu_ustr_to_rstr, -1);
3013
+ rb_define_alias(rb_cUString, "to_str", "to_s");
3014
+
3015
+ /* formatting messages */
3016
+ rb_define_method(rb_cUString, "format", icu_ustr_format, -2);
3017
+ rb_define_alias( rb_cUString, "fmt", "format");
3018
+
3019
+ /* parsing */
3020
+ rb_define_method(rb_cUString, "to_f", icu_ustr_parse_double, -1);
3021
+
3022
+ /* transliteration */
3023
+ rb_define_method(rb_cUString, "translit", icu_ustr_translit, -1);
3024
+
3025
+ /* unescaping */
3026
+ rb_define_method(rb_cUString, "unescape", icu_ustr_unescape, 0);
3027
+
3028
+ /* regexp matching and replacing */
3029
+ rb_define_method(rb_cUString, "=~", icu_ustr_match, 1);
3030
+ rb_define_method(rb_cUString, "match", icu_ustr_match_m, 1);
3031
+ rb_define_method(rb_cUString, "scan", icu_ustr_scan, 1);
3032
+ rb_define_method(rb_cUString, "split", icu_ustr_split_m, -1);
3033
+ rb_define_method(rb_cUString, "sub", icu_ustr_sub, -1);
3034
+ rb_define_method(rb_cUString, "sub!", icu_ustr_sub_bang, -1);
3035
+ rb_define_method(rb_cUString, "gsub", icu_ustr_gsub, -1);
3036
+ rb_define_method(rb_cUString, "gsub!", icu_ustr_gsub_bang, -1);
3037
+
3038
+ }
3039
+