icu4r 0.1.3.2006.01.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ustring.c ADDED
@@ -0,0 +1,3042 @@
1
+ /**
2
+ * ustring.c - ICU based Unicode string support.
3
+ *
4
+ * $Id: ustring.c,v 1.20 2006/01/23 14:26:45 meadow Exp $
5
+ *
6
+ * Copyright (c) 2006 Nikolai Lugovoi
7
+ *
8
+ * This code is based on original ruby String class source (string.c):
9
+ *
10
+ * * string.c -
11
+ * *
12
+ * * Copyright (C) 1993-2003 Yukihiro Matsumoto
13
+ * * Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
14
+ * * Copyright (C) 2000 Information-technology Promotion Agency, Japan
15
+ * *
16
+ **/
17
+
18
+ #include "icu_common.h"
19
+ VALUE icu_ustr_replace(VALUE str, VALUE str2);
20
+ VALUE ustr_gsub(int argc, VALUE * argv, VALUE str, int bang, int once);
21
+ extern VALUE icu_from_rstr(int argc, VALUE * argv, VALUE str);
22
+
23
+ VALUE rb_cURegexp;
24
+ VALUE rb_cUString;
25
+ VALUE rb_cUMatch;
26
+ VALUE rb_cUResourceBundle;
27
+ VALUE rb_cULocale;
28
+ VALUE rb_cUCalendar;
29
+
30
+ #include "uregex.h"
31
+
32
+
33
+ /* to be used in <=>, casecmp */
34
+ static UCollator * s_UCA_collator, * s_case_UCA_collator;
35
+
36
+ static void
37
+ free_ustr(str)
38
+ ICUString *str;
39
+ {
40
+ if (str->ptr)
41
+ free(str->ptr);
42
+ str->ptr = 0;
43
+ free(str);
44
+ }
45
+ inline void icu_check_frozen(VALUE str)
46
+ {
47
+ rb_check_frozen(str);
48
+ if(USTRING(str)->busy) rb_raise(rb_eRuntimeError, "String is busy. Can't modify");
49
+ }
50
+ #define START_BUF_LEN 16
51
+ /**
52
+ * Allocate ICUString struct with given +capa+ capacity,
53
+ * if mode == 1 and UChar != 0 - copy len UChars from src,
54
+ * else set pointer to src.
55
+ */
56
+ #define ICU_COPY 1
57
+ #define ICU_SET 0
58
+ VALUE icu_ustr_alloc_and_wrap(UChar * src, long len, long capa, int mode)
59
+ {
60
+ ICUString *n_str = ALLOC_N(ICUString, 1);
61
+ size_t alloc_capa;
62
+ if( mode == ICU_COPY ) {
63
+ alloc_capa = START_BUF_LEN > capa ? START_BUF_LEN : capa;
64
+ if(alloc_capa<=len) alloc_capa = len + 1;
65
+ n_str->ptr = ALLOC_N(UChar, alloc_capa);
66
+ n_str->capa = alloc_capa;
67
+ n_str->len = len;
68
+ if( src ) {
69
+ u_memcpy(n_str->ptr, src, len);
70
+ n_str->ptr[len] = 0;
71
+ }
72
+ } else {
73
+ n_str->ptr = src;
74
+ n_str->len = len;
75
+ n_str->capa = capa;
76
+ }
77
+ if(n_str->capa <= n_str->len) rb_raise(rb_eRuntimeError, "Capacity is not large then len, sentinel can't be set!");
78
+ n_str->busy = 0;
79
+ n_str->ptr[n_str->len] = 0;
80
+ return Data_Wrap_Struct(rb_cUString, 0, free_ustr, n_str);
81
+ }
82
+ VALUE
83
+ icu_ustr_alloc(klass)
84
+ VALUE klass;
85
+ {
86
+ return icu_ustr_alloc_and_wrap(NULL, 0, 0, ICU_COPY);
87
+ }
88
+
89
+ void ustr_capa_resize(ICUString * str, long new_capa)
90
+ {
91
+ if (new_capa != str->capa) {
92
+ if (str->capa < new_capa || (str->capa - new_capa > 1024)) {
93
+ if(new_capa < START_BUF_LEN) new_capa = START_BUF_LEN;
94
+ REALLOC_N(str->ptr, UChar, new_capa);
95
+ str->capa = new_capa;
96
+ }
97
+ }
98
+ }
99
+ /* delete +del_len+ units from string and insert replacement */
100
+ void ustr_splice_units(ICUString * str, long start, long del_len, const UChar * replacement, long repl_len)
101
+ {
102
+ long new_len;
103
+ UChar * temp = 0 ;
104
+ if( str->busy ) {
105
+ rb_warn("Attempt to modify busy string. Ignored");
106
+ return;
107
+ }
108
+ if( repl_len < 0) return;
109
+ if( del_len == 0 && repl_len == 0) return;
110
+ new_len = str->len - del_len + repl_len;
111
+ if (replacement == str->ptr ) {
112
+ temp = ALLOC_N(UChar, repl_len);
113
+ u_memcpy(temp, replacement, repl_len);
114
+ replacement = temp;
115
+ }
116
+ if ( repl_len >= del_len) ustr_capa_resize(str, new_len+1);
117
+ /* move tail */
118
+ if(str->len - (start+del_len) > 0) {
119
+ u_memmove(str->ptr + start+repl_len, str->ptr + start+del_len, str->len-(start+del_len) );
120
+ }
121
+ /* copy string */
122
+ if( repl_len > 0) u_memcpy(str->ptr+start, replacement, repl_len);
123
+ if ( repl_len < del_len) ustr_capa_resize(str, new_len+1);
124
+ str->len = new_len;
125
+ str->ptr[new_len] = 0;
126
+ if(temp) {
127
+ free(temp);
128
+ }
129
+ }
130
+ static inline void
131
+ ustr_mod_check(VALUE s, UChar *p, long len)
132
+ {
133
+ if (ICU_PTR(s) != p || ICU_LEN(s) != len){
134
+ rb_raise(rb_eRuntimeError, "string modified");
135
+ }
136
+ }
137
+ VALUE
138
+ ustr_new(klass, ptr, len)
139
+ VALUE klass;
140
+ UChar *ptr;
141
+ long len;
142
+ {
143
+ if (len < 0) {
144
+ rb_raise(rb_eArgError, "negative string size (or size too big)");
145
+ }
146
+ return icu_ustr_alloc_and_wrap(ptr, len, len+1, ICU_COPY);
147
+ }
148
+
149
+ VALUE
150
+ icu_ustr_new(ptr, len)
151
+ const UChar *ptr;
152
+ long len;
153
+ {
154
+ return ustr_new(rb_cUString, ptr, len);
155
+ }
156
+ VALUE
157
+ icu_ustr_new_set(ptr, len, capa)
158
+ UChar *ptr;
159
+ long len;
160
+ long capa;
161
+ {
162
+ return icu_ustr_alloc_and_wrap(ptr, len, capa, ICU_SET);
163
+ }
164
+ VALUE
165
+ icu_ustr_new2(ptr)
166
+ const UChar *ptr;
167
+ {
168
+ if (!ptr) {
169
+ rb_raise(rb_eArgError, "NULL pointer given");
170
+ }
171
+ return icu_ustr_new(ptr, u_strlen(ptr));
172
+ }
173
+
174
+ inline VALUE
175
+ icu_ustr_new_capa(UChar * ptr, long len, long capa)
176
+ {
177
+ return icu_ustr_alloc_and_wrap(ptr, len, capa, ICU_COPY);
178
+ }
179
+
180
+ /* ------------ */
181
+
182
+ /**
183
+ * call-seq:
184
+ * UString.new(str="".u) => new_str
185
+ *
186
+ * Returns a new string object containing a copy of <i>str</i>.
187
+ */
188
+
189
+ VALUE
190
+ icu_ustr_init(argc, argv, str)
191
+ int argc;
192
+ VALUE *argv;
193
+ VALUE str;
194
+ {
195
+ VALUE orig;
196
+
197
+ if (rb_scan_args(argc, argv, "01", &orig) == 1)
198
+ {
199
+ icu_ustr_replace(str, orig);
200
+ }
201
+ return str;
202
+ }
203
+
204
+ /**
205
+ * call-seq:
206
+ * str.length => integer
207
+ *
208
+ * Returns the length of <i>str</i>.
209
+ */
210
+ VALUE
211
+ icu_ustr_length(str)
212
+ VALUE str;
213
+ {
214
+ return LONG2NUM(ICU_LEN(str));
215
+ }
216
+
217
+ /**
218
+ * call-seq:
219
+ * str.empty? => true or false
220
+ *
221
+ * Returns <code>true</code> if <i>str</i> has a length of zero.
222
+ *
223
+ * "hello".u.empty? #=> false
224
+ * "".u.empty? #=> true
225
+ */
226
+
227
+ VALUE
228
+ icu_ustr_empty(str)
229
+ VALUE str;
230
+ {
231
+ return 0 == ICU_LEN(str) ? Qtrue : Qfalse;
232
+ }
233
+
234
+ VALUE
235
+ icu_ustr_resize(str, len)
236
+ VALUE str;
237
+ long len;
238
+ {
239
+ if (len < 0) {
240
+ rb_raise(rb_eArgError, "negative string size (or size too big)");
241
+ }
242
+ ustr_capa_resize(USTRING(str), len);
243
+ ICU_LEN(str) = len;
244
+ ICU_PTR(str)[len] = 0; /* sentinel */
245
+ return str;
246
+ }
247
+
248
+
249
+ /**
250
+ * call-seq:
251
+ * str.replace(other_str) => str
252
+ *
253
+ * Replaces the contents and taintedness of <i>str</i> with the corresponding
254
+ * values in <i>other_str</i>.
255
+ *
256
+ * s = "hello".u #=> "hello"
257
+ * s.replace "world".u #=> "world"
258
+ */
259
+ VALUE
260
+ icu_ustr_replace(str, str2)
261
+ VALUE str,
262
+ str2;
263
+ {
264
+ if (str == str2)
265
+ return str;
266
+ icu_check_frozen(str);
267
+ Check_Class(str2, rb_cUString);
268
+ ustr_splice_units(USTRING(str), 0, ICU_LEN(str), ICU_PTR(str2), ICU_LEN(str2));
269
+ OBJ_INFECT(str, str2);
270
+ return str;
271
+ }
272
+
273
+ /**
274
+ * call-seq:
275
+ * string.clear -> string
276
+ *
277
+ * Makes string empty.
278
+ *
279
+ * a = "abcde".u
280
+ * a.clear #=> ""
281
+ */
282
+
283
+ VALUE
284
+ icu_ustr_clear(str)
285
+ VALUE str;
286
+ {
287
+ icu_check_frozen(str);
288
+ icu_ustr_resize(str, 0);
289
+ return str;
290
+ }
291
+
292
+ int icu_collator_cmp (UCollator * collator, VALUE str1, VALUE str2)
293
+ {
294
+ int ret = 0, result ;
295
+ result = ucol_strcoll(collator, ICU_PTR(str1), ICU_LEN(str1), ICU_PTR(str2), ICU_LEN(str2));
296
+ switch(result){
297
+ case UCOL_EQUAL: ret = 0;break;
298
+ case UCOL_GREATER: ret = 1;break;
299
+ case UCOL_LESS: ret = -1;break;
300
+ }
301
+ return ret;
302
+ }
303
+
304
+ int
305
+ icu_ustr_cmp(str1, str2)
306
+ VALUE str1,
307
+ str2;
308
+ {
309
+ return icu_collator_cmp(s_UCA_collator, str1, str2);
310
+ }
311
+
312
+ /**
313
+ * call-seq:
314
+ * str == obj => true or false
315
+ *
316
+ * Equality---If <i>obj</i> is not a <code>UString</code>, returns
317
+ * <code>false</code>. Otherwise, returns <code>true</code> if
318
+ * strings are of the same length and content
319
+ *
320
+ */
321
+
322
+ VALUE
323
+ icu_ustr_equal(str1, str2)
324
+ VALUE str1,
325
+ str2;
326
+ {
327
+ if (str1 == str2)
328
+ return Qtrue;
329
+ if (CLASS_OF(str2) != rb_cUString) {
330
+ return Qfalse;
331
+ }
332
+ if (ICU_LEN(str1) == ICU_LEN(str2) &&
333
+ u_strncmp(ICU_PTR(str1), ICU_PTR(str2), ICU_LEN(str1) ) == 0) {
334
+ return Qtrue;
335
+ }
336
+ return Qfalse;
337
+ }
338
+
339
+ /**
340
+ * call-seq:
341
+ * str <=> other_str => -1, 0, +1
342
+ *
343
+ * Comparison---Returns -1 if <i>str</i> is less than, 0 if
343
+ * <i>str</i> is equal to, and +1 if <i>str</i> is greater than
344
+ * <i>other_str</i>.
346
+ *
347
+ * <code><=></code> is the basis for the methods <code><</code>,
348
+ * <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
349
+ * included from module <code>Comparable</code>. The method
350
+ * <code>String#==</code> does not use <code>Comparable#==</code>.
351
+ *
352
+ * This method uses UCA rules, see also #strcoll for locale-specific string collation.
353
+ *
354
+ * "abcdef".u <=> "abcde".u #=> 1
355
+ * "abcdef".u <=> "abcdef".u #=> 0
356
+ * "abcdef".u <=> "abcdefg".u #=> -1
357
+ * "abcdef".u <=> "ABCDEF".u #=> -1
358
+ */
359
+
360
+ VALUE
361
+ icu_ustr_cmp_m(str1, str2)
362
+ VALUE str1,
363
+ str2;
364
+ {
365
+ long result;
366
+
367
+ if (CLASS_OF(str2) != rb_cUString) {
368
+ return Qnil;
369
+ } else {
370
+ result = icu_ustr_cmp(str1, str2);
371
+ }
372
+ return LONG2NUM(result);
373
+ }
374
+
375
+ /**
376
+ * call-seq:
377
+ * str.casecmp(other_str) => -1, 0, +1
378
+ *
379
+ * Case-insensitive version of <code>UString#<=></code> .
380
+ * This method uses UCA collator with secondary strength, see #strcoll
381
+ *
382
+ *
383
+ * "abcdef".u.casecmp("abcde".u) #=> 1
384
+ * "aBcDeF".u.casecmp("abcdef".u) #=> 0
385
+ * "abcdef".u.casecmp("abcdefg".u) #=> -1
386
+ * "abcdef".u.casecmp("ABCDEF".u) #=> 0
387
+ */
388
+
389
+ VALUE
390
+ icu_ustr_casecmp(str1, str2)
391
+ VALUE str1,
392
+ str2;
393
+ {
394
+ Check_Class(str2, rb_cUString);
395
+ return INT2FIX(icu_collator_cmp(s_case_UCA_collator, str1, str2));
396
+ }
397
+
398
+ /**
399
+ * call-seq:
400
+ * str + other_str => new_str
401
+ *
402
+ * Concatenation---Returns a new <code>UString</code> containing
403
+ * <i>other_str</i> concatenated to <i>str</i>.
404
+ *
405
+ * "Hello from ".u + "main".u #=> "Hello from main"
406
+ */
407
+
408
+ VALUE
409
+ icu_ustr_plus(str1, str2)
410
+ VALUE str1,
411
+ str2;
412
+ {
413
+ VALUE str3;
414
+ Check_Class(str2, rb_cUString);
415
+
416
+ str3 = icu_ustr_new_capa(ICU_PTR(str1), ICU_LEN(str1), ICU_LEN(str1) + ICU_LEN(str2));
417
+ ustr_splice_units(USTRING(str3), ICU_LEN(str3), 0, ICU_PTR(str2), ICU_LEN(str2));
418
+ if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
419
+ OBJ_TAINT(str3);
420
+ return str3;
421
+ }
422
+
423
+ /**
424
+ * call-seq:
425
+ * str * integer => new_str
426
+ *
427
+ * Copy---Returns a new <code>UString</code> containing <i>integer</i> copies of
428
+ * the receiver.
429
+ *
430
+ * "Ho! ".u * 3 #=> "Ho! Ho! Ho! ".u
431
+ */
432
+
433
+ VALUE
434
+ icu_ustr_times(str, times)
435
+ VALUE str,
436
+ times;
437
+ {
438
+ VALUE str2;
439
+ long i,
440
+ len;
441
+ Check_Type(times, T_FIXNUM);
442
+ len = NUM2LONG(times);
443
+ if (len < 0) {
444
+ rb_raise(rb_eArgError, "negative argument");
445
+ }
446
+ if (len && LONG_MAX / len < ICU_LEN(str)) {
447
+ rb_raise(rb_eArgError, "argument too big");
448
+ }
449
+
450
+ str2 = icu_ustr_new_capa(0, 0, len *= ICU_LEN(str));
451
+ for (i = 0; i < len; i += ICU_LEN(str)) {
452
+ ustr_splice_units(USTRING(str2), i, 0, ICU_PTR(str), ICU_LEN(str));
453
+ }
454
+ ICU_PTR(str2)[ICU_LEN(str2)] = 0;
455
+
456
+ OBJ_INFECT(str2, str);
457
+
458
+ return str2;
459
+ }
460
+
461
+
462
+ /**
463
+ * call-seq:
464
+ * str << other_str => str
465
+ * str.concat(other_str) => str
466
+ *
467
+ * Append---Concatenates the given string object to <i>str</i>.
468
+ *
469
+ * a = "hello ".u
470
+ * a << "world".u #=> "hello world"
471
+ */
472
+
473
+ VALUE
474
+ icu_ustr_concat(str1, str2)
475
+ VALUE str1,
476
+ str2;
477
+ {
478
+ icu_check_frozen(str1);
479
+ Check_Class(str2, rb_cUString);
480
+ if (ICU_LEN(str2) > 0) {
481
+ ustr_splice_units(USTRING(str1), ICU_LEN(str1), 0, ICU_PTR(str2), ICU_LEN(str2));
482
+ OBJ_INFECT(str1, str2);
483
+ }
484
+ return str1;
485
+ }
486
+
487
+ int
488
+ icu_ustr_hash(str)
489
+ VALUE str;
490
+ {
491
+ register long len = ICU_LEN(str) * (sizeof(UChar));
492
+ register char *p = (char*)ICU_PTR(str);
493
+ register int key = 0;
494
+
495
+ while (len--) {
496
+ key += *p++;
497
+ key += (key << 10);
498
+ key ^= (key >> 6);
499
+ }
500
+ key += (key << 3);
501
+ key ^= (key >> 11);
502
+ key += (key << 15);
503
+ return key;
504
+ }
505
+
506
+ /**
507
+ * call-seq:
508
+ * str.hash => fixnum
509
+ *
510
+ * Return a hash based on the string's length and content.
511
+ */
512
+
513
+ VALUE
514
+ icu_ustr_hash_m(str)
515
+ VALUE str;
516
+ {
517
+ int key = icu_ustr_hash(str);
518
+ return INT2FIX(key);
519
+ }
520
+
521
+ VALUE
522
+ icu_ustr_dup(str)
523
+ VALUE str;
524
+ {
525
+ VALUE dup = icu_ustr_new(ICU_PTR(str), ICU_LEN(str));
526
+ return dup;
527
+ }
528
+
529
+ /**
530
+ * call-seq:
531
+ * str.upcase!(locale = "") => str or nil
532
+ *
533
+ * Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
534
+ * were made. This method is locale-sensitive.
535
+ */
536
+
537
+ VALUE
538
+ icu_ustr_upcase_bang(argc, argv, str)
539
+ int argc;
540
+ VALUE * argv;
541
+ VALUE str;
542
+
543
+ {
544
+ UErrorCode error = 0;
545
+ UChar *buf = 0;
546
+ long len ;
547
+ VALUE loc;
548
+ char * locale = NULL;
549
+ icu_check_frozen(str);
550
+ buf = ALLOC_N(UChar, ICU_LEN(str) + 1);
551
+ if (rb_scan_args(argc, argv, "01", &loc) == 1) {
552
+ if( loc != Qnil) {
553
+ Check_Type(loc, T_STRING);
554
+ locale = RSTRING(loc)->ptr;
555
+ }
556
+ }
557
+
558
+ len = u_strToUpper(buf, ICU_LEN(str), ICU_PTR(str), ICU_LEN(str), locale, &error);
559
+ if (U_BUFFER_OVERFLOW_ERROR == error) {
560
+ REALLOC_N(buf, UChar, len + 1);
561
+ error = 0;
562
+ len =
563
+ u_strToUpper(buf, len, ICU_PTR(str), ICU_LEN(str), locale, &error);
564
+ }
565
+ if (0 == u_strncmp(buf, ICU_PTR(str), len))
566
+ return Qnil;
567
+ free(ICU_PTR(str));
568
+ ICU_PTR(str) = buf;
569
+ ICU_LEN(str) = len;
570
+ return str;
571
+ }
572
+
573
+
574
+ /**
575
+ * call-seq:
576
+ * str.upcase(locale = "") => new_str
577
+ *
578
+ * Returns a copy of <i>str</i> with all lowercase letters replaced with their
579
+ * uppercase counterparts. The operation is locale sensitive.
580
+ *
581
+ * "hEllO".u.upcase #=> "HELLO"
582
+ */
583
+
584
+ VALUE
585
+ icu_ustr_upcase(argc, argv, str)
586
+ int argc;
587
+ VALUE * argv;
588
+ VALUE str;
589
+
590
+ {
591
+ str = icu_ustr_dup(str);
592
+ icu_ustr_upcase_bang(argc, argv, str);
593
+ return str;
594
+ }
595
+
596
+
597
+ /**
598
+ * call-seq:
599
+ * str.downcase!(locale = "") => str or nil
600
+ *
601
+ * Downcases the contents of <i>str</i>, returning <code>nil</code> if no
602
+ * changes were made.
603
+ */
604
+
605
+ VALUE
606
+ icu_ustr_downcase_bang(argc, argv, str)
607
+ int argc;
608
+ VALUE * argv;
609
+ VALUE str;
610
+ {
611
+ UErrorCode error = 0;
612
+ UChar *buf;
613
+ long len ;
614
+ VALUE loc;
615
+ char * locale = NULL;
616
+ buf = ALLOC_N(UChar, ICU_LEN(str) + 1);
617
+ icu_check_frozen(str);
618
+ if (rb_scan_args(argc, argv, "01", &loc) == 1) {
619
+ if( loc != Qnil) {
620
+ Check_Type(loc, T_STRING);
621
+ locale = RSTRING(loc)->ptr;
622
+ }
623
+ }
624
+ len =
625
+ u_strToLower(buf, ICU_LEN(str), ICU_PTR(str), ICU_LEN(str), locale,
626
+ &error);
627
+ if (U_BUFFER_OVERFLOW_ERROR == error) {
628
+ REALLOC_N(buf, UChar, len + 1);
629
+ error = 0;
630
+ len =
631
+ u_strToLower(buf, len , ICU_PTR(str), ICU_LEN(str), locale,
632
+ &error);
633
+ }
634
+ if (0 == u_strncmp(buf, ICU_PTR(str), len))
635
+ return Qnil;
636
+ free(ICU_PTR(str));
637
+ ICU_PTR(str) = buf;
638
+ ICU_LEN(str) = len;
639
+ return str;
640
+ }
641
+
642
+ /**
643
+ * call-seq:
644
+ * str.downcase(locale = "") => new_str
645
+ *
646
+ * Returns a copy of <i>str</i> with all uppercase letters replaced with their
647
+ * lowercase counterparts. The operation is locale sensitive.
648
+ *
649
+ * "hEllO".u.downcase #=> "hello"
650
+ */
651
+
652
+ VALUE
653
+ icu_ustr_downcase(argc, argv, str)
654
+ int argc;
655
+ VALUE * argv;
656
+ VALUE str;
657
+ {
658
+ str = icu_ustr_dup(str);
659
+ icu_ustr_downcase_bang(argc, argv, str);
660
+ return str;
661
+ }
662
+
663
+ /**
664
+ * call-seq:
665
+ * str.foldcase
666
+ *
667
+ * Case-fold the characters in a string.
668
+ * Case-folding is locale-independent and not context-sensitive.
669
+ *
670
+ */
671
+ VALUE
672
+ icu_ustr_foldcase(str)
673
+ VALUE str;
674
+ {
675
+ UErrorCode error = 0;
676
+ UChar *buf;
677
+ long len, capa ;
678
+ capa = ICU_LEN(str) + 1;
679
+ buf = ALLOC_N(UChar, capa);
680
+ len = u_strFoldCase(buf, capa-1, ICU_PTR(str), ICU_LEN(str), U_FOLD_CASE_DEFAULT, &error);
681
+ if (U_BUFFER_OVERFLOW_ERROR == error) {
682
+ capa = len + 1;
683
+ REALLOC_N(buf, UChar, len + 1);
684
+ error = 0;
685
+ len = u_strFoldCase(buf, capa, ICU_PTR(str), ICU_LEN(str), U_FOLD_CASE_DEFAULT, &error);
686
+ }
687
+ return icu_ustr_new_set(buf, len, capa) ;
688
+ }
689
+
690
+ static long
691
+ icu_ustr_index(str, sub, offset)
692
+ VALUE str,
693
+ sub;
694
+ long offset;
695
+ {
696
+ long pos;
697
+ UChar *found;
698
+ if (offset < 0) {
699
+ offset += ICU_LEN(str);
700
+ if (offset < 0)
701
+ return -1;
702
+ }
703
+ if (ICU_LEN(str) - offset < ICU_LEN(sub))
704
+ return -1;
705
+ if (ICU_LEN(sub) == 0)
706
+ return offset;
707
+ found =
708
+ u_strFindFirst(ICU_PTR(str) + offset, ICU_LEN(str) - offset,
709
+ ICU_PTR(sub), ICU_LEN(sub));
710
+ if (NULL == found)
711
+ return -1;
712
+ pos = found - (ICU_PTR(str) + offset);
713
+ return pos + offset;
714
+ }
715
+
716
+ /**
717
+ * call-seq:
718
+ * str.index(substring [, offset]) => fixnum or nil
719
+ * str.index(regexp [, offset]) => fixnum or nil
720
+ *
721
+ * Returns the index of the first occurrence of the given <i>substring</i>,
722
+ * or pattern (<i>regexp</i>) in <i>str</i>. Returns
723
+ * <code>nil</code> if not found. If the second parameter is present, it
724
+ * specifies the position in the string to begin the search.
725
+ *
726
+ * "hello".u.index('e'.u) #=> 1
727
+ * "hello".u.index('lo'.u) #=> 3
728
+ * "hello".u.index('a'.u) #=> nil
729
+ * "hello".u.index(/[aeiou]/.U, -3) #=> 4
730
+ */
731
+
732
+ VALUE
733
+ icu_ustr_index_m(argc, argv, str)
734
+ int argc;
735
+ VALUE *argv;
736
+ VALUE str;
737
+ {
738
+ VALUE sub;
739
+ VALUE initpos;
740
+ long pos ;
741
+ int processed = 0;
742
+
743
+ if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
744
+ pos = NUM2LONG(initpos);
745
+ } else {
746
+ pos = 0;
747
+ }
748
+ if (pos < 0) {
749
+ pos += ICU_LEN(str);
750
+ }
751
+
752
+ if( CLASS_OF(sub) == rb_cUString) {
753
+ pos = icu_ustr_index(str, sub, pos);
754
+ processed = 1;
755
+ }
756
+ if( CLASS_OF(sub) == rb_cURegexp) {
757
+ pos = icu_reg_search(sub, str, pos, 0);
758
+ processed = 1;
759
+ }
760
+ if(! processed ) {
761
+ rb_raise(rb_eTypeError, "Wrong Type, expected UString or URegexp, got %s", rb_class2name(CLASS_OF(sub)));
762
+ }
763
+
764
+ if (pos == -1)
765
+ return Qnil;
766
+ return LONG2NUM(pos);
767
+ }
768
+
769
+ static long
770
+ icu_ustr_rindex(str, sub, pos)
771
+ VALUE str,
772
+ sub;
773
+ long pos;
774
+ {
775
+ long len = ICU_LEN(sub);
776
+ UChar *found;
777
+
778
+ /*
779
+ * substring longer than string
780
+ */
781
+ if (ICU_LEN(str) < len)
782
+ return -1;
783
+ if (ICU_LEN(str) - pos < len) {
784
+ pos = ICU_LEN(str) - len;
785
+ }
786
+ found = u_strFindLast(ICU_PTR(str), pos, ICU_PTR(sub), ICU_LEN(sub));
787
+ if (NULL == found)
788
+ return -1;
789
+ pos = found - (ICU_PTR(str));
790
+ return pos;
791
+ }
792
+
793
+
794
+ /**
795
+ * call-seq:
796
+ * str.rindex(substring [, fixnum]) => fixnum or nil
797
+ * str.rindex(regexp [, fixnum]) => fixnum or nil
798
+ *
799
+ * Returns the index of the last occurrence of the given <i>substring</i>,
800
+ * or pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
801
+ * found. If the second parameter is present, it specifies the position in the
802
+ * string to end the search---characters beyond this point will not be considered.
803
+ *
804
+ * "hello".u.rindex('e'.u) #=> 1
805
+ * "hello".u.rindex('l'.u) #=> 3
806
+ * "hello".u.rindex('a'.u) #=> nil
807
+ * "hello".u.rindex(/[aeiou]/.U, -2) #=> 1
808
+ */
809
+
810
+ VALUE
811
+ icu_ustr_rindex_m(argc, argv, str)
812
+ int argc;
813
+ VALUE *argv;
814
+ VALUE str;
815
+ {
816
+ VALUE sub;
817
+ VALUE position;
818
+ long pos;
819
+
820
+ if (rb_scan_args(argc, argv, "11", &sub, &position) == 2) {
821
+ pos = NUM2LONG(position);
822
+ if (pos < 0) {
823
+ pos += ICU_LEN(str);
824
+ if (pos < 0) {
825
+ return Qnil;
826
+ }
827
+ }
828
+ if (pos > ICU_LEN(str))
829
+ pos = ICU_LEN(str);
830
+ } else {
831
+ pos = ICU_LEN(str);
832
+ }
833
+
834
+ switch (TYPE(sub)) {
835
+ case T_DATA:
836
+ if (CLASS_OF(sub) == rb_cUString) {
837
+ pos = icu_ustr_rindex(str, sub, pos);
838
+ if (pos >= 0)
839
+ return LONG2NUM(pos);
840
+ break;
841
+ }
842
+ if (CLASS_OF(sub) == rb_cURegexp) {
843
+ pos = icu_reg_search(sub, str, pos, 1);
844
+ if (pos >= 0)
845
+ return LONG2NUM(pos);
846
+ break;
847
+ }
848
+
849
+ default:
850
+ rb_raise(rb_eTypeError, "type mismatch: %s given",
851
+ rb_obj_classname(sub));
852
+ }
853
+ return Qnil;
854
+ }
855
+
856
+ /**
857
+ * call-seq:
858
+ * str.lstrip! => self or nil
859
+ *
860
+ * Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
861
+ * change was made. See also <code>UString#rstrip!</code> and
862
+ * <code>UString#strip!</code>, in all these methods whitespace is an
863
+ * Unicode char that has White_Space property.
864
+ *
865
+ * " hello ".u.lstrip #=> "hello "
866
+ * "hello".u.lstrip! #=> nil
867
+ */
868
+
869
+ VALUE
870
+ icu_ustr_lstrip_bang(str)
871
+ VALUE str;
872
+ {
873
+ UChar *s;
874
+ int32_t i,
875
+ n,
876
+ c;
877
+ icu_check_frozen(str);
878
+ s = ICU_PTR(str);
879
+ n = ICU_LEN(str);
880
+ if (!s || n == 0)
881
+ return Qnil;
882
+ /*
883
+ * remove spaces at head
884
+ */
885
+ i = 0;
886
+ U16_GET(s, 0, i, n, c); /* care about surrogates */
887
+ while (i < n && u_isUWhiteSpace(c)) {
888
+ U16_NEXT(s, i, n, c); /* care surr */
889
+ }
890
+
891
+ if (i > 0) {
892
+ if(! u_isUWhiteSpace(c)) --i;
893
+ ICU_LEN(str) = n - i;
894
+ u_memmove(ICU_PTR(str), s + i, ICU_LEN(str));
895
+ ICU_PTR(str)[ICU_LEN(str)] = 0;
896
+ return str;
897
+ }
898
+ return Qnil;
899
+ }
900
+
901
+
902
+ /**
903
+ * call-seq:
904
+ * str.lstrip => new_str
905
+ *
906
+ * Returns a copy of <i>str</i> with leading whitespace removed. See also
907
+ * <code>UString#rstrip</code> and <code>UString#strip</code>.
908
+ *
909
+ * " hello ".u.lstrip #=> "hello "
910
+ * "hello".u.lstrip #=> "hello"
911
+ */
912
+
913
+ VALUE
914
+ icu_ustr_lstrip(str)
915
+ VALUE str;
916
+ {
917
+ str = icu_ustr_dup(str);
918
+ icu_ustr_lstrip_bang(str);
919
+ return str;
920
+ }
921
+
922
+
923
+ /**
924
+ * call-seq:
925
+ * str.rstrip! => self or nil
926
+ *
927
+ * Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
928
+ * no change was made. See also <code>UString#lstrip!</code> and
929
+ * <code>UString#strip!</code>.
930
+ *
931
+ * " hello ".u.rstrip #=> " hello"
932
+ * "hello".u.rstrip! #=> nil
933
+ */
934
+
935
+ VALUE
936
+ icu_ustr_rstrip_bang(str)
937
+ VALUE str;
938
+ {
939
+ UChar *s;
940
+ int32_t i,
941
+ n,
942
+ c;
943
+
944
+ icu_check_frozen(str);
945
+ s = ICU_PTR(str);
946
+ n = ICU_LEN(str);
947
+
948
+ if (!s || n == 0)
949
+ return Qnil;
950
+ i = n - 1;
951
+
952
+ U16_GET(s, 0, n - 1, n, c); /* care surrogates */
953
+ i = n;
954
+ /*
955
+ * remove trailing spaces
956
+ */
957
+ while (i > 0 && u_isUWhiteSpace(c)) {
958
+ U16_PREV(s, 0, i, c); /* care surrogates */
959
+ }
960
+
961
+ if (i < n) {
962
+ if(! u_isUWhiteSpace(c)) ++i;
963
+ ICU_LEN(str) = i;
964
+ ICU_PTR(str)[i] = 0;
965
+ return str;
966
+ }
967
+ return Qnil;
968
+ }
969
+
970
+
971
+ /**
972
+ * call-seq:
973
+ * str.rstrip => new_str
974
+ *
975
+ * Returns a copy of <i>str</i> with trailing whitespace removed. See also
976
+ * <code>UString#lstrip</code> and <code>UString#strip</code>.
977
+ *
978
+ * " hello ".u.rstrip #=> " hello"
979
+ * "hello".u.rstrip #=> "hello"
980
+ */
981
+
982
+ VALUE
983
+ icu_ustr_rstrip(str)
984
+ VALUE str;
985
+ {
986
+ str = icu_ustr_dup(str);
987
+ icu_ustr_rstrip_bang(str);
988
+ return str;
989
+ }
990
+
991
+
992
+ /**
993
+ * call-seq:
994
+ * str.strip! => str or nil
995
+ *
996
+ * Removes leading and trailing whitespace from <i>str</i>. Returns
997
+ * <code>nil</code> if <i>str</i> was not altered.
998
+ */
999
+
1000
+ VALUE
1001
+ icu_ustr_strip_bang(str)
1002
+ VALUE str;
1003
+ {
1004
+ VALUE l = icu_ustr_lstrip_bang(str);
1005
+ VALUE r = icu_ustr_rstrip_bang(str);
1006
+
1007
+ if (NIL_P(l) && NIL_P(r))
1008
+ return Qnil;
1009
+ return str;
1010
+ }
1011
+
1012
+
1013
+ /**
1014
+ * call-seq:
1015
+ * str.strip => new_str
1016
+ *
1017
+ * Returns a copy of <i>str</i> with leading and trailing whitespace removed.
1018
+ *
1019
+ * " hello ".u.strip #=> "hello"
1020
+ * "\tgoodbye\r\n".u.strip #=> "goodbye"
1021
+ */
1022
+
1023
+ VALUE
1024
+ icu_ustr_strip(str)
1025
+ VALUE str;
1026
+ {
1027
+ str = icu_ustr_dup(str);
1028
+ icu_ustr_strip_bang(str);
1029
+ return str;
1030
+ }
1031
+
1032
+
1033
+
1034
+ /* ----------------------------------- */
1035
+ VALUE
1036
+ icu_ustr_normalize(str, mode)
1037
+ VALUE str;
1038
+ int32_t mode;
1039
+ {
1040
+ UErrorCode error = U_ZERO_ERROR;
1041
+ long capa = ICU_LEN(str);
1042
+ UChar *buf;
1043
+ long needed;
1044
+ VALUE ret;
1045
+ if (UNORM_YES == unorm_quickCheck(ICU_PTR(str), ICU_LEN(str), mode, &error))
1046
+ return icu_ustr_dup(str);
1047
+
1048
+ buf = ALLOC_N(UChar, capa + 20);
1049
+ do {
1050
+ error = 0;
1051
+ needed =
1052
+ unorm_normalize(ICU_PTR(str), ICU_LEN(str), mode, 0, buf, capa,
1053
+ &error);
1054
+ if (U_SUCCESS(error)) {
1055
+ ret = icu_ustr_new_set(buf, needed, capa);
1056
+ return ret;
1057
+ }
1058
+ if (error == U_BUFFER_OVERFLOW_ERROR) {
1059
+ capa = needed + 1;
1060
+ REALLOC_N(buf, UChar, capa);
1061
+ if (!buf)
1062
+ rb_raise(rb_eRuntimeError, "can't allocate memory");
1063
+ } else
1064
+ rb_raise(rb_eArgError, u_errorName(error));
1065
+ }
1066
+ while (1);
1067
+ }
1068
+
1069
+ /**
1070
+ * UNORM_NFKC Compatibility decomposition followed by canonical
1071
+ * composition.
1072
+ */
1073
+ VALUE
1074
+ icu_ustr_normalize_KC(str)
1075
+ VALUE str;
1076
+ {
1077
+ return icu_ustr_normalize(str, UNORM_NFKC);
1078
+ }
1079
+
1080
+ /**
1081
+ * UNORM_NFKD Compatibility decomposition.
1082
+ */
1083
+ VALUE
1084
+ icu_ustr_normalize_KD(str)
1085
+ VALUE str;
1086
+ {
1087
+ return icu_ustr_normalize(str, UNORM_NFKD);
1088
+ }
1089
+
1090
+ /**
1091
+ * UNORM_NFD Canonical decomposition.
1092
+ */
1093
+ VALUE
1094
+ icu_ustr_normalize_D(str)
1095
+ VALUE str;
1096
+ {
1097
+ return icu_ustr_normalize(str, UNORM_NFD);
1098
+ }
1099
+
1100
+ /**
1101
+ * UNORM_FCD
1102
+ */
1103
+ VALUE
1104
+ icu_ustr_normalize_FCD(VALUE str)
1105
+ {
1106
+ return icu_ustr_normalize(str, UNORM_FCD);
1107
+ }
1108
+
1109
+ /**
1110
+ * UNORM_NFC Canonical decomposition followed by canonical composition.
1111
+ */
1112
+ VALUE
1113
+ icu_ustr_normalize_C(str)
1114
+ VALUE str;
1115
+ {
1116
+ return icu_ustr_normalize(str, UNORM_NFC);
1117
+ }
1118
+
1119
+ /* UBRK_CHARACTER, UBRK_WORD, UBRK_LINE, UBRK_SENTENCE */
1120
+ VALUE
1121
+ icu_ustr_each_mode(argc, argv, str, mode)
1122
+ int argc;
1123
+ VALUE *argv;
1124
+ VALUE str;
1125
+ int32_t mode;
1126
+ {
1127
+ UErrorCode error = 0;
1128
+ UBreakIterator *boundary;
1129
+ int32_t end, start;
1130
+ VALUE loc ;
1131
+ char *locale = "";
1132
+ if( rb_scan_args(argc, argv, "01", &loc) == 1) {
1133
+ Check_Type(loc, T_STRING);
1134
+ locale = RSTRING(loc)->ptr;
1135
+ }
1136
+ boundary =
1137
+ ubrk_open(mode, locale, ICU_PTR(str), ICU_LEN(str),
1138
+ &error);
1139
+ if (U_FAILURE(error))
1140
+ rb_raise(rb_eArgError, "Error %s", u_errorName(error));
1141
+ start = ubrk_first(boundary);
1142
+ USTRING(str)->busy = 1;
1143
+ for (end = ubrk_next(boundary); end != UBRK_DONE;
1144
+ start = end, end = ubrk_next(boundary)) {
1145
+ rb_yield(icu_ustr_new(ICU_PTR(str) + start, end - start));
1146
+ }
1147
+ USTRING(str)->busy = 0;
1148
+ ubrk_close(boundary);
1149
+ return str;
1150
+ }
1151
+
1152
+ /**
1153
+ * call-seq:
1154
+ * str.each_word(locale = "") {|substr| block } => str
1155
+ *
1156
+ * Word boundary analysis is used by search and replace functions, as well as within text editing
1157
+ * applications that allow the user to select words with a double click. Word selection provides
1158
+ * correct interpretation of punctuation marks within and following words. Characters that are not
1159
+ * part of a word, such as symbols or punctuation marks, have word-breaks on both sides.
1160
+ *
1161
+ */
1162
+ VALUE
1163
+ icu_ustr_each_word(argc, argv, str)
1164
+ int argc;
1165
+ VALUE *argv;
1166
+ VALUE str;
1167
+
1168
+ {
1169
+ return icu_ustr_each_mode(argc, argv, str, UBRK_WORD);
1170
+ }
1171
+
1172
+ /**
1173
+ * call-seq:
1174
+ * str.each_char(locale = "") {|substr| block } => str
1175
+ *
1176
+ * Character boundary analysis allows users to interact with characters as they expect to,
1177
+ * for example, when moving the cursor through a text string. Character boundary analysis provides
1178
+ * correct navigation through character strings, regardless of how the character is stored.
1179
+ * For example, an accented character might be stored as a base character and a diacritical mark.
1180
+ * What users consider to be a character can differ between languages.
1181
+ *
1182
+ */
1183
+ VALUE
1184
+ icu_ustr_each_char(argc, argv, str)
1185
+ int argc;
1186
+ VALUE *argv;
1187
+ VALUE str;
1188
+
1189
+ {
1190
+ return icu_ustr_each_mode(argc, argv, str, UBRK_CHARACTER);
1191
+ }
1192
+
1193
+ /**
1194
+ * call-seq:
1195
+ * str.each_line_break(locale = "") {|substr| block } => str
1196
+ *
1197
+ * Line boundary analysis determines where a text string can be broken when line-wrapping.
1198
+ * The mechanism correctly handles punctuation and hyphenated words.
1199
+ *
1200
+ */
1201
+ VALUE
1202
+ icu_ustr_each_line(argc, argv, str)
1203
+ int argc;
1204
+ VALUE *argv;
1205
+ VALUE str;
1206
+
1207
+ {
1208
+ return icu_ustr_each_mode(argc, argv, str, UBRK_LINE);
1209
+ }
1210
+
1211
+ /**
1212
+ * call-seq:
1213
+ * str.each_sentence(locale = "") {|substr| block } => str
1214
+ *
1215
+ * Sentence boundary analysis allows selection with correct interpretation of periods
1216
+ * within numbers and abbreviations, and trailing punctuation marks such as quotation marks and parentheses.
1217
+ *
1218
+ */
1219
+ VALUE
1220
+ icu_ustr_each_sentence(argc, argv, str)
1221
+ int argc;
1222
+ VALUE *argv;
1223
+ VALUE str;
1224
+ {
1225
+ return icu_ustr_each_mode(argc, argv, str, UBRK_SENTENCE);
1226
+ }
1227
+
1228
+ /**
1229
+ * call-seq:
1230
+ * str.to_u(encoding = 'utf8') => UString
1231
+ *
1232
+ * Returns self.
1233
+ */
1234
+ VALUE
1235
+ icu_ustr_to_ustr(argc, argv, str)
1236
+ int argc;
1237
+ VALUE *argv;
1238
+ VALUE str;
1239
+ {
1240
+ return str;
1241
+ }
1242
+
1243
+ /**
1244
+ * call-seq:
1245
+ * str.to_s(encoding = 'utf8') => String
1246
+ *
1247
+ * Converts to Ruby String (byte-oriented) value in given encoding.
1248
+ * When no encoding is given, assumes UTF-8.
1249
+ */
1250
+ VALUE
1251
+ icu_ustr_to_rstr(argc, argv, str)
1252
+ int argc;
1253
+ VALUE *argv,
1254
+ str;
1255
+ {
1256
+ VALUE enc;
1257
+ char *encoding = 0; /* default */
1258
+ UErrorCode error = 0;
1259
+ UConverter *conv ;
1260
+ int enclen, needed = 0;
1261
+ char * buf;
1262
+ VALUE s;
1263
+ if (rb_scan_args(argc, argv, "01", &enc) == 1) {
1264
+ Check_Type(enc, T_STRING);
1265
+ encoding = RSTRING(enc)->ptr;
1266
+ }
1267
+
1268
+ enclen = ICU_LEN(str) + 1;
1269
+ buf = ALLOC_N(char, enclen);
1270
+
1271
+ if( !encoding || !strncmp(encoding, "utf8", 4)){
1272
+ u_strToUTF8( buf, enclen, &needed, ICU_PTR(str), ICU_LEN(str), &error);
1273
+ if (U_BUFFER_OVERFLOW_ERROR == error) {
1274
+ REALLOC_N(buf, char, needed + 1);
1275
+ error = 0;
1276
+ u_strToUTF8( buf, needed, &needed, ICU_PTR(str), ICU_LEN(str), &error);
1277
+ }
1278
+ if( U_FAILURE(error) ){
1279
+ free(buf);
1280
+ rb_raise(rb_eArgError, u_errorName(error));
1281
+ }
1282
+ s = rb_str_new(buf, needed);
1283
+
1284
+ } else {
1285
+ conv = ucnv_open(encoding, &error);
1286
+ if (U_FAILURE(error)) {
1287
+ ucnv_close(conv);
1288
+ free(buf);
1289
+ rb_raise(rb_eArgError, u_errorName(error));
1290
+ }
1291
+ enclen =
1292
+ ucnv_fromUChars(conv, buf, enclen, ICU_PTR(str), ICU_LEN(str),
1293
+ &error);
1294
+ if (U_BUFFER_OVERFLOW_ERROR == error) {
1295
+ REALLOC_N(buf, char, enclen + 1);
1296
+ error = 0;
1297
+ ucnv_fromUChars(conv, buf, enclen, ICU_PTR(str), ICU_LEN(str),
1298
+ &error);
1299
+ }
1300
+ if( U_FAILURE(error) ){
1301
+ free(buf);
1302
+ rb_raise(rb_eArgError, u_errorName(error));
1303
+ }
1304
+ s = rb_str_new(buf, enclen);
1305
+ ucnv_close(conv);
1306
+ }
1307
+ free(buf);
1308
+ return s;
1309
+ }
1310
+
1311
+ /* -------------- */
1312
+ extern VALUE icu_format(UChar * pattern, int32_t len, VALUE args,
1313
+ int32_t arg_len, char *locale);
1314
+ /**
1315
+ * call-seq:
1316
+ * str.format(locale, [*args])
1317
+ *
1318
+ * Powerful locale-sensitive message formatting. see [./docs/FORMATTING]
1319
+ *
1320
+ * Valid argument types are: +Fixnum+, +UString+, +Float+, +Time+ .
1321
+ *
1322
+ * */
1323
+ VALUE
1324
+ icu_ustr_format(str, args)
1325
+ VALUE str,
1326
+ args;
1327
+ {
1328
+ VALUE loc;
1329
+ Check_Type(args, T_ARRAY);
1330
+ loc = rb_ary_shift(args);
1331
+ Check_Type(loc, T_STRING);
1332
+ return icu_format(ICU_PTR(str), ICU_LEN(str), args, RARRAY(args)->len,
1333
+ RSTRING(loc)->ptr);
1334
+ }
1335
+
1336
+ /* ------ UString regexp related functions ---- */
1337
+
1338
+ /**
1339
+ * call-seq:
1340
+ * str =~ uregexp => UMatch or nil
1341
+ * str =~ other_str => integer or nil
1342
+ *
1343
+ * Match---If <code>URegexp</code> is given, use it as a pattern to
1344
+ * match against <i>uregexp</i> and return UMatch or +nil+.
1345
+ *
1346
+ * If <code>UString</code> is given, returns index of it
1347
+ * (similar to <code>UString#index</code>).
1348
+ *
1349
+ * Otherwise returns +nil+
1350
+ *
1351
+ * "cat o' 9 tails".u =~ '\d' #=> nil
1352
+ * "cat o' 9 tails".u =~ /\d/.U #=> #<UMatch:0xf6fb7d5c @cg=[<U000039>]>
1353
+ * "cat o' 9 tails".u =~ 9 #=> nil
1354
+ * "cat o' 9 tails".u =~ '9'.u #=> 7
1355
+ */
1356
+
1357
+ VALUE
1358
+ icu_ustr_match(x, y)
1359
+ VALUE x,
1360
+ y;
1361
+ {
1362
+ long pos ;
1363
+ if (TYPE(y) == T_REGEXP){
1364
+ rb_raise(rb_eTypeError, "Wrong type: can't match against Regexp. Use URegexp instead");
1365
+ }
1366
+ if (CLASS_OF(y) == rb_cURegexp) {
1367
+ return icu_reg_match(y, x);
1368
+ } else if (CLASS_OF(y) == rb_cUString) {
1369
+ pos = icu_ustr_index(x, y, 0);
1370
+ if (pos == -1) return Qnil;
1371
+ else return LONG2NUM(pos);
1372
+ } else {
1373
+ return Qnil;
1374
+ }
1375
+ }
1376
+
1377
+ VALUE
1378
+ get_pat(pat, quote)
1379
+ VALUE pat;
1380
+ int quote;
1381
+ {
1382
+ if (CLASS_OF(pat) == rb_cURegexp)
1383
+ return pat;
1384
+
1385
+ if (CLASS_OF(pat) == rb_cUString)
1386
+ return icu_reg_comp(pat);
1387
+ Check_Class(pat, rb_cURegexp);
1388
+ return Qnil;
1389
+ }
1390
+
1391
+
1392
+ /**
1393
+ * call-seq:
1394
+ * str.match(pattern) => matchdata or nil
1395
+ *
1396
+ * Converts <i>pattern</i> to a <code>URegexp</code> (if it isn't already one),
1397
+ * then invokes its <code>match</code> method on <i>str</i>.
1398
+ *
1399
+ * 'hello'.u.match('(.)\1'.u) #=> #<UMatch:0x401b3d30>
1400
+ * 'hello'.u.match('(.)\1'.u)[0] #=> "ll"
1401
+ * 'hello'.u.match(/(.)\1/.U)[0] #=> "ll"
1402
+ * 'hello'.u.match('xx') #=> nil
1403
+ */
1404
+
1405
+ VALUE
1406
+ icu_ustr_match_m(str, re)
1407
+ VALUE str,
1408
+ re;
1409
+ {
1410
+ return rb_funcall(get_pat(re, 0), rb_intern("match"), 1, str);
1411
+ }
1412
+
1413
+ VALUE
1414
+ ustr_scan_once(str, pat, start)
1415
+ VALUE str,
1416
+ pat;
1417
+ long *start;
1418
+ {
1419
+ VALUE result;
1420
+ long i;
1421
+ long beg,
1422
+ end, num_regs;
1423
+
1424
+ if (icu_reg_search(pat, str, *start, 0) >= 0) {
1425
+ icu_reg_range(pat, 0, &beg, &end);
1426
+ if (beg == end) {
1427
+ *start = end + 1;
1428
+ } else {
1429
+ *start = end;
1430
+ }
1431
+ num_regs = icu_group_count(pat);
1432
+ if (num_regs <= 1) {
1433
+ return icu_reg_nth_match(pat, 0);
1434
+ }
1435
+ result = rb_ary_new2(num_regs);
1436
+ for (i = 1; i <= num_regs; i++) {
1437
+ rb_ary_store(result, i - 1, icu_reg_nth_match(pat, i));
1438
+ }
1439
+
1440
+ return result;
1441
+ }
1442
+ return Qnil;
1443
+ }
1444
+
1445
+
1446
+ /**
1447
+ * call-seq:
1448
+ * str.scan(pattern) => array
1449
+ * str.scan(pattern) {|match, ...| block } => str
1450
+ *
1451
+ * Both forms iterate through <i>str</i>, matching the pattern (which may be a
1452
+ * <code>URegexp</code> or a <code>UString</code>). For each match, a result is
1453
+ * generated and either added to the result array or passed to the block. If
1454
+ * the pattern contains no groups, each individual result consists of the
1455
+ * matched string. If the pattern contains groups, each
1456
+ * individual result is itself an array containing one entry per group.
1457
+ *
1458
+ * a = "cruel world".u
1459
+ * a.scan(/\w+/.U) #=> ["cruel", "world"]
1460
+ * a.scan(/.../.U) #=> ["cru", "el ", "wor"]
1461
+ * a.scan(/(...)/.U) #=> [["cru"], ["el "], ["wor"]]
1462
+ * a.scan(/(..)(..)/.U) #=> [["cr", "ue"], ["l ", "wo"]]
1463
+ *
1464
+ * And the block form:
1465
+ *
1466
+ * a.scan(/\w+/.U) {|w| print "<<#{w}>> " }
1467
+ * print "\n"
1468
+ * a.scan(/(.)(.)/.U) {|a,b| print b, a }
1469
+ * print "\n"
1470
+ *
1471
+ * <em>produces:</em>
1472
+ *
1473
+ * <<cruel>> <<world>>
1474
+ * rceu lowlr
1475
+ */
1476
+
1477
+ VALUE
1478
+ icu_ustr_scan(str, pat)
1479
+ VALUE str,
1480
+ pat;
1481
+ {
1482
+ VALUE result;
1483
+ long start = 0;
1484
+
1485
+ pat = get_pat(pat, 1);
1486
+ if (!rb_block_given_p()) {
1487
+ VALUE ary = rb_ary_new();
1488
+
1489
+ while (!NIL_P(result = ustr_scan_once(str, pat, &start))) {
1490
+ rb_ary_push(ary, result);
1491
+ }
1492
+ return ary;
1493
+ }
1494
+ USTRING(str)->busy = 1;
1495
+ while (!NIL_P(result = ustr_scan_once(str, pat, &start))) {
1496
+ rb_yield(result);
1497
+ }
1498
+ USTRING(str)->busy = 0;
1499
+ return str;
1500
+ }
1501
+ /**
1502
+ * call-seq:
1503
+ * str.char_span(start[, len, [locale]])
1504
+ *
1505
+ * Returns substring starting at <code>start</code>-th char, with <code>len</code> chars length.
1506
+ * Here "char" means "grapheme cluster", so start index and len are measured in terms of "graphemes"
1507
+ * locale parameter is optional.
1508
+ * Negative len can be supplied to receive to end of string.
1509
+ *
1510
+ * String is transformed to NFC before extract.
1511
+ */
1512
+ VALUE
1513
+ icu_ustr_char_span(int argc, VALUE * argv, VALUE str)
1514
+ {
1515
+ UErrorCode error = 0;
1516
+ int32_t end, start, char_start = 0, char_len = -1, total_chars = 0;
1517
+ int32_t init_pos = -1, end_pos = -1, n;
1518
+ char *loc = NULL;
1519
+ VALUE cs, clen, locl, out;
1520
+ UBreakIterator *boundary;
1521
+
1522
+ n = rb_scan_args(argc, argv, "12", &cs, &clen, &locl);
1523
+ Check_Type(cs, T_FIXNUM);
1524
+ char_start = FIX2INT(cs);
1525
+ if(char_start < 0) rb_raise(rb_eArgError, "Negative offset aren't allowed!");
1526
+
1527
+ if( n > 1) {
1528
+ Check_Type(clen, T_FIXNUM);
1529
+ char_len = FIX2INT(clen);
1530
+ if(char_len <= 0) char_len = -1;
1531
+ }
1532
+ if( n > 2) {
1533
+ Check_Type(locl, T_STRING);
1534
+ loc = RSTRING(locl)->ptr;
1535
+ }
1536
+ if(UNORM_YES != unorm_quickCheck(ICU_PTR(str), ICU_LEN(str), UNORM_NFC, &error) )
1537
+ str = icu_ustr_normalize_C(str);
1538
+
1539
+ boundary =
1540
+ ubrk_open(UBRK_CHARACTER, loc, ICU_PTR(str), ICU_LEN(str), &error);
1541
+ if (U_FAILURE(error))
1542
+ rb_raise(rb_eArgError, "Error %s", u_errorName(error));
1543
+
1544
+ start = ubrk_first(boundary);
1545
+ for (end = ubrk_next(boundary); end != UBRK_DONE;
1546
+ start = end, end = ubrk_next(boundary)) {
1547
+ if( total_chars == char_start ) init_pos = start;
1548
+ total_chars ++;
1549
+ if( char_len>0 && total_chars == char_start+char_len) end_pos = end;
1550
+ }
1551
+ ubrk_close(boundary);
1552
+ if( init_pos == -1) rb_raise(rb_eArgError, "Char index %d out of bounds %d", char_start, total_chars);
1553
+ if( end_pos == -1) end_pos = ICU_LEN(str); /* reached end of string */
1554
+ out = icu_ustr_new(ICU_PTR(str)+init_pos, end_pos - init_pos);
1555
+ return out;
1556
+ }
1557
+
1558
+ VALUE
1559
+ icu_ustr_chars(str, loc)
1560
+ VALUE str;
1561
+ char *loc;
1562
+ {
1563
+ UErrorCode error = 0;
1564
+ int32_t end, start;
1565
+ VALUE out;
1566
+ UBreakIterator *boundary;
1567
+ if(UNORM_YES != unorm_quickCheck(ICU_PTR(str), ICU_LEN(str), UNORM_NFC, &error) )
1568
+ str = icu_ustr_normalize_C(str);
1569
+
1570
+ boundary =
1571
+ ubrk_open(UBRK_CHARACTER, loc, ICU_PTR(str), ICU_LEN(str), &error);
1572
+ if (U_FAILURE(error))
1573
+ rb_raise(rb_eArgError, "Error %s", u_errorName(error));
1574
+
1575
+ out = rb_ary_new();
1576
+ start = ubrk_first(boundary);
1577
+ for (end = ubrk_next(boundary); end != UBRK_DONE;
1578
+ start = end, end = ubrk_next(boundary)) {
1579
+ rb_ary_push(out, icu_ustr_new(ICU_PTR(str) + start, end - start));
1580
+ }
1581
+ ubrk_close(boundary);
1582
+ return out;
1583
+ }
1584
+
1585
+ /**
1586
+ * call-seq:
1587
+ * str.chars(locale = "") => array of character
1588
+ *
1589
+ * Returns array of character graphemes, locale dependent.
1590
+ * String is transformed to NFC before split.
1591
+ * */
1592
+ VALUE
1593
+ icu_ustr_chars_m(argc, argv, str)
1594
+ int argc;
1595
+ VALUE *argv;
1596
+ VALUE str;
1597
+ {
1598
+ VALUE locale;
1599
+ if (rb_scan_args(argc, argv, "01", &locale) == 1) {
1600
+ Check_Type(locale, T_STRING);
1601
+ return icu_ustr_chars(str, RSTRING(locale)->ptr);
1602
+ } else {
1603
+ return icu_ustr_chars(str, "");
1604
+ }
1605
+ }
1606
+
1607
+ /**
1608
+ * call-seq:
1609
+ * str.split(pattern, [limit]) => anArray
1610
+ *
1611
+ * Divides <i>str</i> into substrings based on a delimiter, returning an array
1612
+ * of these substrings. <i>str</i> is divided where the
1613
+ * pattern matches.
1614
+ *
1615
+ * NOTE: split(//) or split("") is not supported.
1616
+ * To get array of chars use #chars or #codepoints methods
1617
+ *
1618
+ * If the <i>limit</i> parameter is omitted, trailing null fields are
1619
+ * suppressed. If <i>limit</i> is a positive number, at most that number of
1620
+ * fields will be returned (if <i>limit</i> is <code>1</code>, the entire
1621
+ * string is returned as the only entry in an array). If negative, there is no
1622
+ * limit to the number of fields returned, and trailing null fields are not
1623
+ * suppressed.
1624
+ *
1625
+ */
1626
+
1627
+ VALUE
1628
+ icu_ustr_split_m(argc, argv, str)
1629
+ int argc;
1630
+ VALUE *argv;
1631
+ VALUE str;
1632
+ {
1633
+ VALUE spat;
1634
+ VALUE limit;
1635
+ int lim = 0;
1636
+ VALUE result;
1637
+
1638
+ if (rb_scan_args(argc, argv, "11", &spat, &limit) == 2) {
1639
+ lim = NUM2INT(limit);
1640
+ if (lim <= 0)
1641
+ limit = Qnil;
1642
+ }
1643
+ if (CLASS_OF(spat) == rb_cURegexp) {
1644
+ result = icu_reg_split(spat, str, limit);
1645
+ } else {
1646
+ if (CLASS_OF(spat) == rb_cUString) {
1647
+ result = icu_reg_split(icu_reg_comp(spat), str, limit);
1648
+ } else {
1649
+ rb_raise(rb_eArgError, "Expected UString or URegexp, got %s",
1650
+ rb_class2name(CLASS_OF(spat)));
1651
+ }
1652
+ }
1653
+ if (NIL_P(limit) && lim == 0) {
1654
+ while (RARRAY(result)->len > 0 &&
1655
+ ICU_LEN( (RARRAY(result)->ptr[RARRAY(result)->len - 1])) == 0)
1656
+ rb_ary_pop(result);
1657
+ }
1658
+
1659
+ return result;
1660
+ }
1661
+
1662
+ /**
1663
+ * call-seq:
1664
+ * str.inspect => String
1665
+ *
1666
+ * Shows codepoints in form of \uxxxx. For debug purposes.
1667
+ */
1668
+ VALUE
1669
+ icu_ustr_inspect(str)
1670
+ VALUE str;
1671
+ {
1672
+ VALUE buf = rb_str_new2("");
1673
+ char temp[] = "\\u0010FFFF ";
1674
+ int32_t i,
1675
+ n,
1676
+ k,
1677
+ c;
1678
+ UChar *s = ICU_PTR(str);
1679
+ n = ICU_LEN(str);
1680
+ i = 0;
1681
+ while (i < n) {
1682
+ U16_NEXT(s, i, n, c); /* care surrogate */
1683
+ if(c >= 0x10000)
1684
+ k = sprintf(temp, "\\u%08X", c);
1685
+ else
1686
+ k = sprintf(temp, "\\u%04X", c);
1687
+ rb_str_cat(buf, temp, k);
1688
+ }
1689
+ return buf;
1690
+ }
1691
+
1692
+ /**
1693
+ * call-seq:
1694
+ * str.codepoints => array of fixnums
1695
+ *
1696
+ * Returns array of codepoints as fixnums.
1697
+ */
1698
+ VALUE
1699
+ icu_ustr_points(str)
1700
+ VALUE str;
1701
+ {
1702
+ VALUE buf = rb_ary_new();
1703
+ int32_t i,
1704
+ n,
1705
+ c;
1706
+ UChar *s = ICU_PTR(str);
1707
+ n = ICU_LEN(str);
1708
+ i = 0;
1709
+ while (i < n) {
1710
+ U16_NEXT(s, i, n, c); /* care surrogates */
1711
+ rb_ary_push(buf, LONG2NUM(c));
1712
+ }
1713
+ return buf;
1714
+ }
1715
+
1716
+
1717
+ /**
1718
+ * call-seq:
1719
+ * str.inspect_names => String
1720
+ *
1721
+ * Dumps names of codepoints in this UString (debug).
1722
+ */
1723
+ VALUE
1724
+ icu_ustr_inspect_names(str)
1725
+ VALUE str;
1726
+ {
1727
+ VALUE buf = rb_str_new2("");
1728
+ char temp[301];
1729
+ UErrorCode error;
1730
+ int32_t i,
1731
+ n,
1732
+ c,
1733
+ l;
1734
+ UChar *s = ICU_PTR(str);
1735
+ n = ICU_LEN(str);
1736
+ i = 0;
1737
+ while (i < n) {
1738
+ U16_NEXT(s, i, n, c) sprintf(temp, "<U%06X>", c); /* care surrogates */
1739
+ rb_str_cat(buf, temp, 9);
1740
+ error = 0;
1741
+ l = u_charName(c, U_UNICODE_CHAR_NAME, temp, 300, &error);
1742
+ rb_str_cat(buf, temp, l);
1743
+ rb_str_cat(buf, "\n", 1);
1744
+ }
1745
+ return buf;
1746
+ }
1747
+
1748
+ VALUE
1749
+ icu_ustr_subpat(str, re, nth)
1750
+ VALUE str,
1751
+ re;
1752
+ int nth;
1753
+ {
1754
+ if (icu_reg_search(re, str, 0, 0) >= 0) {
1755
+ return icu_reg_nth_match(re, nth);
1756
+ }
1757
+ return Qnil;
1758
+ }
1759
+
1760
+ /* beg len are code unit indexes*/
1761
+ VALUE
1762
+ icu_ustr_substr(str, beg, len)
1763
+ VALUE str;
1764
+ long beg,
1765
+ len;
1766
+ {
1767
+ int32_t str_size;
1768
+ str_size = ICU_LEN(str);
1769
+ if (len < 0) return Qnil;
1770
+
1771
+ if (beg > str_size) return Qnil;
1772
+ if (beg < 0) {
1773
+ beg += str_size;
1774
+ if (beg < 0) return Qnil;
1775
+ }
1776
+ if (beg + len > str_size) {
1777
+ len = str_size - beg;
1778
+ }
1779
+ if (len < 0) {
1780
+ len = 0;
1781
+ }
1782
+ if( len == 0) return icu_ustr_new(0, 0);
1783
+ /* adjust to codepoint boundaries */
1784
+ U16_SET_CP_START(ICU_PTR(str), 0, beg);
1785
+ U16_SET_CP_LIMIT(ICU_PTR(str), 0, len, ICU_LEN(str));
1786
+ return icu_ustr_new(ICU_PTR(str) + beg, len);
1787
+ }
1788
+
1789
+ VALUE
1790
+ icu_ustr_aref(str, indx)
1791
+ VALUE str;
1792
+ VALUE indx;
1793
+ {
1794
+ long idx;
1795
+ int32_t cp_len = ICU_LEN(str);
1796
+
1797
+ switch (TYPE(indx)) {
1798
+ case T_FIXNUM:
1799
+ idx = FIX2LONG(indx);
1800
+
1801
+ num_index:
1802
+ if (idx < 0) {
1803
+ idx = cp_len + idx;
1804
+ }
1805
+ if (idx < 0 || cp_len <= idx) {
1806
+ return Qnil;
1807
+ }
1808
+ return icu_ustr_substr(str, idx, 1);
1809
+
1810
+ case T_DATA:
1811
+ if (CLASS_OF(indx) == rb_cURegexp)
1812
+ return icu_ustr_subpat(str, indx, 0);
1813
+ if (CLASS_OF(indx) == rb_cUString) {
1814
+ if (icu_ustr_index(str, indx, 0) != -1)
1815
+ return icu_ustr_dup(indx);
1816
+ return Qnil;
1817
+ }
1818
+
1819
+ default:
1820
+ /*
1821
+ * check if indx is Range
1822
+ */
1823
+ {
1824
+ long beg,
1825
+ len;
1826
+ switch (rb_range_beg_len(indx, &beg, &len, cp_len, 0)) {
1827
+ case Qfalse:
1828
+ break;
1829
+ case Qnil:
1830
+ return Qnil;
1831
+ default:
1832
+ return icu_ustr_substr(str, beg, len);
1833
+ }
1834
+ }
1835
+ idx = NUM2LONG(indx);
1836
+ goto num_index;
1837
+ }
1838
+ return Qnil; /* not reached */
1839
+ }
1840
+
1841
+ /**
1842
+ * call-seq:
1843
+ * str[fixnum] => new_str or nil
1844
+ * str[fixnum, fixnum] => new_str or nil
1845
+ * str[range] => new_str or nil
1846
+ * str[regexp] => new_str or nil
1847
+ * str[regexp, fixnum] => new_str or nil
1848
+ * str[other_str] => new_str or nil
1849
+ * str.slice(fixnum) => new_str or nil
1850
+ * str.slice(fixnum, fixnum) => new_str or nil
1851
+ * str.slice(range) => new_str or nil
1852
+ * str.slice(regexp) => new_str or nil
1853
+ * str.slice(regexp, fixnum) => new_str or nil
1854
+ * str.slice(other_str) => new_str or nil
1855
+ *
1856
+ * Element Reference---If passed a single <code>Fixnum</code>, returns
1857
+ * substring with the character at that position. If passed two <code>Fixnum</code>
1858
+ * objects, returns a substring starting at the offset given by the first, and
1859
+ * a length given by the second. If given a range, a substring containing
1860
+ * characters at offsets given by the range is returned. In all three cases, if
1861
+ * an offset is negative, it is counted from the end of <i>str</i>. Returns
1862
+ * <code>nil</code> if the initial offset falls outside the string, the length
1863
+ * is negative, or the beginning of the range is greater than the end.
1864
+ *
1865
+ * If a <code>URegexp</code> is supplied, the matching portion of <i>str</i> is
1866
+ * returned. If a numeric parameter follows the regular expression, that
1867
+ * component of the <code>UMatch</code> is returned instead. If a
1868
+ * <code>UString</code> is given, that string is returned if it occurs in
1869
+ * <i>str</i>. In both cases, <code>nil</code> is returned if there is no
1870
+ * match.
1871
+ *
1872
+ * a = "hello there".u
1873
+ * a[1] #=> 'e'
1874
+ * a[1,3] #=> "ell"
1875
+ * a[1..3] #=> "ell"
1876
+ * a[-3,2] #=> "er"
1877
+ * a[-4..-2] #=> "her"
1878
+ * a[12..-1] #=> nil
1879
+ * a[-2..-4] #=> ""
1880
+ * a[/[aeiou](.)\1/.U] #=> "ell"
1881
+ * a[/[aeiou](.)\1/.U, 0] #=> "ell"
1882
+ * a[/[aeiou](.)\1/.U, 1] #=> "l"
1883
+ * a[/[aeiou](.)\1/.U, 2] #=> nil
1884
+ * a["lo".u] #=> "lo"
1885
+ * a["bye".u] #=> nil
1886
+ */
1887
+
1888
+ VALUE
1889
+ icu_ustr_aref_m(argc, argv, str)
1890
+ int argc;
1891
+ VALUE *argv;
1892
+ VALUE str;
1893
+ {
1894
+ if (argc == 2) {
1895
+ if (CLASS_OF(argv[0]) == rb_cURegexp) {
1896
+ return icu_ustr_subpat(str, argv[0], NUM2INT(argv[1]));
1897
+ }
1898
+ return icu_ustr_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
1899
+ }
1900
+ if (argc != 1) {
1901
+ rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)",
1902
+ argc);
1903
+ }
1904
+ return icu_ustr_aref(str, argv[0]);
1905
+ }
1906
+
1907
+ /**
1908
+ * call-seq:
1909
+ * str.sub!(pattern, replacement) => str or nil
1910
+ * str.sub!(pattern) {|match| block } => str or nil
1911
+ *
1912
+ * Performs the substitutions of <code>UString#sub</code> in place,
1913
+ * returning <i>str</i>, or <code>nil</code> if no substitutions were
1914
+ * performed.
1915
+ */
1916
+
1917
+ VALUE
1918
+ icu_ustr_sub_bang(argc, argv, str)
1919
+ int argc;
1920
+ VALUE *argv;
1921
+ VALUE str;
1922
+ {
1923
+ return ustr_gsub(argc, argv, str, 1, 1 );
1924
+ }
1925
+
1926
+
1927
+ /**
1928
+ * call-seq:
1929
+ * str.sub(pattern, replacement) => new_str
1930
+ * str.sub(pattern) {|match| block } => new_str
1931
+ *
1932
+ * Returns a copy of <i>str</i> with the <em>first</em> occurrence of
1933
+ * <i>pattern</i> replaced with either <i>replacement</i> or the value of the
1934
+ * block. The <i>pattern</i> will typically be a <code>URegexp</code>; if it is
1935
+ * a <code>UString</code> then no regular expression metacharacters will be
1936
+ * interpreted (that is <code>/\d/.U</code> will match a digit, but
1937
+ * <code>'\d'</code> will match a backslash followed by a 'd').
1938
+ *
1939
+ * The sequences <code>$1</code>, <code>$2</code>, etc., may be used.
1940
+ *
1941
+ * In the block form, the current UMatch object is passed in as a parameter.
1942
+ * The value returned by the block will be substituted for the match on each call.
1943
+ *
1944
+ * "hello".u.sub(/[aeiou]/.U, '*'.u) #=> "h*llo"
1945
+ * "hello".u.sub(/([aeiou])/.U, '<$1>'.u) #=> "h<e>llo"
1946
+ */
1947
+
1948
+ VALUE
1949
+ icu_ustr_sub(argc, argv, str)
1950
+ int argc;
1951
+ VALUE *argv;
1952
+ VALUE str;
1953
+ {
1954
+ str = icu_ustr_dup(str);
1955
+ icu_ustr_sub_bang(argc, argv, str);
1956
+ return str;
1957
+ }
1958
+
1959
+ /**
1960
+ * replace in string from +beg+ length +len+ (in code units)
1961
+ */
1962
+ static void
1963
+ icu_ustr_splice(str, beg, len, val)
1964
+ VALUE str;
1965
+ long beg,
1966
+ len;
1967
+ VALUE val;
1968
+ {
1969
+ long char_len;
1970
+ Check_Class(val, rb_cUString);
1971
+ if (val == str) {
1972
+ val = icu_ustr_dup(str);
1973
+ }
1974
+ if (len < 0)
1975
+ rb_raise(rb_eIndexError, "negative length %ld", len);
1976
+ char_len = ICU_LEN(str);
1977
+
1978
+ if (char_len < beg) {
1979
+ out_of_range:
1980
+ rb_raise(rb_eIndexError, "index %ld out of string", beg);
1981
+ }
1982
+ if (beg < 0) {
1983
+ if (-beg > char_len) {
1984
+ goto out_of_range;
1985
+ }
1986
+ beg += char_len;
1987
+ }
1988
+ if (char_len < beg + len) {
1989
+ len = char_len - beg;
1990
+ }
1991
+ /* adjust to codepoint boundaries */
1992
+ U16_SET_CP_START(ICU_PTR(str), 0, beg);
1993
+ U16_SET_CP_LIMIT(ICU_PTR(str), 0, len, ICU_LEN(str));
1994
+
1995
+ ustr_splice_units(USTRING(str), beg, len, ICU_PTR(val), ICU_LEN(val));
1996
+ OBJ_INFECT(str, val);
1997
+ }
1998
+
1999
+
2000
+ /**
2001
+ * call-seq:
2002
+ * str.insert(index, other_str) => str
2003
+ *
2004
+ * Inserts <i>other_str</i> before the character at the given
2005
+ * <i>index</i>, modifying <i>str</i>. Negative indices count from the
2006
+ * end of the string, and insert <em>after</em> the given character.
2007
+ * The intent is insert <i>other_str</i> so that it starts at the given
2008
+ * <i>index</i>.
2009
+ *
2010
+ * "abcd".u.insert(0, 'X'.u) #=> "Xabcd"
2011
+ * "abcd".u.insert(3, 'X'.u) #=> "abcXd"
2012
+ * "abcd".u.insert(4, 'X'.u) #=> "abcdX"
2013
+ * "abcd".u.insert(-3, 'X'.u) #=> "abXcd"
2014
+ * "abcd".u.insert(-1, 'X'.u) #=> "abcdX"
2015
+ */
2016
+
2017
+ VALUE
2018
+ icu_ustr_insert(str, idx, str2)
2019
+ VALUE str,
2020
+ idx,
2021
+ str2;
2022
+ {
2023
+ long pos = NUM2LONG(idx);
2024
+ icu_check_frozen(str);
2025
+
2026
+ if (pos == -1) {
2027
+ pos = NUM2LONG(icu_ustr_length(str));
2028
+ } else if (pos < 0) {
2029
+ pos++;
2030
+ }
2031
+
2032
+ icu_ustr_splice(str, pos, 0, str2);
2033
+ return str;
2034
+ }
2035
+
2036
+ /**
2037
+ * call-seq:
2038
+ * str.include? other_str => true or false
2039
+ *
2040
+ * Returns <code>true</code> if <i>str</i> contains the given string
2041
+ *
2042
+ * "hello".u.include? "lo".u #=> true
2043
+ * "hello".u.include? "ol".u #=> false
2044
+ */
2045
+
2046
+ VALUE
2047
+ icu_ustr_include(str, arg)
2048
+ VALUE str,
2049
+ arg;
2050
+ {
2051
+ long i;
2052
+ i = icu_ustr_index(str, arg, 0);
2053
+ if (i == -1)
2054
+ return Qfalse;
2055
+ return Qtrue;
2056
+ }
2057
+
2058
+ static void
2059
+ icu_ustr_subpat_set(str, re, nth, val)
2060
+ VALUE str,
2061
+ re;
2062
+ int nth;
2063
+ VALUE val;
2064
+ {
2065
+ long start,
2066
+ end,
2067
+ len;
2068
+ VALUE matched;
2069
+
2070
+ if (icu_reg_search(re, str, 0, 0) < 0) {
2071
+ rb_raise(rb_eIndexError, "regexp not matched");
2072
+ }
2073
+ matched = icu_reg_range(re, nth, &start, &end);
2074
+ if (NIL_P(matched)) {
2075
+ rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
2076
+ }
2077
+ len = end - start;
2078
+ /* adjust to codepoint boundaries */
2079
+ U16_SET_CP_START(ICU_PTR(str), 0, start);
2080
+ U16_SET_CP_LIMIT(ICU_PTR(str), 0, len, ICU_LEN(str));
2081
+
2082
+ ustr_splice_units(USTRING(str), start, len, ICU_PTR(val), ICU_LEN(val));
2083
+ }
2084
+
2085
+ VALUE
2086
+ icu_ustr_aset(str, indx, val)
2087
+ VALUE str;
2088
+ VALUE indx,
2089
+ val;
2090
+ {
2091
+ long idx,
2092
+ beg;
2093
+ long char_len = ICU_LEN(str);
2094
+
2095
+ switch (TYPE(indx)) {
2096
+ case T_FIXNUM:
2097
+ num_index:
2098
+ idx = FIX2LONG(indx);
2099
+ if (char_len <= idx) {
2100
+ out_of_range:
2101
+ rb_raise(rb_eIndexError, "index %ld out of string", idx);
2102
+ }
2103
+ if (idx < 0) {
2104
+ if (-idx > char_len)
2105
+ goto out_of_range;
2106
+ idx += char_len;
2107
+ }
2108
+ icu_ustr_splice(str, idx, 1, val);
2109
+ return val;
2110
+
2111
+ case T_DATA:
2112
+ if (CLASS_OF(indx) == rb_cURegexp) {
2113
+ icu_ustr_subpat_set(str, indx, 0, val);
2114
+ return val;
2115
+ }
2116
+ if (CLASS_OF(indx) == rb_cUString) {
2117
+ beg = icu_ustr_index(str, indx, 0);
2118
+ if (beg < 0) {
2119
+ rb_raise(rb_eIndexError, "string not matched");
2120
+ }
2121
+ ustr_splice_units(USTRING(str), beg, ICU_LEN(indx), ICU_PTR(val), ICU_LEN(val));
2122
+ return val;
2123
+ }
2124
+ default:
2125
+ /*
2126
+ * check if indx is Range
2127
+ */
2128
+ {
2129
+ long beg,
2130
+ len;
2131
+ if (rb_range_beg_len(indx, &beg, &len, char_len, 2)) {
2132
+ icu_ustr_splice(str, beg, len, val);
2133
+ return val;
2134
+ }
2135
+ }
2136
+ idx = NUM2LONG(indx);
2137
+ goto num_index;
2138
+ }
2139
+ }
2140
+
2141
+
2142
+ /**
2143
+ * call-seq:
2144
+ * str[fixnum] = new_str
2145
+ * str[fixnum, fixnum] = new_str
2146
+ * str[range] = new_str
2147
+ * str[regexp] = new_str
2148
+ * str[regexp, fixnum] = new_str
2149
+ * str[other_str] = new_str
2150
+ *
2151
+ * Element Assignment---Replaces some or all of the content of <i>str</i>. The
2152
+ * portion of the string affected is determined using the same criteria as
2153
+ * <code>UString#[]</code>. If the replacement string is not the same length as
2154
+ * the text it is replacing, the string will be adjusted accordingly. If the
2155
+ * regular expression or string is used as the index doesn't match a position
2156
+ * in the string, <code>IndexError</code> is raised. If the regular expression
2157
+ * form is used, the optional second <code>Fixnum</code> allows you to specify
2158
+ * which portion of the match to replace (effectively using the
2159
+ * <code>UMatch</code> indexing rules. The forms that take a
2160
+ * <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
2161
+ * out of range; the <code>Range</code> form will raise a
2162
+ * <code>RangeError</code>, and the <code>URegexp</code> and <code>UString</code>
2163
+ * forms will raise an <code>IndexError</code> when nothing matches.
2164
+ */
2165
+
2166
+ VALUE
2167
+ icu_ustr_aset_m(argc, argv, str)
2168
+ int argc;
2169
+ VALUE *argv;
2170
+ VALUE str;
2171
+ {
2172
+ icu_check_frozen(str);
2173
+ if (argc == 3) {
2174
+ if (CLASS_OF(argv[0]) == rb_cURegexp) {
2175
+ icu_ustr_subpat_set(str, argv[0], NUM2INT(argv[1]), argv[2]);
2176
+ } else {
2177
+ icu_ustr_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]),
2178
+ argv[2]);
2179
+ }
2180
+ return argv[2];
2181
+ }
2182
+ if (argc != 2) {
2183
+ rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)",
2184
+ argc);
2185
+ }
2186
+ return icu_ustr_aset(str, argv[0], argv[1]);
2187
+ }
2188
+
2189
+ /**
2190
+ * call-seq:
2191
+ * str.slice!(fixnum) => new_str or nil
2192
+ * str.slice!(fixnum, fixnum) => new_str or nil
2193
+ * str.slice!(range) => new_str or nil
2194
+ * str.slice!(regexp) => new_str or nil
2195
+ * str.slice!(other_str) => new_str or nil
2196
+ *
2197
+ * Deletes the specified portion from <i>str</i>, and returns the portion
2198
+ * deleted. The forms that take a <code>Fixnum</code> will raise an
2199
+ * <code>IndexError</code> if the value is out of range; the <code>Range</code>
2200
+ * form will raise a <code>RangeError</code>, and the <code>URegexp</code> and
2201
+ * <code>UString</code> forms will silently ignore the assignment.
2202
+ *
2203
+ * string = "this is a string".u
2204
+ * string.slice!(2) #=> "i"
2205
+ * string.slice!(3..6) #=> " is "
2206
+ * string.slice!(/s.*t/.U) #=> "sa st"
2207
+ * string.slice!("r".u) #=> "r"
2208
+ * string #=> "thing"
2209
+ */
2210
+
2211
+ VALUE
2212
+ icu_ustr_slice_bang(argc, argv, str)
2213
+ int argc;
2214
+ VALUE *argv;
2215
+ VALUE str;
2216
+ {
2217
+ VALUE result;
2218
+ VALUE buf[3];
2219
+ int i;
2220
+ icu_check_frozen(str);
2221
+ if (argc < 1 || 2 < argc) {
2222
+ rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)",
2223
+ argc);
2224
+ }
2225
+ for (i = 0; i < argc; i++) {
2226
+ buf[i] = argv[i];
2227
+ }
2228
+ buf[i] = icu_ustr_new(0, 0);
2229
+ result = icu_ustr_aref_m(argc, buf, str);
2230
+ if (!NIL_P(result)) {
2231
+ icu_ustr_aset_m(argc + 1, buf, str);
2232
+ }
2233
+ return result;
2234
+ }
2235
+
2236
+ VALUE
2237
+ ustr_gsub(argc, argv, str, bang, once)
2238
+ int argc;
2239
+ VALUE *argv;
2240
+ VALUE str;
2241
+ int bang;
2242
+ int once;
2243
+ {
2244
+ VALUE pat,
2245
+ repl;
2246
+ long beg,
2247
+ end,
2248
+ prev_end;
2249
+ int tainted = 0,
2250
+ iter = 0;
2251
+ VALUE buf, curr_repl, umatch, block_res;
2252
+ if (argc == 1 && rb_block_given_p()) {
2253
+ iter = 1;
2254
+ } else if (argc == 2) {
2255
+ repl = argv[1];
2256
+ Check_Class(repl, rb_cUString);
2257
+ if (OBJ_TAINTED(repl))
2258
+ tainted = 1;
2259
+ } else {
2260
+ rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)",
2261
+ argc);
2262
+ }
2263
+
2264
+ pat = get_pat(argv[0], 1);
2265
+ beg = icu_reg_search(pat, str, 0, 0);
2266
+
2267
+ if (beg < 0) {
2268
+ /* no match */
2269
+ if (bang)
2270
+ return Qnil;
2271
+ return icu_ustr_dup(str);
2272
+ }
2273
+ end = 0;
2274
+ icu_check_frozen(str);
2275
+ USTRING(str)->busy = 1;
2276
+ buf = icu_ustr_new(0, 0);
2277
+ pat = icu_reg_clone(pat);
2278
+ if(rb_block_given_p()) iter = 1;
2279
+ do {
2280
+
2281
+ prev_end = end;
2282
+ icu_reg_range(pat, 0, &beg, &end);
2283
+ icu_ustr_concat(buf, icu_reg_get_prematch(pat, prev_end));
2284
+ if ( iter ) {
2285
+ UChar * ptr = ICU_PTR(str);
2286
+ long o_len = ICU_LEN(str);
2287
+ umatch = icu_umatch_new(pat);
2288
+ block_res = rb_yield(umatch);
2289
+ if (CLASS_OF(block_res) == rb_cUString)
2290
+ curr_repl = block_res;
2291
+ else if (CLASS_OF(block_res) == rb_cUMatch)
2292
+ curr_repl = icu_umatch_aref(block_res, INT2FIX(0));
2293
+ else
2294
+ curr_repl =
2295
+ icu_from_rstr(0, NULL, rb_obj_as_string(block_res));
2296
+ ustr_mod_check(str, ptr, o_len);
2297
+ } else {
2298
+ curr_repl = icu_reg_get_replacement(pat, repl, prev_end);
2299
+ }
2300
+ icu_ustr_concat(buf, curr_repl);
2301
+ }
2302
+ while (icu_reg_find_next(pat) && !once);
2303
+ icu_ustr_concat(buf, icu_reg_get_tail(pat, end));
2304
+ USTRING(str)->busy = 0;
2305
+ if (bang) {
2306
+ icu_ustr_replace(str, buf);
2307
+ return str;
2308
+ } else {
2309
+ return buf;
2310
+ }
2311
+ }
2312
+
2313
+ /**
2314
+ * call-seq:
2315
+ * str.gsub!(pattern, replacement) => str or nil
2316
+ * str.gsub!(pattern) {|match| block } => str or nil
2317
+ *
2318
+ * Performs the substitutions of <code>UString#gsub</code> in place, returning
2319
+ * <i>str</i>, or <code>nil</code> if no substitutions were performed.
2320
+ */
2321
+
2322
+ VALUE
2323
+ icu_ustr_gsub_bang(argc, argv, str)
2324
+ int argc;
2325
+ VALUE *argv;
2326
+ VALUE str;
2327
+ {
2328
+ icu_check_frozen(str);
2329
+ return ustr_gsub(argc, argv, str, 1, 0);
2330
+ }
2331
+
2332
+
2333
+ /**
2334
+ * call-seq:
2335
+ * str.gsub(pattern, replacement) => new_str
2336
+ * str.gsub(pattern) {|match| block } => new_str
2337
+ *
2338
+ * Returns a copy of <i>str</i> with <em>all</em> occurrences of <i>pattern</i>
2339
+ * replaced with either <i>replacement</i> or the value of the block. The
2340
+ * <i>pattern</i> will typically be a <code>URegexp</code>; if it is a
2341
+ * <code>UString</code> then no regular expression metacharacters will be
2342
+ * interpreted (that is <code>/\d/</code> will match a digit, but
2343
+ * <code>'\d'</code> will match a backslash followed by a 'd').
2344
+ *
2345
+ * If a string is used as the replacement, the sequences <code>$1</code>, <code>$2</code>, and so on
2346
+ * may be used to interpolate successive groups in the match.
2347
+ *
2348
+ * In the block form, the current UMatch object is passed in as a parameter. The value
2349
+ * returned by the block will be substituted for the match on each call.
2350
+ *
2351
+ * "hello".gsub(/[aeiou]/.U, '*') #=> "h*ll*"
2352
+ * "hello".gsub(/([aeiou])/.U, '<$1>') #=> "h<e>ll<o>"
2353
+ */
2354
+
2355
+ VALUE
2356
+ icu_ustr_gsub(argc, argv, str)
2357
+ int argc;
2358
+ VALUE *argv;
2359
+ VALUE str;
2360
+ {
2361
+ return ustr_gsub(argc, argv, str, 0, 0);
2362
+ }
2363
+
2364
+
2365
+ /*-------------*/
2366
+ /* parsing */
2367
+ extern VALUE icu_date_parse(UChar * str, int32_t str_len, char * locale, UChar * val, int32_t len);
2368
+
2369
+ /**
2370
+ * call-seq:
2371
+ * str.parse_date( locale, value)
2372
+ *
2373
+ * Parses given value, using +str+ as format pattern with respect to +locale+.
2374
+ *
2375
+ * "HH:mm:ss E dd/MM/yyyy".u.parse_date("en", "20:15:01 Fri 13/01/2006".u)) # => Time.local(2006,"jan",13,20,15,1)
2376
+ *
2377
+ */
2378
+
2379
+ VALUE
2380
+ icu_ustr_parse_date( str, locale, val)
2381
+ VALUE str, locale, val;
2382
+ {
2383
+ Check_Type(locale, T_STRING);
2384
+ Check_Class(val, rb_cUString);
2385
+ return icu_date_parse(ICU_PTR(str), ICU_LEN(str), RSTRING(locale)->ptr, ICU_PTR(val), ICU_LEN(val));
2386
+ }
2387
+
2388
+ /**
2389
+ * call-seq:
2390
+ * str.to_f( locale = "",[format_pattern]) => aFloat
2391
+ *
2392
+ * Parses string as double value, with respect to +locale+ and format pattern,
2393
+ * if they are provided.
2394
+ *
2395
+ * "456".u.to_f # => 456.0
2396
+ * "123,001".u.to_f("ru") # => 123.001
2397
+ * "123,001".u.to_f("en") # => 123001.0
2398
+ * "Got 123,001".u.to_f("en", "Got ###,###".u) # => 123001
2399
+ */
2400
+
2401
+ VALUE
2402
+ icu_ustr_parse_double( int argc, VALUE * argv, VALUE str)
2403
+ {
2404
+ UParseError error;
2405
+ UErrorCode status = U_ZERO_ERROR;
2406
+ UNumberFormat * format = NULL;
2407
+ VALUE loc, pattern;
2408
+ char * locale;
2409
+ double value;
2410
+ int32_t pos, n;
2411
+
2412
+ n = rb_scan_args(argc, argv, "02", &loc, &pattern) ;
2413
+ if( n == 2) {
2414
+ Check_Class(pattern, rb_cUString);
2415
+ } else pattern = Qnil;
2416
+
2417
+ if (n > 0) {
2418
+ Check_Type(loc, T_STRING);
2419
+ locale = RSTRING(loc)->ptr;
2420
+ } else locale = NULL;
2421
+
2422
+ if( pattern != Qnil ) {
2423
+ format = unum_open(UNUM_PATTERN_DECIMAL, ICU_PTR(pattern), ICU_LEN(pattern), locale,
2424
+ &error, &status);
2425
+ } else {
2426
+ format = unum_open(UNUM_DECIMAL, NULL, 0, locale,&error, &status);
2427
+ }
2428
+ if (U_FAILURE(status) ) rb_raise(rb_eArgError, "can't open format %s", u_errorName(status));
2429
+ pos = 0;
2430
+ value = unum_parseDouble(format, ICU_PTR(str), ICU_LEN(str), &pos, &status);
2431
+ unum_close(format);
2432
+ if (U_FAILURE(status) ) rb_raise(rb_eArgError, "can't parse %s at %d", u_errorName(status), pos);
2433
+ return rb_float_new(value);
2434
+ }
2435
+
2436
+ /**
2437
+ * call-seq:
2438
+ * UString::strcoll(str1, str2 ) => Fixnum
2439
+ * UString::strcoll(str1, str2 , locale) => Fixnum
2440
+ * UString::strcoll(str1, str2 , locale, strength) => Fixnum
2441
+ *
2442
+ * Performs locale-sensitive string comparison.
2443
+ * Special values for locales can be passed in - if +nil+ is passed for the locale,
2444
+ * the default locale collation rules will be used. If empty string ("") or "root" are
2445
+ * passed, UCA rules will be used.
2446
+ *
2447
+ * Strength must be a fixnum that set collation strength:
2448
+ * -1 is default, 0 - primary, 1 - secondary, 2 - ternary.
2449
+ * E.g., pass 0 to ignore case and accents, 1 - to ignore case only.
2450
+ **/
2451
+ VALUE
2452
+ icu_ustr_coll(argc, argv, self)
2453
+ int argc;
2454
+ VALUE *argv;
2455
+ VALUE self;
2456
+ {
2457
+ UErrorCode status = 0 ;
2458
+ UCollator * collator = 0;
2459
+ int result;
2460
+ VALUE ret = Qnil;
2461
+ VALUE str1, str2, loc, strength = Qnil;
2462
+ char * locale = NULL;
2463
+ int n ;
2464
+ n = rb_scan_args(argc, argv, "22", &str1, &str2, &loc, &strength);
2465
+ if ( n == 3) {
2466
+ if( loc != Qnil) {
2467
+ Check_Type(loc, T_STRING);
2468
+ locale = RSTRING(loc)->ptr;
2469
+ }
2470
+ }
2471
+ Check_Class(str1, rb_cUString);
2472
+ Check_Class(str2, rb_cUString);
2473
+ collator = ucol_open(locale, &status);
2474
+ if( U_FAILURE(status) )
2475
+ {
2476
+ rb_raise(rb_eArgError, u_errorName(status));
2477
+ }
2478
+ if( n == 4 ){
2479
+ Check_Type(strength, T_FIXNUM);
2480
+ ucol_setStrength(collator, NUM2INT(strength));
2481
+ }
2482
+ result = ucol_strcoll(collator, ICU_PTR(str1), ICU_LEN(str1), ICU_PTR(str2), ICU_LEN(str2));
2483
+
2484
+ switch(result){
2485
+ case UCOL_EQUAL: ret = INT2FIX(0);break;
2486
+ case UCOL_GREATER: ret = INT2FIX(1);break;
2487
+ case UCOL_LESS: ret = INT2FIX(-1);break;
2488
+ }
2489
+ ucol_close(collator);
2490
+ return ret;
2491
+ }
2492
+
2493
+ /**
2494
+ * call-seq:
2495
+ * UString::list_coll => anArray
2496
+ *
2497
+ * Returns array of available collator locales, to be used in UString#strcoll
2498
+ * */
2499
+ VALUE icu_ustr_list_coll(str)
2500
+ VALUE str;
2501
+ {
2502
+ int32_t i, n =ucol_countAvailable();
2503
+ VALUE ret = rb_ary_new();
2504
+ for( i = 0; i<n; i++) {
2505
+ rb_ary_push(ret, rb_str_new2(ucol_getAvailable(i)));
2506
+ }
2507
+ return ret;
2508
+ }
2509
+
2510
+ /**
2511
+ * call-seq:
2512
+ * UString::list_locales => anArray
2513
+ *
2514
+ * Returns array of available locales.
2515
+ * */
2516
+ VALUE icu_ustr_list_locales(str)
2517
+ VALUE str;
2518
+ {
2519
+ int32_t i, n =uloc_countAvailable();
2520
+ VALUE ret = rb_ary_new();
2521
+ for( i = 0; i<n; i++) {
2522
+ rb_ary_push(ret, rb_str_new2(uloc_getAvailable(i)));
2523
+ }
2524
+ return ret;
2525
+ }
2526
+ /**
2527
+ * call-seq:
2528
+ * UString::list_translits => anArray
2529
+ *
2530
+ * Returns array of available translits.
2531
+ * */
2532
+ VALUE icu_ustr_list_translits(str)
2533
+ VALUE str;
2534
+ {
2535
+ UErrorCode status = U_ZERO_ERROR;
2536
+ UEnumeration * ids ;
2537
+ VALUE ret ;
2538
+ UChar * name;
2539
+ int32_t len;
2540
+ ids = utrans_openIDs (&status);
2541
+ ICU_RAISE(status);
2542
+ ret = rb_ary_new();
2543
+ while( (name = (UChar*)uenum_unext(ids, &len, &status))) {
2544
+ rb_ary_push(ret, icu_ustr_new(name, len));
2545
+ }
2546
+ uenum_close(ids);
2547
+ return ret;
2548
+
2549
+ }
2550
+ /**
2551
+ * call-seq:
2552
+ * str.search(pattern, options = {})
2553
+ *
2554
+ * Searches for match in string. Returns array of +Range+
2555
+ * corresponding to position where pattern is matched.
2556
+ *
2557
+ * Valid options are:
2558
+ * :locale -- locale, +String+, value e.g. "en", "ru_RU"
2559
+ * :ignore_case -- whether to ignore case, valid values are +true+ or +false+, default to +false+
2560
+ * :ignore_case_accents -- sets collator options to strength +0+ - primary difference, e.g. ignore case and accents,
2561
+ * overrides :ignore_case: option, default to +false+,
2562
+ * :loosely -- same as :ignore_case_accents
2563
+ * :limit -- Fixnum limit of match positions to return.
2564
+ * :whole_words -- whether to match whole words only
2565
+ * :canonical -- use canonical equivalence
2566
+ *
2567
+ *
2568
+ * a = "A quick brown fox jumped over the lazy fox dancing foxtrote".u
2569
+ * a.search("fox".u) # => [14..16, 39..41, 51..53]
2570
+ * a.search("FoX".u) # => []
2571
+ * a.search("FoX".u, :ignore_case => true) # => [14..16, 39..41, 51..53]
2572
+ * a.search("FoX".u, :ignore_case => true, :whole_words => true) # => [14..16, 39..41]
2573
+ * a.search("FoX".u, :ignore_case => true, :whole_words => true, :limit => 1) # => [14..16]
2574
+ *
2575
+ * b = "Iñtërnâtiônàlizætiøn îs cọmpłèx".u.upcase # => IÑTËRNÂTIÔNÀLIZÆTIØN ÎS CỌMPŁÈX
2576
+ * b.search("nâtiôn".u, :locale => "en") # => []
2577
+ * b.search("nation".u) # => []
2578
+ * b.search("nation".u, :locale => "en", :ignore_case_accents => true) # => [5..10]
2579
+ * b.search("nâtiôn".u, :locale => "en", :ignore_case => true) # => [5..10]
2580
+ * b.search("zaeti".u, :locale => "en" ) # => []
2581
+ * b.search("zaeti".u, :locale => "en", :ignore_case => true) # => []
2582
+ * b.search("zaeti".u, :locale => "en", :ignore_case_accents => true) # => [14..17]
2583
+ *
2584
+ * v = [?a, 0x0325, 0x0300].to_u # => ḁ̀
2585
+ * v.search([?a, 0x300].to_u, :canonical => true) # => [0..2]
2586
+ * v.search([?a, 0x300].to_u) # => []
2587
+ **/
2588
+
2589
+ VALUE icu_ustr_search(argc, argv, str)
2590
+ int argc;
2591
+ VALUE *argv;
2592
+ VALUE str;
2593
+
2594
+ {
2595
+ UErrorCode status = U_ZERO_ERROR;
2596
+ UStringSearch * search = 0 ;
2597
+ VALUE pat, locale , limit, options;
2598
+ int lim = -1, count = 0 ;
2599
+ int32_t start, len;
2600
+ VALUE ret = rb_ary_new();
2601
+ UCollator * collator = 0;
2602
+ UBreakIterator * brkit = 0;
2603
+ char * loc = 0;
2604
+ if ( rb_scan_args(argc, argv, "11", &pat, &options) == 2 ) {
2605
+ Check_Type(options, T_HASH);
2606
+ } else {
2607
+ options = Qnil;
2608
+ }
2609
+
2610
+ Check_Class(pat, rb_cUString);
2611
+ locale = options == Qnil ? Qnil : rb_hash_aref(options, ID2SYM(rb_intern("locale")));
2612
+
2613
+ if( locale != Qnil ) {
2614
+ Check_Type(locale, T_STRING);
2615
+ loc = RSTRING(locale) -> ptr;
2616
+ }
2617
+ limit = options == Qnil ? Qnil : rb_hash_aref(options, ID2SYM(rb_intern("limit")));
2618
+
2619
+ if(TYPE(limit) == T_FIXNUM) {
2620
+ lim = FIX2INT(limit);
2621
+ if(lim <= 0) {
2622
+ rb_raise(rb_eTypeError, "Limit must be positive or nil, got: %d", lim);
2623
+ }
2624
+ }
2625
+ else
2626
+ if (limit!=Qnil)
2627
+ rb_raise(rb_eArgError, "Limit must be Fixnum, got %s", rb_class2name(CLASS_OF(limit)));
2628
+
2629
+ collator = ucol_open(loc, &status);
2630
+ ucol_setStrength(collator, -1);
2631
+
2632
+ if( options != Qnil && Qtrue == rb_hash_aref( options, ID2SYM(rb_intern("whole_words"))) )
2633
+ brkit = ubrk_open(UBRK_WORD, loc, ICU_PTR(str), ICU_LEN(str), &status);
2634
+
2635
+ if( options != Qnil && Qtrue == rb_hash_aref( options, ID2SYM(rb_intern("ignore_case"))) )
2636
+ ucol_setStrength(collator, UCOL_SECONDARY);
2637
+
2638
+ if( options != Qnil &&
2639
+ ( Qtrue == rb_hash_aref( options, ID2SYM(rb_intern("ignore_case_accents")) )
2640
+ || Qtrue == rb_hash_aref( options, ID2SYM(rb_intern("loosely")) )
2641
+ )
2642
+ )
2643
+ ucol_setStrength(collator, UCOL_PRIMARY );
2644
+
2645
+
2646
+ search = usearch_openFromCollator(ICU_PTR(pat), ICU_LEN(pat),
2647
+ ICU_PTR(str), ICU_LEN(str),
2648
+ collator, brkit, &status);
2649
+
2650
+ if( options != Qnil && Qtrue == rb_hash_aref( options, ID2SYM(rb_intern("canonical"))) )
2651
+ usearch_setAttribute(search, USEARCH_CANONICAL_MATCH, USEARCH_ON, &status);
2652
+
2653
+ if( U_FAILURE(status) ) goto failure;
2654
+
2655
+ status = U_ZERO_ERROR;
2656
+ if( usearch_first(search, &status) == USEARCH_DONE) {
2657
+ usearch_close(search);
2658
+ ucol_close(collator);
2659
+ ubrk_close(brkit);
2660
+ return ret;
2661
+ }
2662
+
2663
+ do {
2664
+ if( U_FAILURE(status) ) goto failure;
2665
+
2666
+ start = usearch_getMatchedStart(search);
2667
+ len = usearch_getMatchedLength(search);
2668
+ rb_ary_push(ret, rb_range_new(LONG2NUM(start), LONG2NUM(start+len-1), 0));
2669
+
2670
+ status = U_ZERO_ERROR;
2671
+ count += 1;
2672
+ if (lim > 0 && count >= lim) break;
2673
+ } while (USEARCH_DONE != usearch_next(search, &status));
2674
+ usearch_close( search);
2675
+ ucol_close(collator);
2676
+ ubrk_close(brkit);
2677
+ return ret;
2678
+
2679
+ failure:
2680
+ usearch_close( search);
2681
+ ucol_close(collator);
2682
+ ubrk_close(brkit);
2683
+
2684
+ rb_raise(rb_eArgError, u_errorName(status));
2685
+ return Qnil;
2686
+ }
2687
+ /**
2688
+ * call-seq:
2689
+ * str.conv_unit_range(unit_range) => code_point_range
2690
+ *
2691
+ * Converts <b>code unit</b> range to <b>code point</b> range.
2692
+ * If your chars don't use multiple UTF16 codeunits, range will be the same.
2693
+ */
2694
+ VALUE icu_ustr_convert_unit_range(str, range)
2695
+ VALUE str, range;
2696
+ {
2697
+ long cu_start, cu_len, cur_pos, cp_len ;
2698
+ if( rb_range_beg_len(range, &cu_start, &cu_len, ICU_LEN(str), 0) != Qtrue)
2699
+ return Qnil;
2700
+
2701
+ cur_pos = u_countChar32( ICU_PTR(str), cu_start );
2702
+ if( cu_start+cu_len > ICU_LEN(str)) --cu_len;
2703
+ cp_len = u_countChar32( ICU_PTR(str) + cu_start , cu_len);
2704
+ return rb_range_new(LONG2NUM(cur_pos), LONG2NUM(cur_pos + cp_len-1), 0);
2705
+ }
2706
+ /**
2707
+ * call-seq:
2708
+ * str.conv_point_range(point_range) => code_unit_range
2709
+ *
2710
+ * Converts <b>code point</b> range to <b>code unit</b> range.
2711
+ * (inversion of #conv_unit_range)
2712
+ * If your chars don't use multiple UTF16 codeuints, range will be the same.
2713
+ */
2714
+ VALUE icu_ustr_convert_point_range(str, range)
2715
+ VALUE str, range;
2716
+ {
2717
+ long cp_start, cu_start, cu_end, cp_len, str_cp_len;
2718
+ str_cp_len = u_countChar32( ICU_PTR(str), ICU_LEN(str));
2719
+ if( Qtrue != rb_range_beg_len(range, &cp_start, &cp_len, str_cp_len, 0) ) return Qnil;
2720
+
2721
+ cu_start = 0;
2722
+ U16_FWD_N(ICU_PTR(str), cu_start, ICU_LEN(str), cp_start); /* care sur */
2723
+ cu_end = cu_start;
2724
+ U16_FWD_N(ICU_PTR(str), cu_end, ICU_LEN(str), cp_len); /* care sur */
2725
+
2726
+ return rb_range_new(LONG2NUM(cu_start), LONG2NUM(cu_end-1), 0);
2727
+ }
2728
+ /**
2729
+ * call-seq:
2730
+ * str.unit_count
2731
+ *
2732
+ * returns number of code units in string.
2733
+ *
2734
+ */
2735
+ VALUE icu_ustr_unit_count(VALUE str){
2736
+ return LONG2NUM(ICU_LEN(str));
2737
+ }
2738
+ /**
2739
+ * call-seq:
2740
+ * str.point_count
2741
+ *
2742
+ * returns number of code points in string.
2743
+ *
2744
+ */
2745
+ VALUE icu_ustr_point_count(VALUE str){
2746
+ return LONG2NUM(u_countChar32(ICU_PTR(str), ICU_LEN(str)));
2747
+ }
2748
+
2749
+ UChar icu_uchar_at(int32_t offset, void * context)
2750
+ {
2751
+ return ((UChar*)context)[offset];
2752
+ }
2753
+ /**
2754
+ * call-seq:
2755
+ * str.unescape => new_str
2756
+ *
2757
+ * Unescape a string of characters.
2758
+ *
2759
+ * The following escape sequences are recognized:
2760
+ * \uhhhh 4 hex digits; h in [0-9A-Fa-f]
2761
+ * \Uhhhhhhhh 8 hex digits
2762
+ * \xhh 1-2 hex digits \x{h...} 1-8 hex digits
2763
+ * \ooo 1-3 octal digits; o in [0-7]
2764
+ * \cX control-X; X is masked with 0x1F
2765
+ *
2766
+ * as well as the standard ANSI C escapes:
2767
+ * \a => U+0007, \b => U+0008, \t => U+0009, \n => U+000A, \v => U+000B, \f => U+000C, \r => U+000D, \e => U+001B, \" => U+0022, \' => U+0027, \? => U+003F, \\ => U+005C
2768
+ *
2769
+ * If escape sequence is invalid, it is ignored.
2770
+ *
2771
+ * "\\u044D\\u043A\\u0440\\u0430\\u043D\\u0438\\u0440\\u043E\\u0432\\u0430\\u043D\\u0438\\u0435".u.unescape => "экранирование"
2772
+ *
2773
+ **/
2774
+
2775
+ VALUE icu_ustr_unescape(str)
2776
+ VALUE str;
2777
+ {
2778
+ UChar32 c32;
2779
+ int32_t offset, leng, i, segment_start;
2780
+ UChar * ptr;
2781
+ UChar buf[3];
2782
+ VALUE ret;
2783
+ offset = 0;
2784
+ segment_start = 0;
2785
+ leng = ICU_LEN(str);
2786
+ ptr = ICU_PTR(str);
2787
+ ret = icu_ustr_new(0, 0);
2788
+ while(offset < leng) {
2789
+ if( ptr[offset] == '\\' ) {
2790
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, ptr+segment_start, offset-segment_start);
2791
+ ++offset;
2792
+ c32 = u_unescapeAt(icu_uchar_at, &offset, leng, ICU_PTR(str));
2793
+ // append this char
2794
+ if( 0xFFFFFFFF == c32) continue;
2795
+ i = 0;
2796
+ U16_APPEND_UNSAFE(buf, i, c32);
2797
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, buf, U16_LENGTH(c32));
2798
+ segment_start = offset;
2799
+ } else {
2800
+ ++offset;
2801
+ }
2802
+ }
2803
+ if( segment_start < offset)
2804
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, ptr+segment_start, offset-segment_start);
2805
+
2806
+ return ret;
2807
+ }
2808
+
2809
+
2810
+
2811
+ /* transliteration */
2812
+ extern VALUE icu_transliterate(UChar * str, int32_t str_len, UChar * id, int32_t id_len, UChar * rules, int32_t rule_len);
2813
+ /**
2814
+ * call-seq:
2815
+ * str.translit(id, [rules])
2816
+ *
2817
+ * Performs {transliteration}[http://icu.sourceforge.net/userguide/Transformations.html],
2818
+ * of this string, using given transform +id+ and +rules+
2819
+ *
2820
+ * "yukihiro matsumoto".u.translit("Latin-Hiragana".u) # => ゆきひろ まつもと
2821
+ * "hello".u.translit("null".u, ":: upper();".u) # => HELLO
2822
+ **/
2823
+ VALUE icu_ustr_translit(argc, argv, str)
2824
+ int argc;
2825
+ VALUE * argv ;
2826
+ VALUE str;
2827
+ {
2828
+ VALUE id, rules ;
2829
+ if(rb_scan_args(argc, argv, "11", &id, &rules) == 2) {
2830
+ Check_Class(rules, rb_cUString);
2831
+ } else rules = Qnil;
2832
+
2833
+ Check_Class(str, rb_cUString);
2834
+ Check_Class(id, rb_cUString);
2835
+ if( rules == Qnil) {
2836
+ return icu_transliterate(ICU_PTR(str), ICU_LEN(str), ICU_PTR(id), ICU_LEN(id), NULL, 0);
2837
+ } else {
2838
+ return icu_transliterate(ICU_PTR(str), ICU_LEN(str), ICU_PTR(id), ICU_LEN(id),
2839
+ ICU_PTR(rules), ICU_LEN(rules));
2840
+ }
2841
+ }
2842
+ void
2843
+ initialize_ustring(void)
2844
+ {
2845
+ UErrorCode status = U_ZERO_ERROR;
2846
+ u_init(&status);
2847
+ if( U_FAILURE(status) ){
2848
+ rb_raise(rb_eRuntimeError, "Can't initialize : %s", u_errorName(status));
2849
+ }
2850
+ s_UCA_collator = ucol_open("", &status);
2851
+ if( U_FAILURE(status) ){
2852
+ rb_raise(rb_eRuntimeError, "Can't initialize : %s", u_errorName(status));
2853
+ }
2854
+ s_case_UCA_collator = ucol_open("", &status);
2855
+ if( U_FAILURE(status) ){
2856
+ rb_raise(rb_eRuntimeError, "Can't initialize : %s", u_errorName(status));
2857
+ }
2858
+ ucol_setStrength(s_case_UCA_collator, UCOL_SECONDARY);
2859
+
2860
+ /*
2861
+
2862
+ Document-class: UString
2863
+
2864
+ UString is a string class that stores Unicode characters directly and provides
2865
+ similar functionality as the Ruby String class.
2866
+
2867
+ An UString string consists of 16-bit Unicode code units. A Unicode character
2868
+ may be stored with either one code unit which is the most common case or with a matched
2869
+ pair of special code units ("surrogates").
2870
+
2871
+ For single-character handling, a Unicode character code point is a value in the
2872
+ range 0..0x10ffff.
2873
+
2874
+ Indexes and offsets into and lengths of strings always count code units, not code points.
2875
+ This is the same as with multi-byte char* strings in traditional string handling.
2876
+ Operations on partial strings typically do not test for code point boundaries.
2877
+
2878
+ In order to use the collation, text boundary analysis, formatting and other ICU APIs,
2879
+ Unicode strings must be used. In order to get Unicode strings from your native codepage,
2880
+ you can use the conversion API.
2881
+
2882
+ UString class is also point for access to several ICU services, instead of
2883
+ mirroring ICU class hierarchy.
2884
+
2885
+ ==== Methods by category:
2886
+
2887
+ - concat and modify: + , * , << , #concat , #replace
2888
+
2889
+ - element reference, insert, replace: [] , #slice , []= , #slice! , #insert , #char_span
2890
+
2891
+ - comparisons: <=> , == , #casecmp , #strcoll
2892
+
2893
+ - size and positions: #length , #point_count , #clear , #empty? , #conv_unit_range , #conv_point_range
2894
+
2895
+ - index/search methods: #index , #rindex , #include? , #search
2896
+
2897
+ - regexps, matching and replacing: =~ , #match , #scan , #split , #sub , #sub! , #gsub , #gsub!
2898
+
2899
+ - conversion String/UString: #to_s, Kernel#u, String#to_u
2900
+
2901
+ - iterators: #each_line_break , #each_word , #each_char , #each_sentence
2902
+
2903
+ - split to chars/codepoints: #chars , #codepoints , Array#to_u
2904
+
2905
+ - character case: #upcase , #upcase! , #downcase , #downcase!
2906
+
2907
+ - stripping spaces: #strip , #lstrip , #rstrip , #strip! , #lstrip! , #rstrip!
2908
+
2909
+ - formatting and parsing: #format , #parse_date , #to_f
2910
+
2911
+ - UNICODE normalization: #norm_C , #norm_D , #norm_KC , #norm_KD , #norm_FCD
2912
+
2913
+ - utilities: #unescape , #hash , #inspect , #inspect_names , #translit
2914
+
2915
+ - ICU avalable info: #list_coll , #list_locales , #list_translits
2916
+ */
2917
+ rb_cUString = rb_define_class("UString", rb_cObject);
2918
+ rb_include_module(rb_cUString, rb_mComparable);
2919
+
2920
+ /* initializations */
2921
+ rb_define_alloc_func(rb_cUString, icu_ustr_alloc);
2922
+ rb_define_method(rb_cUString, "initialize", icu_ustr_init, -1);
2923
+ rb_define_method(rb_cUString, "initialize_copy", icu_ustr_replace, 1);
2924
+ rb_define_method(rb_cUString, "replace", icu_ustr_replace, 1);
2925
+
2926
+ /* comparisons */
2927
+ rb_define_method(rb_cUString, "<=>", icu_ustr_cmp_m, 1);
2928
+ rb_define_method(rb_cUString, "==", icu_ustr_equal, 1);
2929
+ rb_define_method(rb_cUString, "casecmp", icu_ustr_casecmp, 1);
2930
+ rb_define_singleton_method(rb_cUString, "strcoll", icu_ustr_coll, -1);
2931
+
2932
+ /* ICU avalable info */
2933
+ rb_define_singleton_method(rb_cUString, "list_coll", icu_ustr_list_coll, 0);
2934
+ rb_define_singleton_method(rb_cUString, "list_locales", icu_ustr_list_locales, 0);
2935
+ rb_define_singleton_method(rb_cUString, "list_translits", icu_ustr_list_translits, 0);
2936
+
2937
+ /* hash code */
2938
+ rb_define_method(rb_cUString, "hash", icu_ustr_hash_m, 0);
2939
+
2940
+ /* inspect */
2941
+ rb_define_method(rb_cUString, "inspect", icu_ustr_inspect, 0);
2942
+ rb_define_method(rb_cUString, "inspect_names", icu_ustr_inspect_names, 0);
2943
+
2944
+ /* size */
2945
+ rb_define_method(rb_cUString, "length", icu_ustr_length, 0);
2946
+ rb_define_alias (rb_cUString, "size", "length");
2947
+ rb_define_method(rb_cUString, "unit_count", icu_ustr_unit_count, 0);
2948
+ rb_define_method(rb_cUString, "point_count", icu_ustr_point_count, 0);
2949
+ rb_define_method(rb_cUString, "clear", icu_ustr_clear, 0);
2950
+ rb_define_method(rb_cUString, "empty?", icu_ustr_empty, 0);
2951
+
2952
+ /* UNICODE normalization */
2953
+ rb_define_method(rb_cUString, "norm_C", icu_ustr_normalize_C, 0);
2954
+ rb_define_method(rb_cUString, "norm_D", icu_ustr_normalize_D, 0);
2955
+ rb_define_method(rb_cUString, "norm_KC", icu_ustr_normalize_KC, 0);
2956
+ rb_define_method(rb_cUString, "norm_KD", icu_ustr_normalize_KD, 0);
2957
+ rb_define_method(rb_cUString, "norm_FCD", icu_ustr_normalize_FCD, 0);
2958
+
2959
+ /* iterators */
2960
+ rb_define_method(rb_cUString, "each_line_break", icu_ustr_each_line, -1);
2961
+ rb_define_method(rb_cUString, "each_word", icu_ustr_each_word, -1);
2962
+ rb_define_method(rb_cUString, "each_char", icu_ustr_each_char, -1);
2963
+ rb_define_method(rb_cUString, "each_sentence", icu_ustr_each_sentence, -1);
2964
+ rb_define_alias(rb_cUString, "each", "each_line_break");
2965
+
2966
+ /* split to chars/codepoints */
2967
+ rb_define_method(rb_cUString, "chars", icu_ustr_chars_m, -1);
2968
+ rb_define_method(rb_cUString, "char_span", icu_ustr_char_span, -1);
2969
+ rb_define_method(rb_cUString, "codepoints", icu_ustr_points, 0);
2970
+
2971
+ /* concat operations */
2972
+ rb_define_method(rb_cUString, "+", icu_ustr_plus, 1);
2973
+ rb_define_method(rb_cUString, "*", icu_ustr_times, 1);
2974
+ rb_define_method(rb_cUString, "concat", icu_ustr_concat, 1);
2975
+ rb_define_alias( rb_cUString, "<<", "concat");
2976
+
2977
+ /* character case */
2978
+ rb_define_method(rb_cUString, "upcase", icu_ustr_upcase, -1);
2979
+ rb_define_method(rb_cUString, "upcase!", icu_ustr_upcase_bang, -1);
2980
+ rb_define_method(rb_cUString, "downcase", icu_ustr_downcase, -1);
2981
+ rb_define_method(rb_cUString, "downcase!", icu_ustr_downcase_bang, -1);
2982
+ rb_define_method(rb_cUString, "foldcase", icu_ustr_foldcase, 0);
2983
+
2984
+ /* stripping spaces */
2985
+ rb_define_method(rb_cUString, "strip", icu_ustr_strip, 0);
2986
+ rb_define_method(rb_cUString, "lstrip", icu_ustr_lstrip, 0);
2987
+ rb_define_method(rb_cUString, "rstrip", icu_ustr_rstrip, 0);
2988
+
2989
+ rb_define_method(rb_cUString, "strip!", icu_ustr_strip_bang, 0);
2990
+ rb_define_method(rb_cUString, "lstrip!", icu_ustr_lstrip_bang, 0);
2991
+ rb_define_method(rb_cUString, "rstrip!", icu_ustr_rstrip_bang, 0);
2992
+
2993
+ /* index/search methods */
2994
+ rb_define_method(rb_cUString, "index", icu_ustr_index_m, -1);
2995
+ rb_define_method(rb_cUString, "rindex", icu_ustr_rindex_m, -1);
2996
+ rb_define_method(rb_cUString, "include?", icu_ustr_include, 1);
2997
+ rb_define_method(rb_cUString, "search", icu_ustr_search, -1);
2998
+
2999
+ /* element reference */
3000
+ rb_define_method(rb_cUString, "[]", icu_ustr_aref_m, -1);
3001
+ rb_define_alias(rb_cUString, "slice", "[]");
3002
+
3003
+ /* codeunit/codepoint conversion */
3004
+ rb_define_method(rb_cUString, "conv_unit_range", icu_ustr_convert_unit_range, 1);
3005
+ rb_define_method(rb_cUString, "conv_point_range", icu_ustr_convert_point_range, 1);
3006
+
3007
+ /* insert/replace */
3008
+ rb_define_method(rb_cUString, "[]=", icu_ustr_aset_m, -1);
3009
+ rb_define_method(rb_cUString, "slice!", icu_ustr_slice_bang, -1);
3010
+ rb_define_method(rb_cUString, "insert", icu_ustr_insert, 2);
3011
+
3012
+ /* conversion to String from UString */
3013
+ rb_define_method(rb_cUString, "to_u", icu_ustr_to_ustr, -1);
3014
+ rb_define_method(rb_cUString, "to_s", icu_ustr_to_rstr, -1);
3015
+ rb_define_alias(rb_cUString, "to_str", "to_s");
3016
+
3017
+ /* formatting messages */
3018
+ rb_define_method(rb_cUString, "format", icu_ustr_format, -2);
3019
+ rb_define_alias( rb_cUString, "fmt", "format");
3020
+
3021
+ /* parsing */
3022
+ rb_define_method(rb_cUString, "parse_date", icu_ustr_parse_date, 2);
3023
+ rb_define_method(rb_cUString, "to_f", icu_ustr_parse_double, -1);
3024
+
3025
+ /* transliteration */
3026
+ rb_define_method(rb_cUString, "translit", icu_ustr_translit, -1);
3027
+
3028
+ /* unescaping */
3029
+ rb_define_method(rb_cUString, "unescape", icu_ustr_unescape, 0);
3030
+
3031
+ /* regexp matching and replacing */
3032
+ rb_define_method(rb_cUString, "=~", icu_ustr_match, 1);
3033
+ rb_define_method(rb_cUString, "match", icu_ustr_match_m, 1);
3034
+ rb_define_method(rb_cUString, "scan", icu_ustr_scan, 1);
3035
+ rb_define_method(rb_cUString, "split", icu_ustr_split_m, -1);
3036
+ rb_define_method(rb_cUString, "sub", icu_ustr_sub, -1);
3037
+ rb_define_method(rb_cUString, "sub!", icu_ustr_sub_bang, -1);
3038
+ rb_define_method(rb_cUString, "gsub", icu_ustr_gsub, -1);
3039
+ rb_define_method(rb_cUString, "gsub!", icu_ustr_gsub_bang, -1);
3040
+
3041
+ }
3042
+