icu4r 0.1.3.2006.01.26

Sign up to get free protection for your applications and to get access to all the features.
data/ustring.c ADDED
@@ -0,0 +1,3042 @@
1
+ /**
2
+ * ustring.c - ICU based Unicode string support.
3
+ *
4
+ * $Id: ustring.c,v 1.20 2006/01/23 14:26:45 meadow Exp $
5
+ *
6
+ * Copyright (c) 2006 Nikolai Lugovoi
7
+ *
8
+ * This code is based on original ruby String class source (string.c):
9
+ *
10
+ * * string.c -
11
+ * *
12
+ * * Copyright (C) 1993-2003 Yukihiro Matsumoto
13
+ * * Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
14
+ * * Copyright (C) 2000 Information-technology Promotion Agency, Japan
15
+ * *
16
+ **/
17
+
18
+ #include "icu_common.h"
19
+ VALUE icu_ustr_replace(VALUE str, VALUE str2);
20
+ VALUE ustr_gsub(int argc, VALUE * argv, VALUE str, int bang, int once);
21
+ extern VALUE icu_from_rstr(int argc, VALUE * argv, VALUE str);
22
+
23
+ VALUE rb_cURegexp;
24
+ VALUE rb_cUString;
25
+ VALUE rb_cUMatch;
26
+ VALUE rb_cUResourceBundle;
27
+ VALUE rb_cULocale;
28
+ VALUE rb_cUCalendar;
29
+
30
+ #include "uregex.h"
31
+
32
+
33
+ /* to be used in <=>, casecmp */
34
+ static UCollator * s_UCA_collator, * s_case_UCA_collator;
35
+
36
+ static void
37
+ free_ustr(str)
38
+ ICUString *str;
39
+ {
40
+ if (str->ptr)
41
+ free(str->ptr);
42
+ str->ptr = 0;
43
+ free(str);
44
+ }
45
+ inline void icu_check_frozen(VALUE str)
46
+ {
47
+ rb_check_frozen(str);
48
+ if(USTRING(str)->busy) rb_raise(rb_eRuntimeError, "String is busy. Can't modify");
49
+ }
50
+ #define START_BUF_LEN 16
51
+ /**
52
+ * Allocate ICUString struct with given +capa+ capacity,
53
+ * if mode == 1 and UChar != 0 - copy len UChars from src,
54
+ * else set pointer to src.
55
+ */
56
+ #define ICU_COPY 1
57
+ #define ICU_SET 0
58
+ VALUE icu_ustr_alloc_and_wrap(UChar * src, long len, long capa, int mode)
59
+ {
60
+ ICUString *n_str = ALLOC_N(ICUString, 1);
61
+ size_t alloc_capa;
62
+ if( mode == ICU_COPY ) {
63
+ alloc_capa = START_BUF_LEN > capa ? START_BUF_LEN : capa;
64
+ if(alloc_capa<=len) alloc_capa = len + 1;
65
+ n_str->ptr = ALLOC_N(UChar, alloc_capa);
66
+ n_str->capa = alloc_capa;
67
+ n_str->len = len;
68
+ if( src ) {
69
+ u_memcpy(n_str->ptr, src, len);
70
+ n_str->ptr[len] = 0;
71
+ }
72
+ } else {
73
+ n_str->ptr = src;
74
+ n_str->len = len;
75
+ n_str->capa = capa;
76
+ }
77
+ if(n_str->capa <= n_str->len) rb_raise(rb_eRuntimeError, "Capacity is not large then len, sentinel can't be set!");
78
+ n_str->busy = 0;
79
+ n_str->ptr[n_str->len] = 0;
80
+ return Data_Wrap_Struct(rb_cUString, 0, free_ustr, n_str);
81
+ }
82
+ VALUE
83
+ icu_ustr_alloc(klass)
84
+ VALUE klass;
85
+ {
86
+ return icu_ustr_alloc_and_wrap(NULL, 0, 0, ICU_COPY);
87
+ }
88
+
89
+ void ustr_capa_resize(ICUString * str, long new_capa)
90
+ {
91
+ if (new_capa != str->capa) {
92
+ if (str->capa < new_capa || (str->capa - new_capa > 1024)) {
93
+ if(new_capa < START_BUF_LEN) new_capa = START_BUF_LEN;
94
+ REALLOC_N(str->ptr, UChar, new_capa);
95
+ str->capa = new_capa;
96
+ }
97
+ }
98
+ }
99
+ /* delete +del_len+ units from string and insert replacement */
100
+ void ustr_splice_units(ICUString * str, long start, long del_len, const UChar * replacement, long repl_len)
101
+ {
102
+ long new_len;
103
+ UChar * temp = 0 ;
104
+ if( str->busy ) {
105
+ rb_warn("Attempt to modify busy string. Ignored");
106
+ return;
107
+ }
108
+ if( repl_len < 0) return;
109
+ if( del_len == 0 && repl_len == 0) return;
110
+ new_len = str->len - del_len + repl_len;
111
+ if (replacement == str->ptr ) {
112
+ temp = ALLOC_N(UChar, repl_len);
113
+ u_memcpy(temp, replacement, repl_len);
114
+ replacement = temp;
115
+ }
116
+ if ( repl_len >= del_len) ustr_capa_resize(str, new_len+1);
117
+ /* move tail */
118
+ if(str->len - (start+del_len) > 0) {
119
+ u_memmove(str->ptr + start+repl_len, str->ptr + start+del_len, str->len-(start+del_len) );
120
+ }
121
+ /* copy string */
122
+ if( repl_len > 0) u_memcpy(str->ptr+start, replacement, repl_len);
123
+ if ( repl_len < del_len) ustr_capa_resize(str, new_len+1);
124
+ str->len = new_len;
125
+ str->ptr[new_len] = 0;
126
+ if(temp) {
127
+ free(temp);
128
+ }
129
+ }
130
+ static inline void
131
+ ustr_mod_check(VALUE s, UChar *p, long len)
132
+ {
133
+ if (ICU_PTR(s) != p || ICU_LEN(s) != len){
134
+ rb_raise(rb_eRuntimeError, "string modified");
135
+ }
136
+ }
137
+ VALUE
138
+ ustr_new(klass, ptr, len)
139
+ VALUE klass;
140
+ UChar *ptr;
141
+ long len;
142
+ {
143
+ if (len < 0) {
144
+ rb_raise(rb_eArgError, "negative string size (or size too big)");
145
+ }
146
+ return icu_ustr_alloc_and_wrap(ptr, len, len+1, ICU_COPY);
147
+ }
148
+
149
+ VALUE
150
+ icu_ustr_new(ptr, len)
151
+ const UChar *ptr;
152
+ long len;
153
+ {
154
+ return ustr_new(rb_cUString, ptr, len);
155
+ }
156
+ VALUE
157
+ icu_ustr_new_set(ptr, len, capa)
158
+ UChar *ptr;
159
+ long len;
160
+ long capa;
161
+ {
162
+ return icu_ustr_alloc_and_wrap(ptr, len, capa, ICU_SET);
163
+ }
164
+ VALUE
165
+ icu_ustr_new2(ptr)
166
+ const UChar *ptr;
167
+ {
168
+ if (!ptr) {
169
+ rb_raise(rb_eArgError, "NULL pointer given");
170
+ }
171
+ return icu_ustr_new(ptr, u_strlen(ptr));
172
+ }
173
+
174
+ inline VALUE
175
+ icu_ustr_new_capa(UChar * ptr, long len, long capa)
176
+ {
177
+ return icu_ustr_alloc_and_wrap(ptr, len, capa, ICU_COPY);
178
+ }
179
+
180
+ /* ------------ */
181
+
182
+ /**
183
+ * call-seq:
184
+ * UString.new(str="".u) => new_str
185
+ *
186
+ * Returns a new string object containing a copy of <i>str</i>.
187
+ */
188
+
189
+ VALUE
190
+ icu_ustr_init(argc, argv, str)
191
+ int argc;
192
+ VALUE *argv;
193
+ VALUE str;
194
+ {
195
+ VALUE orig;
196
+
197
+ if (rb_scan_args(argc, argv, "01", &orig) == 1)
198
+ {
199
+ icu_ustr_replace(str, orig);
200
+ }
201
+ return str;
202
+ }
203
+
204
+ /**
205
+ * call-seq:
206
+ * str.length => integer
207
+ *
208
+ * Returns the length of <i>str</i>.
209
+ */
210
+ VALUE
211
+ icu_ustr_length(str)
212
+ VALUE str;
213
+ {
214
+ return LONG2NUM(ICU_LEN(str));
215
+ }
216
+
217
+ /**
218
+ * call-seq:
219
+ * str.empty? => true or false
220
+ *
221
+ * Returns <code>true</code> if <i>str</i> has a length of zero.
222
+ *
223
+ * "hello".u.empty? #=> false
224
+ * "".u.empty? #=> true
225
+ */
226
+
227
+ VALUE
228
+ icu_ustr_empty(str)
229
+ VALUE str;
230
+ {
231
+ return 0 == ICU_LEN(str) ? Qtrue : Qfalse;
232
+ }
233
+
234
+ VALUE
235
+ icu_ustr_resize(str, len)
236
+ VALUE str;
237
+ long len;
238
+ {
239
+ if (len < 0) {
240
+ rb_raise(rb_eArgError, "negative string size (or size too big)");
241
+ }
242
+ ustr_capa_resize(USTRING(str), len);
243
+ ICU_LEN(str) = len;
244
+ ICU_PTR(str)[len] = 0; /* sentinel */
245
+ return str;
246
+ }
247
+
248
+
249
+ /**
250
+ * call-seq:
251
+ * str.replace(other_str) => str
252
+ *
253
+ * Replaces the contents and taintedness of <i>str</i> with the corresponding
254
+ * values in <i>other_str</i>.
255
+ *
256
+ * s = "hello".u #=> "hello"
257
+ * s.replace "world".u #=> "world"
258
+ */
259
+ VALUE
260
+ icu_ustr_replace(str, str2)
261
+ VALUE str,
262
+ str2;
263
+ {
264
+ if (str == str2)
265
+ return str;
266
+ icu_check_frozen(str);
267
+ Check_Class(str2, rb_cUString);
268
+ ustr_splice_units(USTRING(str), 0, ICU_LEN(str), ICU_PTR(str2), ICU_LEN(str2));
269
+ OBJ_INFECT(str, str2);
270
+ return str;
271
+ }
272
+
273
+ /**
274
+ * call-seq:
275
+ * string.clear -> string
276
+ *
277
+ * Makes string empty.
278
+ *
279
+ * a = "abcde".u
280
+ * a.clear #=> ""
281
+ */
282
+
283
+ VALUE
284
+ icu_ustr_clear(str)
285
+ VALUE str;
286
+ {
287
+ icu_check_frozen(str);
288
+ icu_ustr_resize(str, 0);
289
+ return str;
290
+ }
291
+
292
+ int icu_collator_cmp (UCollator * collator, VALUE str1, VALUE str2)
293
+ {
294
+ int ret = 0, result ;
295
+ result = ucol_strcoll(collator, ICU_PTR(str1), ICU_LEN(str1), ICU_PTR(str2), ICU_LEN(str2));
296
+ switch(result){
297
+ case UCOL_EQUAL: ret = 0;break;
298
+ case UCOL_GREATER: ret = 1;break;
299
+ case UCOL_LESS: ret = -1;break;
300
+ }
301
+ return ret;
302
+ }
303
+
304
+ int
305
+ icu_ustr_cmp(str1, str2)
306
+ VALUE str1,
307
+ str2;
308
+ {
309
+ return icu_collator_cmp(s_UCA_collator, str1, str2);
310
+ }
311
+
312
+ /**
313
+ * call-seq:
314
+ * str == obj => true or false
315
+ *
316
+ * Equality---If <i>obj</i> is not a <code>UString</code>, returns
317
+ * <code>false</code>. Otherwise, returns <code>true</code> if
318
+ * strings are of the same length and content
319
+ *
320
+ */
321
+
322
+ VALUE
323
+ icu_ustr_equal(str1, str2)
324
+ VALUE str1,
325
+ str2;
326
+ {
327
+ if (str1 == str2)
328
+ return Qtrue;
329
+ if (CLASS_OF(str2) != rb_cUString) {
330
+ return Qfalse;
331
+ }
332
+ if (ICU_LEN(str1) == ICU_LEN(str2) &&
333
+ u_strncmp(ICU_PTR(str1), ICU_PTR(str2), ICU_LEN(str1) ) == 0) {
334
+ return Qtrue;
335
+ }
336
+ return Qfalse;
337
+ }
338
+
339
+ /**
340
+ * call-seq:
341
+ * str <=> other_str => -1, 0, +1
342
+ *
343
+ * Comparison---Returns -1 if <i>other_str</i> is less than, 0 if
344
+ * <i>other_str</i> is equal to, and +1 if <i>other_str</i> is greater than
345
+ * <i>str</i>.
346
+ *
347
+ * <code><=></code> is the basis for the methods <code><</code>,
348
+ * <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
349
+ * included from module <code>Comparable</code>. The method
350
+ * <code>String#==</code> does not use <code>Comparable#==</code>.
351
+ *
352
+ * This method uses UCA rules, see also #strcoll for locale-specific string collation.
353
+ *
354
+ * "abcdef".u <=> "abcde".u #=> 1
355
+ * "abcdef".u <=> "abcdef".u #=> 0
356
+ * "abcdef".u <=> "abcdefg".u #=> -1
357
+ * "abcdef".u <=> "ABCDEF".u #=> -1
358
+ */
359
+
360
+ VALUE
361
+ icu_ustr_cmp_m(str1, str2)
362
+ VALUE str1,
363
+ str2;
364
+ {
365
+ long result;
366
+
367
+ if (CLASS_OF(str2) != rb_cUString) {
368
+ return Qnil;
369
+ } else {
370
+ result = icu_ustr_cmp(str1, str2);
371
+ }
372
+ return LONG2NUM(result);
373
+ }
374
+
375
+ /**
376
+ * call-seq:
377
+ * str.casecmp(other_str) => -1, 0, +1
378
+ *
379
+ * Case-insensitive version of <code>UString#<=></code> .
380
+ * This method uses UCA collator with secondary strength, see #strcoll
381
+ *
382
+ *
383
+ * "abcdef".u.casecmp("abcde".u) #=> 1
384
+ * "aBcDeF".u.casecmp("abcdef".u) #=> 0
385
+ * "abcdef".u.casecmp("abcdefg".u) #=> -1
386
+ * "abcdef".u.casecmp("ABCDEF".u) #=> 0
387
+ */
388
+
389
+ VALUE
390
+ icu_ustr_casecmp(str1, str2)
391
+ VALUE str1,
392
+ str2;
393
+ {
394
+ Check_Class(str2, rb_cUString);
395
+ return INT2FIX(icu_collator_cmp(s_case_UCA_collator, str1, str2));
396
+ }
397
+
398
+ /**
399
+ * call-seq:
400
+ * str + other_str => new_str
401
+ *
402
+ * Concatenation---Returns a new <code>UString</code> containing
403
+ * <i>other_str</i> concatenated to <i>str</i>.
404
+ *
405
+ * "Hello from ".u + "main".u #=> "Hello from main"
406
+ */
407
+
408
+ VALUE
409
+ icu_ustr_plus(str1, str2)
410
+ VALUE str1,
411
+ str2;
412
+ {
413
+ VALUE str3;
414
+ Check_Class(str2, rb_cUString);
415
+
416
+ str3 = icu_ustr_new_capa(ICU_PTR(str1), ICU_LEN(str1), ICU_LEN(str1) + ICU_LEN(str2));
417
+ ustr_splice_units(USTRING(str3), ICU_LEN(str3), 0, ICU_PTR(str2), ICU_LEN(str2));
418
+ if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
419
+ OBJ_TAINT(str3);
420
+ return str3;
421
+ }
422
+
423
+ /**
424
+ * call-seq:
425
+ * str * integer => new_str
426
+ *
427
+ * Copy---Returns a new <code>UString</code> containing <i>integer</i> copies of
428
+ * the receiver.
429
+ *
430
+ * "Ho! ".u * 3 #=> "Ho! Ho! Ho! ".u
431
+ */
432
+
433
+ VALUE
434
+ icu_ustr_times(str, times)
435
+ VALUE str,
436
+ times;
437
+ {
438
+ VALUE str2;
439
+ long i,
440
+ len;
441
+ Check_Type(times, T_FIXNUM);
442
+ len = NUM2LONG(times);
443
+ if (len < 0) {
444
+ rb_raise(rb_eArgError, "negative argument");
445
+ }
446
+ if (len && LONG_MAX / len < ICU_LEN(str)) {
447
+ rb_raise(rb_eArgError, "argument too big");
448
+ }
449
+
450
+ str2 = icu_ustr_new_capa(0, 0, len *= ICU_LEN(str));
451
+ for (i = 0; i < len; i += ICU_LEN(str)) {
452
+ ustr_splice_units(USTRING(str2), i, 0, ICU_PTR(str), ICU_LEN(str));
453
+ }
454
+ ICU_PTR(str2)[ICU_LEN(str2)] = 0;
455
+
456
+ OBJ_INFECT(str2, str);
457
+
458
+ return str2;
459
+ }
460
+
461
+
462
+ /**
463
+ * call-seq:
464
+ * str << other_str => str
465
+ * str.concat(other_str) => str
466
+ *
467
+ * Append---Concatenates the given string object to <i>str</i>.
468
+ *
469
+ * a = "hello ".u
470
+ * a << "world".u #=> "hello world"
471
+ */
472
+
473
+ VALUE
474
+ icu_ustr_concat(str1, str2)
475
+ VALUE str1,
476
+ str2;
477
+ {
478
+ icu_check_frozen(str1);
479
+ Check_Class(str2, rb_cUString);
480
+ if (ICU_LEN(str2) > 0) {
481
+ ustr_splice_units(USTRING(str1), ICU_LEN(str1), 0, ICU_PTR(str2), ICU_LEN(str2));
482
+ OBJ_INFECT(str1, str2);
483
+ }
484
+ return str1;
485
+ }
486
+
487
+ int
488
+ icu_ustr_hash(str)
489
+ VALUE str;
490
+ {
491
+ register long len = ICU_LEN(str) * (sizeof(UChar));
492
+ register char *p = (char*)ICU_PTR(str);
493
+ register int key = 0;
494
+
495
+ while (len--) {
496
+ key += *p++;
497
+ key += (key << 10);
498
+ key ^= (key >> 6);
499
+ }
500
+ key += (key << 3);
501
+ key ^= (key >> 11);
502
+ key += (key << 15);
503
+ return key;
504
+ }
505
+
506
+ /**
507
+ * call-seq:
508
+ * str.hash => fixnum
509
+ *
510
+ * Return a hash based on the string's length and content.
511
+ */
512
+
513
+ VALUE
514
+ icu_ustr_hash_m(str)
515
+ VALUE str;
516
+ {
517
+ int key = icu_ustr_hash(str);
518
+ return INT2FIX(key);
519
+ }
520
+
521
+ VALUE
522
+ icu_ustr_dup(str)
523
+ VALUE str;
524
+ {
525
+ VALUE dup = icu_ustr_new(ICU_PTR(str), ICU_LEN(str));
526
+ return dup;
527
+ }
528
+
529
+ /**
530
+ * call-seq:
531
+ * str.upcase!(locale = "") => str or nil
532
+ *
533
+ * Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
534
+ * were made. This method is locale-sensitive.
535
+ */
536
+
537
+ VALUE
538
+ icu_ustr_upcase_bang(argc, argv, str)
539
+ int argc;
540
+ VALUE * argv;
541
+ VALUE str;
542
+
543
+ {
544
+ UErrorCode error = 0;
545
+ UChar *buf = 0;
546
+ long len ;
547
+ VALUE loc;
548
+ char * locale = NULL;
549
+ icu_check_frozen(str);
550
+ buf = ALLOC_N(UChar, ICU_LEN(str) + 1);
551
+ if (rb_scan_args(argc, argv, "01", &loc) == 1) {
552
+ if( loc != Qnil) {
553
+ Check_Type(loc, T_STRING);
554
+ locale = RSTRING(loc)->ptr;
555
+ }
556
+ }
557
+
558
+ len = u_strToUpper(buf, ICU_LEN(str), ICU_PTR(str), ICU_LEN(str), locale, &error);
559
+ if (U_BUFFER_OVERFLOW_ERROR == error) {
560
+ REALLOC_N(buf, UChar, len + 1);
561
+ error = 0;
562
+ len =
563
+ u_strToUpper(buf, len, ICU_PTR(str), ICU_LEN(str), locale, &error);
564
+ }
565
+ if (0 == u_strncmp(buf, ICU_PTR(str), len))
566
+ return Qnil;
567
+ free(ICU_PTR(str));
568
+ ICU_PTR(str) = buf;
569
+ ICU_LEN(str) = len;
570
+ return str;
571
+ }
572
+
573
+
574
+ /**
575
+ * call-seq:
576
+ * str.upcase(locale = "") => new_str
577
+ *
578
+ * Returns a copy of <i>str</i> with all lowercase letters replaced with their
579
+ * uppercase counterparts. The operation is locale sensitive.
580
+ *
581
+ * "hEllO".u.upcase #=> "HELLO"
582
+ */
583
+
584
+ VALUE
585
+ icu_ustr_upcase(argc, argv, str)
586
+ int argc;
587
+ VALUE * argv;
588
+ VALUE str;
589
+
590
+ {
591
+ str = icu_ustr_dup(str);
592
+ icu_ustr_upcase_bang(argc, argv, str);
593
+ return str;
594
+ }
595
+
596
+
597
+ /**
598
+ * call-seq:
599
+ * str.downcase!(locale = "") => str or nil
600
+ *
601
+ * Downcases the contents of <i>str</i>, returning <code>nil</code> if no
602
+ * changes were made.
603
+ */
604
+
605
+ VALUE
606
+ icu_ustr_downcase_bang(argc, argv, str)
607
+ int argc;
608
+ VALUE * argv;
609
+ VALUE str;
610
+ {
611
+ UErrorCode error = 0;
612
+ UChar *buf;
613
+ long len ;
614
+ VALUE loc;
615
+ char * locale = NULL;
616
+ buf = ALLOC_N(UChar, ICU_LEN(str) + 1);
617
+ icu_check_frozen(str);
618
+ if (rb_scan_args(argc, argv, "01", &loc) == 1) {
619
+ if( loc != Qnil) {
620
+ Check_Type(loc, T_STRING);
621
+ locale = RSTRING(loc)->ptr;
622
+ }
623
+ }
624
+ len =
625
+ u_strToLower(buf, ICU_LEN(str), ICU_PTR(str), ICU_LEN(str), locale,
626
+ &error);
627
+ if (U_BUFFER_OVERFLOW_ERROR == error) {
628
+ REALLOC_N(buf, UChar, len + 1);
629
+ error = 0;
630
+ len =
631
+ u_strToLower(buf, len , ICU_PTR(str), ICU_LEN(str), locale,
632
+ &error);
633
+ }
634
+ if (0 == u_strncmp(buf, ICU_PTR(str), len))
635
+ return Qnil;
636
+ free(ICU_PTR(str));
637
+ ICU_PTR(str) = buf;
638
+ ICU_LEN(str) = len;
639
+ return str;
640
+ }
641
+
642
+ /**
643
+ * call-seq:
644
+ * str.downcase(locale = "") => new_str
645
+ *
646
+ * Returns a copy of <i>str</i> with all uppercase letters replaced with their
647
+ * lowercase counterparts. The operation is locale sensitive.
648
+ *
649
+ * "hEllO".u.downcase #=> "hello"
650
+ */
651
+
652
+ VALUE
653
+ icu_ustr_downcase(argc, argv, str)
654
+ int argc;
655
+ VALUE * argv;
656
+ VALUE str;
657
+ {
658
+ str = icu_ustr_dup(str);
659
+ icu_ustr_downcase_bang(argc, argv, str);
660
+ return str;
661
+ }
662
+
663
+ /**
664
+ * call-seq:
665
+ * str.foldcase
666
+ *
667
+ * Case-fold the characters in a string.
668
+ * Case-folding is locale-independent and not context-sensitive.
669
+ *
670
+ */
671
+ VALUE
672
+ icu_ustr_foldcase(str)
673
+ VALUE str;
674
+ {
675
+ UErrorCode error = 0;
676
+ UChar *buf;
677
+ long len, capa ;
678
+ capa = ICU_LEN(str) + 1;
679
+ buf = ALLOC_N(UChar, capa);
680
+ len = u_strFoldCase(buf, capa-1, ICU_PTR(str), ICU_LEN(str), U_FOLD_CASE_DEFAULT, &error);
681
+ if (U_BUFFER_OVERFLOW_ERROR == error) {
682
+ capa = len + 1;
683
+ REALLOC_N(buf, UChar, len + 1);
684
+ error = 0;
685
+ len = u_strFoldCase(buf, capa, ICU_PTR(str), ICU_LEN(str), U_FOLD_CASE_DEFAULT, &error);
686
+ }
687
+ return icu_ustr_new_set(buf, len, capa) ;
688
+ }
689
+
690
+ static long
691
+ icu_ustr_index(str, sub, offset)
692
+ VALUE str,
693
+ sub;
694
+ long offset;
695
+ {
696
+ long pos;
697
+ UChar *found;
698
+ if (offset < 0) {
699
+ offset += ICU_LEN(str);
700
+ if (offset < 0)
701
+ return -1;
702
+ }
703
+ if (ICU_LEN(str) - offset < ICU_LEN(sub))
704
+ return -1;
705
+ if (ICU_LEN(sub) == 0)
706
+ return offset;
707
+ found =
708
+ u_strFindFirst(ICU_PTR(str) + offset, ICU_LEN(str) - offset,
709
+ ICU_PTR(sub), ICU_LEN(sub));
710
+ if (NULL == found)
711
+ return -1;
712
+ pos = found - (ICU_PTR(str) + offset);
713
+ return pos + offset;
714
+ }
715
+
716
+ /**
717
+ * call-seq:
718
+ * str.index(substring [, offset]) => fixnum or nil
719
+ * str.index(regexp [, offset]) => fixnum or nil
720
+ *
721
+ * Returns the index of the first occurrence of the given <i>substring</i>,
722
+ * or pattern (<i>regexp</i>) in <i>str</i>. Returns
723
+ * <code>nil</code> if not found. If the second parameter is present, it
724
+ * specifies the position in the string to begin the search.
725
+ *
726
+ * "hello".u.index('e'.u) #=> 1
727
+ * "hello".u.index('lo'.u) #=> 3
728
+ * "hello".u.index('a'.u) #=> nil
729
+ * "hello".u.index(/[aeiou]/.U, -3) #=> 4
730
+ */
731
+
732
+ VALUE
733
+ icu_ustr_index_m(argc, argv, str)
734
+ int argc;
735
+ VALUE *argv;
736
+ VALUE str;
737
+ {
738
+ VALUE sub;
739
+ VALUE initpos;
740
+ long pos ;
741
+ int processed = 0;
742
+
743
+ if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
744
+ pos = NUM2LONG(initpos);
745
+ } else {
746
+ pos = 0;
747
+ }
748
+ if (pos < 0) {
749
+ pos += ICU_LEN(str);
750
+ }
751
+
752
+ if( CLASS_OF(sub) == rb_cUString) {
753
+ pos = icu_ustr_index(str, sub, pos);
754
+ processed = 1;
755
+ }
756
+ if( CLASS_OF(sub) == rb_cURegexp) {
757
+ pos = icu_reg_search(sub, str, pos, 0);
758
+ processed = 1;
759
+ }
760
+ if(! processed ) {
761
+ rb_raise(rb_eTypeError, "Wrong Type, expected UString or URegexp, got %s", rb_class2name(CLASS_OF(sub)));
762
+ }
763
+
764
+ if (pos == -1)
765
+ return Qnil;
766
+ return LONG2NUM(pos);
767
+ }
768
+
769
+ static long
770
+ icu_ustr_rindex(str, sub, pos)
771
+ VALUE str,
772
+ sub;
773
+ long pos;
774
+ {
775
+ long len = ICU_LEN(sub);
776
+ UChar *found;
777
+
778
+ /*
779
+ * substring longer than string
780
+ */
781
+ if (ICU_LEN(str) < len)
782
+ return -1;
783
+ if (ICU_LEN(str) - pos < len) {
784
+ pos = ICU_LEN(str) - len;
785
+ }
786
+ found = u_strFindLast(ICU_PTR(str), pos, ICU_PTR(sub), ICU_LEN(sub));
787
+ if (NULL == found)
788
+ return -1;
789
+ pos = found - (ICU_PTR(str));
790
+ return pos;
791
+ }
792
+
793
+
794
+ /**
795
+ * call-seq:
796
+ * str.rindex(substring [, fixnum]) => fixnum or nil
797
+ * str.rindex(regexp [, fixnum]) => fixnum or nil
798
+ *
799
+ * Returns the index of the last occurrence of the given <i>substring</i>,
800
+ * or pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
801
+ * found. If the second parameter is present, it specifies the position in the
802
+ * string to end the search---characters beyond this point will not be considered.
803
+ *
804
+ * "hello".u.rindex('e') #=> 1
805
+ * "hello".u.rindex('l') #=> 3
806
+ * "hello".u.rindex('a') #=> nil
807
+ * "hello".u.rindex(/[aeiou]/.U, -2) #=> 1
808
+ */
809
+
810
+ VALUE
811
+ icu_ustr_rindex_m(argc, argv, str)
812
+ int argc;
813
+ VALUE *argv;
814
+ VALUE str;
815
+ {
816
+ VALUE sub;
817
+ VALUE position;
818
+ long pos;
819
+
820
+ if (rb_scan_args(argc, argv, "11", &sub, &position) == 2) {
821
+ pos = NUM2LONG(position);
822
+ if (pos < 0) {
823
+ pos += ICU_LEN(str);
824
+ if (pos < 0) {
825
+ return Qnil;
826
+ }
827
+ }
828
+ if (pos > ICU_LEN(str))
829
+ pos = ICU_LEN(str);
830
+ } else {
831
+ pos = ICU_LEN(str);
832
+ }
833
+
834
+ switch (TYPE(sub)) {
835
+ case T_DATA:
836
+ if (CLASS_OF(sub) == rb_cUString) {
837
+ pos = icu_ustr_rindex(str, sub, pos);
838
+ if (pos >= 0)
839
+ return LONG2NUM(pos);
840
+ break;
841
+ }
842
+ if (CLASS_OF(sub) == rb_cURegexp) {
843
+ pos = icu_reg_search(sub, str, pos, 1);
844
+ if (pos >= 0)
845
+ return LONG2NUM(pos);
846
+ break;
847
+ }
848
+
849
+ default:
850
+ rb_raise(rb_eTypeError, "type mismatch: %s given",
851
+ rb_obj_classname(sub));
852
+ }
853
+ return Qnil;
854
+ }
855
+
856
+ /**
857
+ * call-seq:
858
+ * str.lstrip! => self or nil
859
+ *
860
+ * Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
861
+ * change was made. See also <code>UString#rstrip!</code> and
862
+ * <code>UString#strip!</code>, in all these methods whitespace is an
863
+ * Unicode char that has White_Space property.
864
+ *
865
+ * " hello ".u.lstrip #=> "hello "
866
+ * "hello".u.lstrip! #=> nil
867
+ */
868
+
869
+ VALUE
870
+ icu_ustr_lstrip_bang(str)
871
+ VALUE str;
872
+ {
873
+ UChar *s;
874
+ int32_t i,
875
+ n,
876
+ c;
877
+ icu_check_frozen(str);
878
+ s = ICU_PTR(str);
879
+ n = ICU_LEN(str);
880
+ if (!s || n == 0)
881
+ return Qnil;
882
+ /*
883
+ * remove spaces at head
884
+ */
885
+ i = 0;
886
+ U16_GET(s, 0, i, n, c); /* care about surrogates */
887
+ while (i < n && u_isUWhiteSpace(c)) {
888
+ U16_NEXT(s, i, n, c); /* care surr */
889
+ }
890
+
891
+ if (i > 0) {
892
+ if(! u_isUWhiteSpace(c)) --i;
893
+ ICU_LEN(str) = n - i;
894
+ u_memmove(ICU_PTR(str), s + i, ICU_LEN(str));
895
+ ICU_PTR(str)[ICU_LEN(str)] = 0;
896
+ return str;
897
+ }
898
+ return Qnil;
899
+ }
900
+
901
+
902
+ /**
903
+ * call-seq:
904
+ * str.lstrip => new_str
905
+ *
906
+ * Returns a copy of <i>str</i> with leading whitespace removed. See also
907
+ * <code>UString#rstrip</code> and <code>UString#strip</code>.
908
+ *
909
+ * " hello ".u.lstrip #=> "hello "
910
+ * "hello".u.lstrip #=> "hello"
911
+ */
912
+
913
+ VALUE
914
+ icu_ustr_lstrip(str)
915
+ VALUE str;
916
+ {
917
+ str = icu_ustr_dup(str);
918
+ icu_ustr_lstrip_bang(str);
919
+ return str;
920
+ }
921
+
922
+
923
+ /**
924
+ * call-seq:
925
+ * str.rstrip! => self or nil
926
+ *
927
+ * Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
928
+ * no change was made. See also <code>UString#lstrip!</code> and
929
+ * <code>UString#strip!</code>.
930
+ *
931
+ * " hello ".u.rstrip #=> " hello"
932
+ * "hello".u.rstrip! #=> nil
933
+ */
934
+
935
+ VALUE
936
+ icu_ustr_rstrip_bang(str)
937
+ VALUE str;
938
+ {
939
+ UChar *s;
940
+ int32_t i,
941
+ n,
942
+ c;
943
+
944
+ icu_check_frozen(str);
945
+ s = ICU_PTR(str);
946
+ n = ICU_LEN(str);
947
+
948
+ if (!s || n == 0)
949
+ return Qnil;
950
+ i = n - 1;
951
+
952
+ U16_GET(s, 0, n - 1, n, c); /* care surrogates */
953
+ i = n;
954
+ /*
955
+ * remove trailing spaces
956
+ */
957
+ while (i > 0 && u_isUWhiteSpace(c)) {
958
+ U16_PREV(s, 0, i, c); /* care surrogates */
959
+ }
960
+
961
+ if (i < n) {
962
+ if(! u_isUWhiteSpace(c)) ++i;
963
+ ICU_LEN(str) = i;
964
+ ICU_PTR(str)[i] = 0;
965
+ return str;
966
+ }
967
+ return Qnil;
968
+ }
969
+
970
+
971
+ /**
972
+ * call-seq:
973
+ * str.rstrip => new_str
974
+ *
975
+ * Returns a copy of <i>str</i> with trailing whitespace removed. See also
976
+ * <code>UString#lstrip</code> and <code>UString#strip</code>.
977
+ *
978
+ * " hello ".u.rstrip #=> " hello"
979
+ * "hello".u.rstrip #=> "hello"
980
+ */
981
+
982
+ VALUE
983
+ icu_ustr_rstrip(str)
984
+ VALUE str;
985
+ {
986
+ str = icu_ustr_dup(str);
987
+ icu_ustr_rstrip_bang(str);
988
+ return str;
989
+ }
990
+
991
+
992
+ /**
993
+ * call-seq:
994
+ * str.strip! => str or nil
995
+ *
996
+ * Removes leading and trailing whitespace from <i>str</i>. Returns
997
+ * <code>nil</code> if <i>str</i> was not altered.
998
+ */
999
+
1000
+ VALUE
1001
+ icu_ustr_strip_bang(str)
1002
+ VALUE str;
1003
+ {
1004
+ VALUE l = icu_ustr_lstrip_bang(str);
1005
+ VALUE r = icu_ustr_rstrip_bang(str);
1006
+
1007
+ if (NIL_P(l) && NIL_P(r))
1008
+ return Qnil;
1009
+ return str;
1010
+ }
1011
+
1012
+
1013
+ /**
1014
+ * call-seq:
1015
+ * str.strip => new_str
1016
+ *
1017
+ * Returns a copy of <i>str</i> with leading and trailing whitespace removed.
1018
+ *
1019
+ * " hello ".u.strip #=> "hello"
1020
+ * "\tgoodbye\r\n".u.strip #=> "goodbye"
1021
+ */
1022
+
1023
+ VALUE
1024
+ icu_ustr_strip(str)
1025
+ VALUE str;
1026
+ {
1027
+ str = icu_ustr_dup(str);
1028
+ icu_ustr_strip_bang(str);
1029
+ return str;
1030
+ }
1031
+
1032
+
1033
+
1034
+ /* ----------------------------------- */
1035
+ VALUE
1036
+ icu_ustr_normalize(str, mode)
1037
+ VALUE str;
1038
+ int32_t mode;
1039
+ {
1040
+ UErrorCode error = U_ZERO_ERROR;
1041
+ long capa = ICU_LEN(str);
1042
+ UChar *buf;
1043
+ long needed;
1044
+ VALUE ret;
1045
+ if (UNORM_YES == unorm_quickCheck(ICU_PTR(str), ICU_LEN(str), mode, &error))
1046
+ return icu_ustr_dup(str);
1047
+
1048
+ buf = ALLOC_N(UChar, capa + 20);
1049
+ do {
1050
+ error = 0;
1051
+ needed =
1052
+ unorm_normalize(ICU_PTR(str), ICU_LEN(str), mode, 0, buf, capa,
1053
+ &error);
1054
+ if (U_SUCCESS(error)) {
1055
+ ret = icu_ustr_new_set(buf, needed, capa);
1056
+ return ret;
1057
+ }
1058
+ if (error == U_BUFFER_OVERFLOW_ERROR) {
1059
+ capa = needed + 1;
1060
+ REALLOC_N(buf, UChar, capa);
1061
+ if (!buf)
1062
+ rb_raise(rb_eRuntimeError, "can't allocate memory");
1063
+ } else
1064
+ rb_raise(rb_eArgError, u_errorName(error));
1065
+ }
1066
+ while (1);
1067
+ }
1068
+
1069
+ /**
1070
+ * UNORM_NFKC Compatibility decomposition followed by canonical
1071
+ * composition.
1072
+ */
1073
+ VALUE
1074
+ icu_ustr_normalize_KC(str)
1075
+ VALUE str;
1076
+ {
1077
+ return icu_ustr_normalize(str, UNORM_NFKC);
1078
+ }
1079
+
1080
+ /**
1081
+ * UNORM_NFKD Compatibility decomposition.
1082
+ */
1083
+ VALUE
1084
+ icu_ustr_normalize_KD(str)
1085
+ VALUE str;
1086
+ {
1087
+ return icu_ustr_normalize(str, UNORM_NFKD);
1088
+ }
1089
+
1090
+ /**
1091
+ * UNORM_NFD Canonical decomposition.
1092
+ */
1093
+ VALUE
1094
+ icu_ustr_normalize_D(str)
1095
+ VALUE str;
1096
+ {
1097
+ return icu_ustr_normalize(str, UNORM_NFD);
1098
+ }
1099
+
1100
+ /**
1101
+ * UNORM_FCD
1102
+ */
1103
+ VALUE
1104
+ icu_ustr_normalize_FCD(VALUE str)
1105
+ {
1106
+ return icu_ustr_normalize(str, UNORM_FCD);
1107
+ }
1108
+
1109
+ /**
1110
+ * UNORM_NFC Canonical decomposition followed by canonical composition.
1111
+ */
1112
+ VALUE
1113
+ icu_ustr_normalize_C(str)
1114
+ VALUE str;
1115
+ {
1116
+ return icu_ustr_normalize(str, UNORM_NFC);
1117
+ }
1118
+
1119
+ /* UBRK_CHARACTER, UBRK_WORD, UBRK_LINE, UBRK_SENTENCE */
1120
+ VALUE
1121
+ icu_ustr_each_mode(argc, argv, str, mode)
1122
+ int argc;
1123
+ VALUE *argv;
1124
+ VALUE str;
1125
+ int32_t mode;
1126
+ {
1127
+ UErrorCode error = 0;
1128
+ UBreakIterator *boundary;
1129
+ int32_t end, start;
1130
+ VALUE loc ;
1131
+ char *locale = "";
1132
+ if( rb_scan_args(argc, argv, "01", &loc) == 1) {
1133
+ Check_Type(loc, T_STRING);
1134
+ locale = RSTRING(loc)->ptr;
1135
+ }
1136
+ boundary =
1137
+ ubrk_open(mode, locale, ICU_PTR(str), ICU_LEN(str),
1138
+ &error);
1139
+ if (U_FAILURE(error))
1140
+ rb_raise(rb_eArgError, "Error %s", u_errorName(error));
1141
+ start = ubrk_first(boundary);
1142
+ USTRING(str)->busy = 1;
1143
+ for (end = ubrk_next(boundary); end != UBRK_DONE;
1144
+ start = end, end = ubrk_next(boundary)) {
1145
+ rb_yield(icu_ustr_new(ICU_PTR(str) + start, end - start));
1146
+ }
1147
+ USTRING(str)->busy = 0;
1148
+ ubrk_close(boundary);
1149
+ return str;
1150
+ }
1151
+
1152
+ /**
1153
+ * call-seq:
1154
+ * str.each_word(locale = "") {|substr| block } => str
1155
+ *
1156
+ * Word boundary analysis is used by search and replace functions, as well as within text editing
1157
+ * applications that allow the user to select words with a double click. Word selection provides
1158
+ * correct interpretation of punctuation marks within and following words. Characters that are not
1159
+ * part of a word, such as symbols or punctuation marks, have word-breaks on both sides.
1160
+ *
1161
+ */
1162
+ VALUE
1163
+ icu_ustr_each_word(argc, argv, str)
1164
+ int argc;
1165
+ VALUE *argv;
1166
+ VALUE str;
1167
+
1168
+ {
1169
+ return icu_ustr_each_mode(argc, argv, str, UBRK_WORD);
1170
+ }
1171
+
1172
+ /**
1173
+ * call-seq:
1174
+ * str.each_char(locale = "") {|substr| block } => str
1175
+ *
1176
+ * Character boundary analysis allows users to interact with characters as they expect to,
1177
+ * for example, when moving the cursor through a text string. Character boundary analysis provides
1178
+ * correct navigation of through character strings, regardless of how the character is stored.
1179
+ * For example, an accented character might be stored as a base character and a diacritical mark.
1180
+ * What users consider to be a character can differ between languages.
1181
+ *
1182
+ */
1183
+ VALUE
1184
+ icu_ustr_each_char(argc, argv, str)
1185
+ int argc;
1186
+ VALUE *argv;
1187
+ VALUE str;
1188
+
1189
+ {
1190
+ return icu_ustr_each_mode(argc, argv, str, UBRK_CHARACTER);
1191
+ }
1192
+
1193
+ /**
1194
+ * call-seq:
1195
+ * str.each_line_break(locale = "") {|substr| block } => str
1196
+ *
1197
+ * Line boundary analysis determines where a text string can be broken when line-wrapping.
1198
+ * The mechanism correctly handles punctuation and hyphenated words.
1199
+ *
1200
+ */
1201
+ VALUE
1202
+ icu_ustr_each_line(argc, argv, str)
1203
+ int argc;
1204
+ VALUE *argv;
1205
+ VALUE str;
1206
+
1207
+ {
1208
+ return icu_ustr_each_mode(argc, argv, str, UBRK_LINE);
1209
+ }
1210
+
1211
+ /**
1212
+ * call-seq:
1213
+ * str.each_sentence(locale = "") {|substr| block } => str
1214
+ *
1215
+ * Sentence boundary analysis allows selection with correct interpretation of periods
1216
+ * within numbers and abbreviations, and trailing punctuation marks such as quotation marks and parentheses.
1217
+ *
1218
+ */
1219
+ VALUE
1220
+ icu_ustr_each_sentence(argc, argv, str)
1221
+ int argc;
1222
+ VALUE *argv;
1223
+ VALUE str;
1224
+ {
1225
+ return icu_ustr_each_mode(argc, argv, str, UBRK_SENTENCE);
1226
+ }
1227
+
1228
+ /**
1229
+ * call-seq:
1230
+ * str.to_u(encoding = 'utf8') => UString
1231
+ *
1232
+ * Returns self.
1233
+ */
1234
+ VALUE
1235
+ icu_ustr_to_ustr(argc, argv, str)
1236
+ int argc;
1237
+ VALUE *argv;
1238
+ VALUE str;
1239
+ {
1240
+ return str;
1241
+ }
1242
+
1243
+ /**
1244
+ * call-seq:
1245
+ * str.to_s(encoding = 'utf8') => String
1246
+ *
1247
+ * Converts to Ruby String (byte-oriented) value in given encoding.
1248
+ * When no encoding is given, assumes UTF-8.
1249
+ */
1250
+ VALUE
1251
+ icu_ustr_to_rstr(argc, argv, str)
1252
+ int argc;
1253
+ VALUE *argv,
1254
+ str;
1255
+ {
1256
+ VALUE enc;
1257
+ char *encoding = 0; /* default */
1258
+ UErrorCode error = 0;
1259
+ UConverter *conv ;
1260
+ int enclen, needed = 0;
1261
+ char * buf;
1262
+ VALUE s;
1263
+ if (rb_scan_args(argc, argv, "01", &enc) == 1) {
1264
+ Check_Type(enc, T_STRING);
1265
+ encoding = RSTRING(enc)->ptr;
1266
+ }
1267
+
1268
+ enclen = ICU_LEN(str) + 1;
1269
+ buf = ALLOC_N(char, enclen);
1270
+
1271
+ if( !encoding || !strncmp(encoding, "utf8", 4)){
1272
+ u_strToUTF8( buf, enclen, &needed, ICU_PTR(str), ICU_LEN(str), &error);
1273
+ if (U_BUFFER_OVERFLOW_ERROR == error) {
1274
+ REALLOC_N(buf, char, needed + 1);
1275
+ error = 0;
1276
+ u_strToUTF8( buf, needed, &needed, ICU_PTR(str), ICU_LEN(str), &error);
1277
+ }
1278
+ if( U_FAILURE(error) ){
1279
+ free(buf);
1280
+ rb_raise(rb_eArgError, u_errorName(error));
1281
+ }
1282
+ s = rb_str_new(buf, needed);
1283
+
1284
+ } else {
1285
+ conv = ucnv_open(encoding, &error);
1286
+ if (U_FAILURE(error)) {
1287
+ ucnv_close(conv);
1288
+ free(buf);
1289
+ rb_raise(rb_eArgError, u_errorName(error));
1290
+ }
1291
+ enclen =
1292
+ ucnv_fromUChars(conv, buf, enclen, ICU_PTR(str), ICU_LEN(str),
1293
+ &error);
1294
+ if (U_BUFFER_OVERFLOW_ERROR == error) {
1295
+ REALLOC_N(buf, char, enclen + 1);
1296
+ error = 0;
1297
+ ucnv_fromUChars(conv, buf, enclen, ICU_PTR(str), ICU_LEN(str),
1298
+ &error);
1299
+ }
1300
+ if( U_FAILURE(error) ){
1301
+ free(buf);
1302
+ rb_raise(rb_eArgError, u_errorName(error));
1303
+ }
1304
+ s = rb_str_new(buf, enclen);
1305
+ ucnv_close(conv);
1306
+ }
1307
+ free(buf);
1308
+ return s;
1309
+ }
1310
+
1311
+ /* -------------- */
1312
+ extern VALUE icu_format(UChar * pattern, int32_t len, VALUE args,
1313
+ int32_t arg_len, char *locale);
1314
+ /**
1315
+ * call-seq:
1316
+ * str.format(locale, [*args])
1317
+ *
1318
+ * Powerful locale-sensitive message formatting. see [./docs/FORMATTING]
1319
+ *
1320
+ * Valid argument types are: +Fixnum+, +UString+, +Float+, +Time+ .
1321
+ *
1322
+ * */
1323
+ VALUE
1324
+ icu_ustr_format(str, args)
1325
+ VALUE str,
1326
+ args;
1327
+ {
1328
+ VALUE loc;
1329
+ Check_Type(args, T_ARRAY);
1330
+ loc = rb_ary_shift(args);
1331
+ Check_Type(loc, T_STRING);
1332
+ return icu_format(ICU_PTR(str), ICU_LEN(str), args, RARRAY(args)->len,
1333
+ RSTRING(loc)->ptr);
1334
+ }
1335
+
1336
+ /* ------ UString regexp related functions ---- */
1337
+
1338
+ /**
1339
+ * call-seq:
1340
+ * str =~ uregexp => UMatch or nil
1341
+ * str =~ other_str => integer or nil
1342
+ *
1343
+ * Match---If <code>URegexp</code> is given, use it as a pattern to
1344
+ * match against <i>uregexp</i> and return UMatch or +nil+.
1345
+ *
1346
+ * If <code>UString</code> is given, returns index of it
1347
+ * (similar to <code>UString#index</code>).
1348
+ *
1349
+ * Otherwise returns +nil+
1350
+ *
1351
+ * "cat o' 9 tails".u =~ '\d' #=> nil
1352
+ * "cat o' 9 tails".u =~ /\d/.U #=> #<UMatch:0xf6fb7d5c @cg=[<U000039>]>
1353
+ * "cat o' 9 tails".u =~ 9 #=> false
1354
+ * "cat o' 9 tails".u =~ '9'.u #=> 7
1355
+ */
1356
+
1357
+ VALUE
1358
+ icu_ustr_match(x, y)
1359
+ VALUE x,
1360
+ y;
1361
+ {
1362
+ long pos ;
1363
+ if (TYPE(y) == T_REGEXP){
1364
+ rb_raise(rb_eTypeError, "Wrong type: can't match against Regexp. Use URegexp instead");
1365
+ }
1366
+ if (CLASS_OF(y) == rb_cURegexp) {
1367
+ return icu_reg_match(y, x);
1368
+ } else if (CLASS_OF(y) == rb_cUString) {
1369
+ pos = icu_ustr_index(x, y, 0);
1370
+ if (pos == -1) return Qnil;
1371
+ else return LONG2NUM(pos);
1372
+ } else {
1373
+ return Qnil;
1374
+ }
1375
+ }
1376
+
1377
+ VALUE
1378
+ get_pat(pat, quote)
1379
+ VALUE pat;
1380
+ int quote;
1381
+ {
1382
+ if (CLASS_OF(pat) == rb_cURegexp)
1383
+ return pat;
1384
+
1385
+ if (CLASS_OF(pat) == rb_cUString)
1386
+ return icu_reg_comp(pat);
1387
+ Check_Class(pat, rb_cURegexp);
1388
+ return Qnil;
1389
+ }
1390
+
1391
+
1392
+ /**
1393
+ * call-seq:
1394
+ * str.match(pattern) => matchdata or nil
1395
+ *
1396
+ * Converts <i>pattern</i> to a <code>URegexp</code> (if it isn't already one),
1397
+ * then invokes its <code>match</code> method on <i>str</i>.
1398
+ *
1399
+ * 'hello'.u.match('(.)\1'.u) #=> #<UMatch:0x401b3d30>
1400
+ * 'hello'.u.match('(.)\1'.u)[0] #=> "ll"
1401
+ * 'hello'.u.match(/(.)\1/.U)[0] #=> "ll"
1402
+ * 'hello'.u.match('xx') #=> nil
1403
+ */
1404
+
1405
+ VALUE
1406
+ icu_ustr_match_m(str, re)
1407
+ VALUE str,
1408
+ re;
1409
+ {
1410
+ return rb_funcall(get_pat(re, 0), rb_intern("match"), 1, str);
1411
+ }
1412
+
1413
+ VALUE
1414
+ ustr_scan_once(str, pat, start)
1415
+ VALUE str,
1416
+ pat;
1417
+ long *start;
1418
+ {
1419
+ VALUE result;
1420
+ long i;
1421
+ long beg,
1422
+ end, num_regs;
1423
+
1424
+ if (icu_reg_search(pat, str, *start, 0) >= 0) {
1425
+ icu_reg_range(pat, 0, &beg, &end);
1426
+ if (beg == end) {
1427
+ *start = end + 1;
1428
+ } else {
1429
+ *start = end;
1430
+ }
1431
+ num_regs = icu_group_count(pat);
1432
+ if (num_regs <= 1) {
1433
+ return icu_reg_nth_match(pat, 0);
1434
+ }
1435
+ result = rb_ary_new2(num_regs);
1436
+ for (i = 1; i <= num_regs; i++) {
1437
+ rb_ary_store(result, i - 1, icu_reg_nth_match(pat, i));
1438
+ }
1439
+
1440
+ return result;
1441
+ }
1442
+ return Qnil;
1443
+ }
1444
+
1445
+
1446
+ /**
1447
+ * call-seq:
1448
+ * str.scan(pattern) => array
1449
+ * str.scan(pattern) {|match, ...| block } => str
1450
+ *
1451
+ * Both forms iterate through <i>str</i>, matching the pattern (which may be a
1452
+ * <code>URegexp</code> or a <code>UString</code>). For each match, a result is
1453
+ * generated and either added to the result array or passed to the block. If
1454
+ * the pattern contains no groups, each individual result consists of the
1455
+ * matched string. If the pattern contains groups, each
1456
+ * individual result is itself an array containing one entry per group.
1457
+ *
1458
+ * a = "cruel world".u
1459
+ * a.scan(/\w+/.U) #=> ["cruel", "world"]
1460
+ * a.scan(/.../.U) #=> ["cru", "el ", "wor"]
1461
+ * a.scan(/(...)/.U) #=> [["cru"], ["el "], ["wor"]]
1462
+ * a.scan(/(..)(..)/.U) #=> [["cr", "ue"], ["l ", "wo"]]
1463
+ *
1464
+ * And the block form:
1465
+ *
1466
+ * a.scan(/\w+/.U) {|w| print "<<#{w}>> " }
1467
+ * print "\n"
1468
+ * a.scan(/(.)(.)/.U) {|a,b| print b, a }
1469
+ * print "\n"
1470
+ *
1471
+ * <em>produces:</em>
1472
+ *
1473
+ * <<cruel>> <<world>>
1474
+ * rceu lowlr
1475
+ */
1476
+
1477
+ VALUE
1478
+ icu_ustr_scan(str, pat)
1479
+ VALUE str,
1480
+ pat;
1481
+ {
1482
+ VALUE result;
1483
+ long start = 0;
1484
+
1485
+ pat = get_pat(pat, 1);
1486
+ if (!rb_block_given_p()) {
1487
+ VALUE ary = rb_ary_new();
1488
+
1489
+ while (!NIL_P(result = ustr_scan_once(str, pat, &start))) {
1490
+ rb_ary_push(ary, result);
1491
+ }
1492
+ return ary;
1493
+ }
1494
+ USTRING(str)->busy = 1;
1495
+ while (!NIL_P(result = ustr_scan_once(str, pat, &start))) {
1496
+ rb_yield(result);
1497
+ }
1498
+ USTRING(str)->busy = 0;
1499
+ return str;
1500
+ }
1501
+ /**
1502
+ * call-seq:
1503
+ * str.char_span(start[, len, [locale]])
1504
+ *
1505
+ * Returns substring starting at <code>start</code>-th char, with <code>len</code> chars length.
1506
+ * Here "char" means "grapheme cluster", so start index and len are measured in terms of "graphemes"
1507
+ * locale parameter is optional.
1508
+ * Negative len can be supplied to receive to end of string.
1509
+ *
1510
+ * String is transformed to NFC before extract.
1511
+ */
1512
+ VALUE
1513
+ icu_ustr_char_span(int argc, VALUE * argv, VALUE str)
1514
+ {
1515
+ UErrorCode error = 0;
1516
+ int32_t end, start, char_start = 0, char_len = -1, total_chars = 0;
1517
+ int32_t init_pos = -1, end_pos = -1, n;
1518
+ char *loc = NULL;
1519
+ VALUE cs, clen, locl, out;
1520
+ UBreakIterator *boundary;
1521
+
1522
+ n = rb_scan_args(argc, argv, "12", &cs, &clen, &locl);
1523
+ Check_Type(cs, T_FIXNUM);
1524
+ char_start = FIX2INT(cs);
1525
+ if(char_start < 0) rb_raise(rb_eArgError, "Negative offset aren't allowed!");
1526
+
1527
+ if( n > 1) {
1528
+ Check_Type(clen, T_FIXNUM);
1529
+ char_len = FIX2INT(clen);
1530
+ if(char_len <= 0) char_len = -1;
1531
+ }
1532
+ if( n > 2) {
1533
+ Check_Type(locl, T_STRING);
1534
+ loc = RSTRING(locl)->ptr;
1535
+ }
1536
+ if(UNORM_YES != unorm_quickCheck(ICU_PTR(str), ICU_LEN(str), UNORM_NFC, &error) )
1537
+ str = icu_ustr_normalize_C(str);
1538
+
1539
+ boundary =
1540
+ ubrk_open(UBRK_CHARACTER, loc, ICU_PTR(str), ICU_LEN(str), &error);
1541
+ if (U_FAILURE(error))
1542
+ rb_raise(rb_eArgError, "Error %s", u_errorName(error));
1543
+
1544
+ start = ubrk_first(boundary);
1545
+ for (end = ubrk_next(boundary); end != UBRK_DONE;
1546
+ start = end, end = ubrk_next(boundary)) {
1547
+ if( total_chars == char_start ) init_pos = start;
1548
+ total_chars ++;
1549
+ if( char_len>0 && total_chars == char_start+char_len) end_pos = end;
1550
+ }
1551
+ ubrk_close(boundary);
1552
+ if( init_pos == -1) rb_raise(rb_eArgError, "Char index %d out of bounds %d", char_start, total_chars);
1553
+ if( end_pos == -1) end_pos = ICU_LEN(str); /* reached end of string */
1554
+ out = icu_ustr_new(ICU_PTR(str)+init_pos, end_pos - init_pos);
1555
+ return out;
1556
+ }
1557
+
1558
+ VALUE
1559
+ icu_ustr_chars(str, loc)
1560
+ VALUE str;
1561
+ char *loc;
1562
+ {
1563
+ UErrorCode error = 0;
1564
+ int32_t end, start;
1565
+ VALUE out;
1566
+ UBreakIterator *boundary;
1567
+ if(UNORM_YES != unorm_quickCheck(ICU_PTR(str), ICU_LEN(str), UNORM_NFC, &error) )
1568
+ str = icu_ustr_normalize_C(str);
1569
+
1570
+ boundary =
1571
+ ubrk_open(UBRK_CHARACTER, loc, ICU_PTR(str), ICU_LEN(str), &error);
1572
+ if (U_FAILURE(error))
1573
+ rb_raise(rb_eArgError, "Error %s", u_errorName(error));
1574
+
1575
+ out = rb_ary_new();
1576
+ start = ubrk_first(boundary);
1577
+ for (end = ubrk_next(boundary); end != UBRK_DONE;
1578
+ start = end, end = ubrk_next(boundary)) {
1579
+ rb_ary_push(out, icu_ustr_new(ICU_PTR(str) + start, end - start));
1580
+ }
1581
+ ubrk_close(boundary);
1582
+ return out;
1583
+ }
1584
+
1585
+ /**
1586
+ * call-seq:
1587
+ * str.chars(locale = "") => array of character
1588
+ *
1589
+ * Returns array of character graphemes, locale dependent.
1590
+ * String is transformed to NFC before split.
1591
+ * */
1592
+ VALUE
1593
+ icu_ustr_chars_m(argc, argv, str)
1594
+ int argc;
1595
+ VALUE *argv;
1596
+ VALUE str;
1597
+ {
1598
+ VALUE locale;
1599
+ if (rb_scan_args(argc, argv, "01", &locale) == 1) {
1600
+ Check_Type(locale, T_STRING);
1601
+ return icu_ustr_chars(str, RSTRING(locale)->ptr);
1602
+ } else {
1603
+ return icu_ustr_chars(str, "");
1604
+ }
1605
+ }
1606
+
1607
+ /**
1608
+ * call-seq:
1609
+ * str.split(pattern, [limit]) => anArray
1610
+ *
1611
+ * Divides <i>str</i> into substrings based on a delimiter, returning an array
1612
+ * of these substrings. <i>str</i> is divided where the
1613
+ * pattern matches.
1614
+ *
1615
+ * NOTE: split(//) or split("") is not supported.
1616
+ * To get array of chars use #chars or #codepoints methods
1617
+ *
1618
+ * If the <i>limit</i> parameter is omitted, trailing null fields are
1619
+ * suppressed. If <i>limit</i> is a positive number, at most that number of
1620
+ * fields will be returned (if <i>limit</i> is <code>1</code>, the entire
1621
+ * string is returned as the only entry in an array). If negative, there is no
1622
+ * limit to the number of fields returned, and trailing null fields are not
1623
+ * suppressed.
1624
+ *
1625
+ */
1626
+
1627
+ VALUE
1628
+ icu_ustr_split_m(argc, argv, str)
1629
+ int argc;
1630
+ VALUE *argv;
1631
+ VALUE str;
1632
+ {
1633
+ VALUE spat;
1634
+ VALUE limit;
1635
+ int lim = 0;
1636
+ VALUE result;
1637
+
1638
+ if (rb_scan_args(argc, argv, "11", &spat, &limit) == 2) {
1639
+ lim = NUM2INT(limit);
1640
+ if (lim <= 0)
1641
+ limit = Qnil;
1642
+ }
1643
+ if (CLASS_OF(spat) == rb_cURegexp) {
1644
+ result = icu_reg_split(spat, str, limit);
1645
+ } else {
1646
+ if (CLASS_OF(spat) == rb_cUString) {
1647
+ result = icu_reg_split(icu_reg_comp(spat), str, limit);
1648
+ } else {
1649
+ rb_raise(rb_eArgError, "Expected UString or URegexp, got %s",
1650
+ rb_class2name(CLASS_OF(spat)));
1651
+ }
1652
+ }
1653
+ if (NIL_P(limit) && lim == 0) {
1654
+ while (RARRAY(result)->len > 0 &&
1655
+ ICU_LEN( (RARRAY(result)->ptr[RARRAY(result)->len - 1])) == 0)
1656
+ rb_ary_pop(result);
1657
+ }
1658
+
1659
+ return result;
1660
+ }
1661
+
1662
+ /**
1663
+ * call-seq:
1664
+ * str.inspect => String
1665
+ *
1666
+ * Shows codepoints in form of \uxxxx. For debug purposes.
1667
+ */
1668
+ VALUE
1669
+ icu_ustr_inspect(str)
1670
+ VALUE str;
1671
+ {
1672
+ VALUE buf = rb_str_new2("");
1673
+ char temp[] = "\\u0010FFFF ";
1674
+ int32_t i,
1675
+ n,
1676
+ k,
1677
+ c;
1678
+ UChar *s = ICU_PTR(str);
1679
+ n = ICU_LEN(str);
1680
+ i = 0;
1681
+ while (i < n) {
1682
+ U16_NEXT(s, i, n, c); /* care surrogate */
1683
+ if(c >= 0x10000)
1684
+ k = sprintf(temp, "\\u%08X", c);
1685
+ else
1686
+ k = sprintf(temp, "\\u%04X", c);
1687
+ rb_str_cat(buf, temp, k);
1688
+ }
1689
+ return buf;
1690
+ }
1691
+
1692
+ /**
1693
+ * call-seq:
1694
+ * str.codepoints => array of fixnums
1695
+ *
1696
+ * Returns array of codepoints as fixnums.
1697
+ */
1698
+ VALUE
1699
+ icu_ustr_points(str)
1700
+ VALUE str;
1701
+ {
1702
+ VALUE buf = rb_ary_new();
1703
+ int32_t i,
1704
+ n,
1705
+ c;
1706
+ UChar *s = ICU_PTR(str);
1707
+ n = ICU_LEN(str);
1708
+ i = 0;
1709
+ while (i < n) {
1710
+ U16_NEXT(s, i, n, c); /* care surrogates */
1711
+ rb_ary_push(buf, LONG2NUM(c));
1712
+ }
1713
+ return buf;
1714
+ }
1715
+
1716
+
1717
+ /**
1718
+ * call-seq:
1719
+ * str.inspect_names => String
1720
+ *
1721
+ * Dumps names of codepoints in this UString (debug).
1722
+ */
1723
+ VALUE
1724
+ icu_ustr_inspect_names(str)
1725
+ VALUE str;
1726
+ {
1727
+ VALUE buf = rb_str_new2("");
1728
+ char temp[301];
1729
+ UErrorCode error;
1730
+ int32_t i,
1731
+ n,
1732
+ c,
1733
+ l;
1734
+ UChar *s = ICU_PTR(str);
1735
+ n = ICU_LEN(str);
1736
+ i = 0;
1737
+ while (i < n) {
1738
+ U16_NEXT(s, i, n, c) sprintf(temp, "<U%06X>", c); /* care surrogates */
1739
+ rb_str_cat(buf, temp, 9);
1740
+ error = 0;
1741
+ l = u_charName(c, U_UNICODE_CHAR_NAME, temp, 300, &error);
1742
+ rb_str_cat(buf, temp, l);
1743
+ rb_str_cat(buf, "\n", 1);
1744
+ }
1745
+ return buf;
1746
+ }
1747
+
1748
+ VALUE
1749
+ icu_ustr_subpat(str, re, nth)
1750
+ VALUE str,
1751
+ re;
1752
+ int nth;
1753
+ {
1754
+ if (icu_reg_search(re, str, 0, 0) >= 0) {
1755
+ return icu_reg_nth_match(re, nth);
1756
+ }
1757
+ return Qnil;
1758
+ }
1759
+
1760
+ /* beg len are code unit indexes*/
1761
+ VALUE
1762
+ icu_ustr_substr(str, beg, len)
1763
+ VALUE str;
1764
+ long beg,
1765
+ len;
1766
+ {
1767
+ int32_t str_size;
1768
+ str_size = ICU_LEN(str);
1769
+ if (len < 0) return Qnil;
1770
+
1771
+ if (beg > str_size) return Qnil;
1772
+ if (beg < 0) {
1773
+ beg += str_size;
1774
+ if (beg < 0) return Qnil;
1775
+ }
1776
+ if (beg + len > str_size) {
1777
+ len = str_size - beg;
1778
+ }
1779
+ if (len < 0) {
1780
+ len = 0;
1781
+ }
1782
+ if( len == 0) return icu_ustr_new(0, 0);
1783
+ /* adjust to codepoint boundaries */
1784
+ U16_SET_CP_START(ICU_PTR(str), 0, beg);
1785
+ U16_SET_CP_LIMIT(ICU_PTR(str), 0, len, ICU_LEN(str));
1786
+ return icu_ustr_new(ICU_PTR(str) + beg, len);
1787
+ }
1788
+
1789
+ VALUE
1790
+ icu_ustr_aref(str, indx)
1791
+ VALUE str;
1792
+ VALUE indx;
1793
+ {
1794
+ long idx;
1795
+ int32_t cp_len = ICU_LEN(str);
1796
+
1797
+ switch (TYPE(indx)) {
1798
+ case T_FIXNUM:
1799
+ idx = FIX2LONG(indx);
1800
+
1801
+ num_index:
1802
+ if (idx < 0) {
1803
+ idx = cp_len + idx;
1804
+ }
1805
+ if (idx < 0 || cp_len <= idx) {
1806
+ return Qnil;
1807
+ }
1808
+ return icu_ustr_substr(str, idx, 1);
1809
+
1810
+ case T_DATA:
1811
+ if (CLASS_OF(indx) == rb_cURegexp)
1812
+ return icu_ustr_subpat(str, indx, 0);
1813
+ if (CLASS_OF(indx) == rb_cUString) {
1814
+ if (icu_ustr_index(str, indx, 0) != -1)
1815
+ return icu_ustr_dup(indx);
1816
+ return Qnil;
1817
+ }
1818
+
1819
+ default:
1820
+ /*
1821
+ * check if indx is Range
1822
+ */
1823
+ {
1824
+ long beg,
1825
+ len;
1826
+ switch (rb_range_beg_len(indx, &beg, &len, cp_len, 0)) {
1827
+ case Qfalse:
1828
+ break;
1829
+ case Qnil:
1830
+ return Qnil;
1831
+ default:
1832
+ return icu_ustr_substr(str, beg, len);
1833
+ }
1834
+ }
1835
+ idx = NUM2LONG(indx);
1836
+ goto num_index;
1837
+ }
1838
+ return Qnil; /* not reached */
1839
+ }
1840
+
1841
+ /**
1842
+ * call-seq:
1843
+ * str[fixnum] => new_str or nil
1844
+ * str[fixnum, fixnum] => new_str or nil
1845
+ * str[range] => new_str or nil
1846
+ * str[regexp] => new_str or nil
1847
+ * str[regexp, fixnum] => new_str or nil
1848
+ * str[other_str] => new_str or nil
1849
+ * str.slice(fixnum) => new_str or nil
1850
+ * str.slice(fixnum, fixnum) => new_str or nil
1851
+ * str.slice(range) => new_str or nil
1852
+ * str.slice(regexp) => new_str or nil
1853
+ * str.slice(regexp, fixnum) => new_str or nil
1854
+ * str.slice(other_str) => new_str or nil
1855
+ *
1856
+ * Element Reference---If passed a single <code>Fixnum</code>, returns
1857
+ * substring with the character at that position. If passed two <code>Fixnum</code>
1858
+ * objects, returns a substring starting at the offset given by the first, and
1859
+ * a length given by the second. If given a range, a substring containing
1860
+ * characters at offsets given by the range is returned. In all three cases, if
1861
+ * an offset is negative, it is counted from the end of <i>str</i>. Returns
1862
+ * <code>nil</code> if the initial offset falls outside the string, the length
1863
+ * is negative, or the beginning of the range is greater than the end.
1864
+ *
1865
+ * If a <code>URegexp</code> is supplied, the matching portion of <i>str</i> is
1866
+ * returned. If a numeric parameter follows the regular expression, that
1867
+ * component of the <code>UMatch</code> is returned instead. If a
1868
+ * <code>UString</code> is given, that string is returned if it occurs in
1869
+ * <i>str</i>. In both cases, <code>nil</code> is returned if there is no
1870
+ * match.
1871
+ *
1872
+ * a = "hello there".u
1873
+ * a[1] #=> 'e'
1874
+ * a[1,3] #=> "ell"
1875
+ * a[1..3] #=> "ell"
1876
+ * a[-3,2] #=> "er"
1877
+ * a[-4..-2] #=> "her"
1878
+ * a[12..-1] #=> nil
1879
+ * a[-2..-4] #=> ""
1880
+ * a[/[aeiou](.)\1/.U] #=> "ell"
1881
+ * a[/[aeiou](.)\1/.U, 0] #=> "ell"
1882
+ * a[/[aeiou](.)\1/.U, 1] #=> "l"
1883
+ * a[/[aeiou](.)\1/.U, 2] #=> nil
1884
+ * a["lo".u] #=> "lo"
1885
+ * a["bye".u] #=> nil
1886
+ */
1887
+
1888
+ VALUE
1889
+ icu_ustr_aref_m(argc, argv, str)
1890
+ int argc;
1891
+ VALUE *argv;
1892
+ VALUE str;
1893
+ {
1894
+ if (argc == 2) {
1895
+ if (CLASS_OF(argv[0]) == rb_cURegexp) {
1896
+ return icu_ustr_subpat(str, argv[0], NUM2INT(argv[1]));
1897
+ }
1898
+ return icu_ustr_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
1899
+ }
1900
+ if (argc != 1) {
1901
+ rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)",
1902
+ argc);
1903
+ }
1904
+ return icu_ustr_aref(str, argv[0]);
1905
+ }
1906
+
1907
+ /**
1908
+ * call-seq:
1909
+ * str.sub!(pattern, replacement) => str or nil
1910
+ * str.sub!(pattern) {|match| block } => str or nil
1911
+ *
1912
+ * Performs the substitutions of <code>UString#sub</code> in place,
1913
+ * returning <i>str</i>, or <code>nil</code> if no substitutions were
1914
+ * performed.
1915
+ */
1916
+
1917
+ VALUE
1918
+ icu_ustr_sub_bang(argc, argv, str)
1919
+ int argc;
1920
+ VALUE *argv;
1921
+ VALUE str;
1922
+ {
1923
+ return ustr_gsub(argc, argv, str, 1, 1 );
1924
+ }
1925
+
1926
+
1927
+ /**
1928
+ * call-seq:
1929
+ * str.sub(pattern, replacement) => new_str
1930
+ * str.sub(pattern) {|match| block } => new_str
1931
+ *
1932
+ * Returns a copy of <i>str</i> with the <em>first</em> occurrence of
1933
+ * <i>pattern</i> replaced with either <i>replacement</i> or the value of the
1934
+ * block. The <i>pattern</i> will typically be a <code>URegexp</code>; if it is
1935
+ * a <code>UString</code> then no regular expression metacharacters will be
1936
+ * interpreted (that is <code>/\d/.U</code> will match a digit, but
1937
+ * <code>'\d'</code> will match a backslash followed by a 'd').
1938
+ *
1939
+ * The sequences <code>$1</code>, <code>$2</code>, etc., may be used.
1940
+ *
1941
+ * In the block form, the current UMatch object is passed in as a parameter.
1942
+ * The value returned by the block will be substituted for the match on each call.
1943
+ *
1944
+ * "hello".u.sub(/[aeiou]/.U, '*'.u) #=> "h*llo"
1945
+ * "hello".u.sub(/([aeiou])/.U, '<$1>'.u) #=> "h<e>llo"
1946
+ */
1947
+
1948
+ VALUE
1949
+ icu_ustr_sub(argc, argv, str)
1950
+ int argc;
1951
+ VALUE *argv;
1952
+ VALUE str;
1953
+ {
1954
+ str = icu_ustr_dup(str);
1955
+ icu_ustr_sub_bang(argc, argv, str);
1956
+ return str;
1957
+ }
1958
+
1959
+ /**
1960
+ * replace in string from +beg+ length +len+ (in code units)
1961
+ */
1962
+ static void
1963
+ icu_ustr_splice(str, beg, len, val)
1964
+ VALUE str;
1965
+ long beg,
1966
+ len;
1967
+ VALUE val;
1968
+ {
1969
+ long char_len;
1970
+ Check_Class(val, rb_cUString);
1971
+ if (val == str) {
1972
+ val = icu_ustr_dup(str);
1973
+ }
1974
+ if (len < 0)
1975
+ rb_raise(rb_eIndexError, "negative length %ld", len);
1976
+ char_len = ICU_LEN(str);
1977
+
1978
+ if (char_len < beg) {
1979
+ out_of_range:
1980
+ rb_raise(rb_eIndexError, "index %ld out of string", beg);
1981
+ }
1982
+ if (beg < 0) {
1983
+ if (-beg > char_len) {
1984
+ goto out_of_range;
1985
+ }
1986
+ beg += char_len;
1987
+ }
1988
+ if (char_len < beg + len) {
1989
+ len = char_len - beg;
1990
+ }
1991
+ /* adjust to codepoint boundaries */
1992
+ U16_SET_CP_START(ICU_PTR(str), 0, beg);
1993
+ U16_SET_CP_LIMIT(ICU_PTR(str), 0, len, ICU_LEN(str));
1994
+
1995
+ ustr_splice_units(USTRING(str), beg, len, ICU_PTR(val), ICU_LEN(val));
1996
+ OBJ_INFECT(str, val);
1997
+ }
1998
+
1999
+
2000
+ /**
2001
+ * call-seq:
2002
+ * str.insert(index, other_str) => str
2003
+ *
2004
+ * Inserts <i>other_str</i> before the character at the given
2005
+ * <i>index</i>, modifying <i>str</i>. Negative indices count from the
2006
+ * end of the string, and insert <em>after</em> the given character.
2007
+ * The intent is insert <i>other_str</i> so that it starts at the given
2008
+ * <i>index</i>.
2009
+ *
2010
+ * "abcd".u.insert(0, 'X'.u) #=> "Xabcd"
2011
+ * "abcd".u.insert(3, 'X'.u) #=> "abcXd"
2012
+ * "abcd".u.insert(4, 'X'.u) #=> "abcdX"
2013
+ * "abcd".u.insert(-3, 'X'.u) #=> "abXcd"
2014
+ * "abcd".u.insert(-1, 'X'.u) #=> "abcdX"
2015
+ */
2016
+
2017
+ VALUE
2018
+ icu_ustr_insert(str, idx, str2)
2019
+ VALUE str,
2020
+ idx,
2021
+ str2;
2022
+ {
2023
+ long pos = NUM2LONG(idx);
2024
+ icu_check_frozen(str);
2025
+
2026
+ if (pos == -1) {
2027
+ pos = NUM2LONG(icu_ustr_length(str));
2028
+ } else if (pos < 0) {
2029
+ pos++;
2030
+ }
2031
+
2032
+ icu_ustr_splice(str, pos, 0, str2);
2033
+ return str;
2034
+ }
2035
+
2036
+ /**
2037
+ * call-seq:
2038
+ * str.include? other_str => true or false
2039
+ *
2040
+ * Returns <code>true</code> if <i>str</i> contains the given string
2041
+ *
2042
+ * "hello".u.include? "lo".u #=> true
2043
+ * "hello".u.include? "ol".u #=> false
2044
+ */
2045
+
2046
+ VALUE
2047
+ icu_ustr_include(str, arg)
2048
+ VALUE str,
2049
+ arg;
2050
+ {
2051
+ long i;
2052
+ i = icu_ustr_index(str, arg, 0);
2053
+ if (i == -1)
2054
+ return Qfalse;
2055
+ return Qtrue;
2056
+ }
2057
+
2058
+ static void
2059
+ icu_ustr_subpat_set(str, re, nth, val)
2060
+ VALUE str,
2061
+ re;
2062
+ int nth;
2063
+ VALUE val;
2064
+ {
2065
+ long start,
2066
+ end,
2067
+ len;
2068
+ VALUE matched;
2069
+
2070
+ if (icu_reg_search(re, str, 0, 0) < 0) {
2071
+ rb_raise(rb_eIndexError, "regexp not matched");
2072
+ }
2073
+ matched = icu_reg_range(re, nth, &start, &end);
2074
+ if (NIL_P(matched)) {
2075
+ rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
2076
+ }
2077
+ len = end - start;
2078
+ /* adjust to codepoint boundaries */
2079
+ U16_SET_CP_START(ICU_PTR(str), 0, start);
2080
+ U16_SET_CP_LIMIT(ICU_PTR(str), 0, len, ICU_LEN(str));
2081
+
2082
+ ustr_splice_units(USTRING(str), start, len, ICU_PTR(val), ICU_LEN(val));
2083
+ }
2084
+
2085
+ VALUE
2086
+ icu_ustr_aset(str, indx, val)
2087
+ VALUE str;
2088
+ VALUE indx,
2089
+ val;
2090
+ {
2091
+ long idx,
2092
+ beg;
2093
+ long char_len = ICU_LEN(str);
2094
+
2095
+ switch (TYPE(indx)) {
2096
+ case T_FIXNUM:
2097
+ num_index:
2098
+ idx = FIX2LONG(indx);
2099
+ if (char_len <= idx) {
2100
+ out_of_range:
2101
+ rb_raise(rb_eIndexError, "index %ld out of string", idx);
2102
+ }
2103
+ if (idx < 0) {
2104
+ if (-idx > char_len)
2105
+ goto out_of_range;
2106
+ idx += char_len;
2107
+ }
2108
+ icu_ustr_splice(str, idx, 1, val);
2109
+ return val;
2110
+
2111
+ case T_DATA:
2112
+ if (CLASS_OF(indx) == rb_cURegexp) {
2113
+ icu_ustr_subpat_set(str, indx, 0, val);
2114
+ return val;
2115
+ }
2116
+ if (CLASS_OF(indx) == rb_cUString) {
2117
+ beg = icu_ustr_index(str, indx, 0);
2118
+ if (beg < 0) {
2119
+ rb_raise(rb_eIndexError, "string not matched");
2120
+ }
2121
+ ustr_splice_units(USTRING(str), beg, ICU_LEN(indx), ICU_PTR(val), ICU_LEN(val));
2122
+ return val;
2123
+ }
2124
+ default:
2125
+ /*
2126
+ * check if indx is Range
2127
+ */
2128
+ {
2129
+ long beg,
2130
+ len;
2131
+ if (rb_range_beg_len(indx, &beg, &len, char_len, 2)) {
2132
+ icu_ustr_splice(str, beg, len, val);
2133
+ return val;
2134
+ }
2135
+ }
2136
+ idx = NUM2LONG(indx);
2137
+ goto num_index;
2138
+ }
2139
+ }
2140
+
2141
+
2142
+ /**
2143
+ * call-seq:
2144
+ * str[fixnum] = new_str
2145
+ * str[fixnum, fixnum] = new_str
2146
+ * str[range] = new_str
2147
+ * str[regexp] = new_str
2148
+ * str[regexp, fixnum] = new_str
2149
+ * str[other_str] = new_str
2150
+ *
2151
+ * Element Assignment---Replaces some or all of the content of <i>str</i>. The
2152
+ * portion of the string affected is determined using the same criteria as
2153
+ * <code>UString#[]</code>. If the replacement string is not the same length as
2154
+ * the text it is replacing, the string will be adjusted accordingly. If the
2155
+ * regular expression or string is used as the index doesn't match a position
2156
+ * in the string, <code>IndexError</code> is raised. If the regular expression
2157
+ * form is used, the optional second <code>Fixnum</code> allows you to specify
2158
+ * which portion of the match to replace (effectively using the
2159
+ * <code>UMatch</code> indexing rules. The forms that take a
2160
+ * <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
2161
+ * out of range; the <code>Range</code> form will raise a
2162
+ * <code>RangeError</code>, and the <code>URegexp</code> and <code>UString</code>
2163
+ * forms will silently ignore the assignment.
2164
+ */
2165
+
2166
+ VALUE
2167
+ icu_ustr_aset_m(argc, argv, str)
2168
+ int argc;
2169
+ VALUE *argv;
2170
+ VALUE str;
2171
+ {
2172
+ icu_check_frozen(str);
2173
+ if (argc == 3) {
2174
+ if (CLASS_OF(argv[0]) == rb_cURegexp) {
2175
+ icu_ustr_subpat_set(str, argv[0], NUM2INT(argv[1]), argv[2]);
2176
+ } else {
2177
+ icu_ustr_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]),
2178
+ argv[2]);
2179
+ }
2180
+ return argv[2];
2181
+ }
2182
+ if (argc != 2) {
2183
+ rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)",
2184
+ argc);
2185
+ }
2186
+ return icu_ustr_aset(str, argv[0], argv[1]);
2187
+ }
2188
+
2189
+ /**
2190
+ * call-seq:
2191
+ * str.slice!(fixnum) => new_str or nil
2192
+ * str.slice!(fixnum, fixnum) => new_str or nil
2193
+ * str.slice!(range) => new_str or nil
2194
+ * str.slice!(regexp) => new_str or nil
2195
+ * str.slice!(other_str) => new_str or nil
2196
+ *
2197
+ * Deletes the specified portion from <i>str</i>, and returns the portion
2198
+ * deleted. The forms that take a <code>Fixnum</code> will raise an
2199
+ * <code>IndexError</code> if the value is out of range; the <code>Range</code>
2200
+ * form will raise a <code>RangeError</code>, and the <code>URegexp</code> and
2201
+ * <code>UString</code> forms will silently ignore the assignment.
2202
+ *
2203
+ * string = "this is a string".u
2204
+ * string.slice!(2) #=> 105
2205
+ * string.slice!(3..6) #=> " is "
2206
+ * string.slice!(/s.*t/.U) #=> "sa st"
2207
+ * string.slice!("r".u) #=> "r"
2208
+ * string #=> "thing"
2209
+ */
2210
+
2211
+ VALUE
2212
+ icu_ustr_slice_bang(argc, argv, str)
2213
+ int argc;
2214
+ VALUE *argv;
2215
+ VALUE str;
2216
+ {
2217
+ VALUE result;
2218
+ VALUE buf[3];
2219
+ int i;
2220
+ icu_check_frozen(str);
2221
+ if (argc < 1 || 2 < argc) {
2222
+ rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)",
2223
+ argc);
2224
+ }
2225
+ for (i = 0; i < argc; i++) {
2226
+ buf[i] = argv[i];
2227
+ }
2228
+ buf[i] = icu_ustr_new(0, 0);
2229
+ result = icu_ustr_aref_m(argc, buf, str);
2230
+ if (!NIL_P(result)) {
2231
+ icu_ustr_aset_m(argc + 1, buf, str);
2232
+ }
2233
+ return result;
2234
+ }
2235
+
2236
+ VALUE
2237
+ ustr_gsub(argc, argv, str, bang, once)
2238
+ int argc;
2239
+ VALUE *argv;
2240
+ VALUE str;
2241
+ int bang;
2242
+ int once;
2243
+ {
2244
+ VALUE pat,
2245
+ repl;
2246
+ long beg,
2247
+ end,
2248
+ prev_end;
2249
+ int tainted = 0,
2250
+ iter = 0;
2251
+ VALUE buf, curr_repl, umatch, block_res;
2252
+ if (argc == 1 && rb_block_given_p()) {
2253
+ iter = 1;
2254
+ } else if (argc == 2) {
2255
+ repl = argv[1];
2256
+ Check_Class(repl, rb_cUString);
2257
+ if (OBJ_TAINTED(repl))
2258
+ tainted = 1;
2259
+ } else {
2260
+ rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)",
2261
+ argc);
2262
+ }
2263
+
2264
+ pat = get_pat(argv[0], 1);
2265
+ beg = icu_reg_search(pat, str, 0, 0);
2266
+
2267
+ if (beg < 0) {
2268
+ /* no match */
2269
+ if (bang)
2270
+ return Qnil;
2271
+ return icu_ustr_dup(str);
2272
+ }
2273
+ end = 0;
2274
+ icu_check_frozen(str);
2275
+ USTRING(str)->busy = 1;
2276
+ buf = icu_ustr_new(0, 0);
2277
+ pat = icu_reg_clone(pat);
2278
+ if(rb_block_given_p()) iter = 1;
2279
+ do {
2280
+
2281
+ prev_end = end;
2282
+ icu_reg_range(pat, 0, &beg, &end);
2283
+ icu_ustr_concat(buf, icu_reg_get_prematch(pat, prev_end));
2284
+ if ( iter ) {
2285
+ UChar * ptr = ICU_PTR(str);
2286
+ long o_len = ICU_LEN(str);
2287
+ umatch = icu_umatch_new(pat);
2288
+ block_res = rb_yield(umatch);
2289
+ if (CLASS_OF(block_res) == rb_cUString)
2290
+ curr_repl = block_res;
2291
+ else if (CLASS_OF(block_res) == rb_cUMatch)
2292
+ curr_repl = icu_umatch_aref(block_res, INT2FIX(0));
2293
+ else
2294
+ curr_repl =
2295
+ icu_from_rstr(0, NULL, rb_obj_as_string(block_res));
2296
+ ustr_mod_check(str, ptr, o_len);
2297
+ } else {
2298
+ curr_repl = icu_reg_get_replacement(pat, repl, prev_end);
2299
+ }
2300
+ icu_ustr_concat(buf, curr_repl);
2301
+ }
2302
+ while (icu_reg_find_next(pat) && !once);
2303
+ icu_ustr_concat(buf, icu_reg_get_tail(pat, end));
2304
+ USTRING(str)->busy = 0;
2305
+ if (bang) {
2306
+ icu_ustr_replace(str, buf);
2307
+ return str;
2308
+ } else {
2309
+ return buf;
2310
+ }
2311
+ }
2312
+
2313
+ /**
2314
+ * call-seq:
2315
+ * str.gsub!(pattern, replacement) => str or nil
2316
+ * str.gsub!(pattern) {|match| block } => str or nil
2317
+ *
2318
+ * Performs the substitutions of <code>UString#gsub</code> in place, returning
2319
+ * <i>str</i>, or <code>nil</code> if no substitutions were performed.
2320
+ */
2321
+
2322
+ VALUE
2323
+ icu_ustr_gsub_bang(argc, argv, str)
2324
+ int argc;
2325
+ VALUE *argv;
2326
+ VALUE str;
2327
+ {
2328
+ icu_check_frozen(str);
2329
+ return ustr_gsub(argc, argv, str, 1, 0);
2330
+ }
2331
+
2332
+
2333
+ /**
2334
+ * call-seq:
2335
+ * str.gsub(pattern, replacement) => new_str
2336
+ * str.gsub(pattern) {|match| block } => new_str
2337
+ *
2338
+ * Returns a copy of <i>str</i> with <em>all</em> occurrences of <i>pattern</i>
2339
+ * replaced with either <i>replacement</i> or the value of the block. The
2340
+ * <i>pattern</i> will typically be a <code>URegexp</code>; if it is a
2341
+ * <code>UString</code> then no regular expression metacharacters will be
2342
+ * interpreted (that is <code>/\d/</code> will match a digit, but
2343
+ * <code>'\d'</code> will match a backslash followed by a 'd').
2344
+ *
2345
+ * If a string is used as the replacement, the sequences <code>$1</code>, <code>$2</code>, and so on
2346
+ * may be used to interpolate successive groups in the match.
2347
+ *
2348
+ * In the block form, the current UMatch object is passed in as a parameter. The value
2349
+ * returned by the block will be substituted for the match on each call.
2350
+ *
2351
+ * "hello".gsub(/[aeiou]/.U, '*') #=> "h*ll*"
2352
+ * "hello".gsub(/([aeiou])/.U, '<$1>') #=> "h<e>ll<o>"
2353
+ */
2354
+
2355
+ VALUE
2356
+ icu_ustr_gsub(argc, argv, str)
2357
+ int argc;
2358
+ VALUE *argv;
2359
+ VALUE str;
2360
+ {
2361
+ return ustr_gsub(argc, argv, str, 0, 0);
2362
+ }
2363
+
2364
+
2365
+ /*-------------*/
2366
+ /* parsing */
2367
+ extern VALUE icu_date_parse(UChar * str, int32_t str_len, char * locale, UChar * val, int32_t len);
2368
+
2369
+ /**
2370
+ * call-seq:
2371
+ * str.parse_date( locale, value)
2372
+ *
2373
+ * Parses given value, using +str+ as format pattern with respect to +locale+.
2374
+ *
2375
+ * "HH:mm:ss E dd/MM/yyyy".u.parse_date("en", "20:15:01 Fri 13/01/2006".u)) # => Time.local(2006,"jan",13,20,15,1)
2376
+ *
2377
+ */
2378
+
2379
+ VALUE
2380
+ icu_ustr_parse_date( str, locale, val)
2381
+ VALUE str, locale, val;
2382
+ {
2383
+ Check_Type(locale, T_STRING);
2384
+ Check_Class(val, rb_cUString);
2385
+ return icu_date_parse(ICU_PTR(str), ICU_LEN(str), RSTRING(locale)->ptr, ICU_PTR(val), ICU_LEN(val));
2386
+ }
2387
+
2388
+ /**
2389
+ * call-seq:
2390
+ * str.to_f( locale = "",[format_pattern]) => aFloat
2391
+ *
2392
+ * Parses string as double value, with respect to +locale+ and format pattern,
2393
+ * if they are provided.
2394
+ *
2395
+ * "456".u.to_f # => 456.0
2396
+ * "123,001".u.to_f("ru") # => 123.001
2397
+ * "123,001".u.to_f("en") # => 123001.0
2398
+ * "Got 123,001".u.to_f("en", "Got ###,###".u) # => 123001
2399
+ */
2400
+
2401
+ VALUE
2402
+ icu_ustr_parse_double( int argc, VALUE * argv, VALUE str)
2403
+ {
2404
+ UParseError error;
2405
+ UErrorCode status = U_ZERO_ERROR;
2406
+ UNumberFormat * format = NULL;
2407
+ VALUE loc, pattern;
2408
+ char * locale;
2409
+ double value;
2410
+ int32_t pos, n;
2411
+
2412
+ n = rb_scan_args(argc, argv, "02", &loc, &pattern) ;
2413
+ if( n == 2) {
2414
+ Check_Class(pattern, rb_cUString);
2415
+ } else pattern = Qnil;
2416
+
2417
+ if (n > 0) {
2418
+ Check_Type(loc, T_STRING);
2419
+ locale = RSTRING(loc)->ptr;
2420
+ } else locale = NULL;
2421
+
2422
+ if( pattern != Qnil ) {
2423
+ format = unum_open(UNUM_PATTERN_DECIMAL, ICU_PTR(pattern), ICU_LEN(pattern), locale,
2424
+ &error, &status);
2425
+ } else {
2426
+ format = unum_open(UNUM_DECIMAL, NULL, 0, locale,&error, &status);
2427
+ }
2428
+ if (U_FAILURE(status) ) rb_raise(rb_eArgError, "can't open format %s", u_errorName(status));
2429
+ pos = 0;
2430
+ value = unum_parseDouble(format, ICU_PTR(str), ICU_LEN(str), &pos, &status);
2431
+ unum_close(format);
2432
+ if (U_FAILURE(status) ) rb_raise(rb_eArgError, "can't parse %s at %d", u_errorName(status), pos);
2433
+ return rb_float_new(value);
2434
+ }
2435
+
2436
+ /**
2437
+ * call-seq:
2438
+ * UString::strcoll(str1, str2 ) => Fixnum
2439
+ * UString::strcoll(str1, str2 , locale) => Fixnum
2440
+ * UString::strcoll(str1, str2 , locale, strength) => Fixnum
2441
+ *
2442
+ * Performs locale-sensitive string comparison.
2443
+ * Special values for locales can be passed in - if +nil+ is passed for the locale,
2444
+ * the default locale collation rules will be used. If empty string ("") or "root" are
2445
+ * passed, UCA rules will be used.
2446
+ *
2447
+ * Strength must be a fixnum that set collation strength:
2448
+ * -1 is default, 0 - primary, 1 - secondary, 2 - ternary.
2449
+ * E.g., pass 0 to ignore case and accents, 1 - to ignore case only.
2450
+ **/
2451
+ VALUE
2452
+ icu_ustr_coll(argc, argv, self)
2453
+ int argc;
2454
+ VALUE *argv;
2455
+ VALUE self;
2456
+ {
2457
+ UErrorCode status = 0 ;
2458
+ UCollator * collator = 0;
2459
+ int result;
2460
+ VALUE ret = Qnil;
2461
+ VALUE str1, str2, loc, strength = Qnil;
2462
+ char * locale = NULL;
2463
+ int n ;
2464
+ n = rb_scan_args(argc, argv, "22", &str1, &str2, &loc, &strength);
2465
+ if ( n == 3) {
2466
+ if( loc != Qnil) {
2467
+ Check_Type(loc, T_STRING);
2468
+ locale = RSTRING(loc)->ptr;
2469
+ }
2470
+ }
2471
+ Check_Class(str1, rb_cUString);
2472
+ Check_Class(str2, rb_cUString);
2473
+ collator = ucol_open(locale, &status);
2474
+ if( U_FAILURE(status) )
2475
+ {
2476
+ rb_raise(rb_eArgError, u_errorName(status));
2477
+ }
2478
+ if( n == 4 ){
2479
+ Check_Type(strength, T_FIXNUM);
2480
+ ucol_setStrength(collator, NUM2INT(strength));
2481
+ }
2482
+ result = ucol_strcoll(collator, ICU_PTR(str1), ICU_LEN(str1), ICU_PTR(str2), ICU_LEN(str2));
2483
+
2484
+ switch(result){
2485
+ case UCOL_EQUAL: ret = INT2FIX(0);break;
2486
+ case UCOL_GREATER: ret = INT2FIX(1);break;
2487
+ case UCOL_LESS: ret = INT2FIX(-1);break;
2488
+ }
2489
+ ucol_close(collator);
2490
+ return ret;
2491
+ }
2492
+
2493
+ /**
2494
+ * call-seq:
2495
+ * UString::list_coll => anArray
2496
+ *
2497
+ * Returns array of available collator locales, to be used in UString#strcoll
2498
+ * */
2499
+ VALUE icu_ustr_list_coll(str)
2500
+ VALUE str;
2501
+ {
2502
+ int32_t i, n =ucol_countAvailable();
2503
+ VALUE ret = rb_ary_new();
2504
+ for( i = 0; i<n; i++) {
2505
+ rb_ary_push(ret, rb_str_new2(ucol_getAvailable(i)));
2506
+ }
2507
+ return ret;
2508
+ }
2509
+
2510
+ /**
2511
+ * call-seq:
2512
+ * UString::list_locales => anArray
2513
+ *
2514
+ * Returns array of available locales.
2515
+ * */
2516
+ VALUE icu_ustr_list_locales(str)
2517
+ VALUE str;
2518
+ {
2519
+ int32_t i, n =uloc_countAvailable();
2520
+ VALUE ret = rb_ary_new();
2521
+ for( i = 0; i<n; i++) {
2522
+ rb_ary_push(ret, rb_str_new2(uloc_getAvailable(i)));
2523
+ }
2524
+ return ret;
2525
+ }
2526
+ /**
2527
+ * call-seq:
2528
+ * UString::list_translits => anArray
2529
+ *
2530
+ * Returns array of available translits.
2531
+ * */
2532
+ VALUE icu_ustr_list_translits(str)
2533
+ VALUE str;
2534
+ {
2535
+ UErrorCode status = U_ZERO_ERROR;
2536
+ UEnumeration * ids ;
2537
+ VALUE ret ;
2538
+ UChar * name;
2539
+ int32_t len;
2540
+ ids = utrans_openIDs (&status);
2541
+ ICU_RAISE(status);
2542
+ ret = rb_ary_new();
2543
+ while( (name = (UChar*)uenum_unext(ids, &len, &status))) {
2544
+ rb_ary_push(ret, icu_ustr_new(name, len));
2545
+ }
2546
+ uenum_close(ids);
2547
+ return ret;
2548
+
2549
+ }
2550
+ /**
2551
+ * call-seq:
2552
+ * str.search(pattern, options = {})
2553
+ *
2554
+ * Searches for match in string. Returns array of +Range+
2555
+ * corresponding to position where pattern is matched.
2556
+ *
2557
+ * Valid options are:
2558
+ * :locale -- locale, +String+, value e.g. "en", "ru_RU"
2559
+ * :ignore_case -- whether to ignore case, valid values are +true+ or +false+, default to +false+
2560
+ * :ignore_case_accents -- sets collator options to strength +0+ - primary difference, e.g. ignore case and accents,
2561
+ * overrides :ignore_case: option, default to +false+,
2562
+ * :loosely -- same as :ignore_case_accents
2563
+ * :limit -- Fixnum limit of match positions to return.
2564
+ * :whole_words -- whether to match whole words only
2565
+ * :canonical -- use canonical equivalence
2566
+ *
2567
+ *
2568
+ * a = "A quick brown fox jumped over the lazy fox dancing foxtrote".u
2569
+ * a.search("fox".u) # => [14..16, 39..41, 51..53]
2570
+ * a.search("FoX".u) # => []
2571
+ * a.search("FoX".u, :ignore_case => true) # => [14..16, 39..41, 51..53]
2572
+ * a.search("FoX".u, :ignore_case => true, :whole_words => true) # => [14..16, 39..41]
2573
+ * a.search("FoX".u, :ignore_case => true, :whole_words => true, :limit => 1) # => [14..16]
2574
+ *
2575
+ * b = "Iñtërnâtiônàlizætiøn îs cọmpłèx".u.upcase # => IÑTËRNÂTIÔNÀLIZÆTIØN ÎS CỌMPŁÈX
2576
+ * b.search("nâtiôn".u, :locale => "en") # => []
2577
+ * b.search("nation".u) # => []
2578
+ * b.search("nation".u, :locale => "en", :ignore_case_accents => true) # => [5..10]
2579
+ * b.search("nâtiôn".u, :locale => "en", :ignore_case => true) # => [5..10]
2580
+ * b.search("zaeti".u, :locale => "en" ) # => []
2581
+ * b.search("zaeti".u, :locale => "en", :ignore_case => true) # => []
2582
+ * b.search("zaeti".u, :locale => "en", :ignore_case_accents => true) # => [14..17]
2583
+ *
2584
+ * v = [?a, 0x0325, 0x0300].to_u # => ḁ̀
2585
+ * v.search([?a, 0x300].to_u, :canonical => true) # => [0..2]
2586
+ * v.search([?a, 0x300].to_u) # => []
2587
+ **/
2588
+
2589
+ VALUE icu_ustr_search(argc, argv, str)
2590
+ int argc;
2591
+ VALUE *argv;
2592
+ VALUE str;
2593
+
2594
+ {
2595
+ UErrorCode status = U_ZERO_ERROR;
2596
+ UStringSearch * search = 0 ;
2597
+ VALUE pat, locale , limit, options;
2598
+ int lim = -1, count = 0 ;
2599
+ int32_t start, len;
2600
+ VALUE ret = rb_ary_new();
2601
+ UCollator * collator = 0;
2602
+ UBreakIterator * brkit = 0;
2603
+ char * loc = 0;
2604
+ if ( rb_scan_args(argc, argv, "11", &pat, &options) == 2 ) {
2605
+ Check_Type(options, T_HASH);
2606
+ } else {
2607
+ options = Qnil;
2608
+ }
2609
+
2610
+ Check_Class(pat, rb_cUString);
2611
+ locale = options == Qnil ? Qnil : rb_hash_aref(options, ID2SYM(rb_intern("locale")));
2612
+
2613
+ if( locale != Qnil ) {
2614
+ Check_Type(locale, T_STRING);
2615
+ loc = RSTRING(locale) -> ptr;
2616
+ }
2617
+ limit = options == Qnil ? Qnil : rb_hash_aref(options, ID2SYM(rb_intern("limit")));
2618
+
2619
+ if(TYPE(limit) == T_FIXNUM) {
2620
+ lim = FIX2INT(limit);
2621
+ if(lim <= 0) {
2622
+ rb_raise(rb_eTypeError, "Limit must be positive or nil, got: %d", lim);
2623
+ }
2624
+ }
2625
+ else
2626
+ if (limit!=Qnil)
2627
+ rb_raise(rb_eArgError, "Limit must be Fixnum, got %s", rb_class2name(CLASS_OF(limit)));
2628
+
2629
+ collator = ucol_open(loc, &status);
2630
+ ucol_setStrength(collator, -1);
2631
+
2632
+ if( options != Qnil && Qtrue == rb_hash_aref( options, ID2SYM(rb_intern("whole_words"))) )
2633
+ brkit = ubrk_open(UBRK_WORD, loc, ICU_PTR(str), ICU_LEN(str), &status);
2634
+
2635
+ if( options != Qnil && Qtrue == rb_hash_aref( options, ID2SYM(rb_intern("ignore_case"))) )
2636
+ ucol_setStrength(collator, UCOL_SECONDARY);
2637
+
2638
+ if( options != Qnil &&
2639
+ ( Qtrue == rb_hash_aref( options, ID2SYM(rb_intern("ignore_case_accents")) )
2640
+ || Qtrue == rb_hash_aref( options, ID2SYM(rb_intern("loosely")) )
2641
+ )
2642
+ )
2643
+ ucol_setStrength(collator, UCOL_PRIMARY );
2644
+
2645
+
2646
+ search = usearch_openFromCollator(ICU_PTR(pat), ICU_LEN(pat),
2647
+ ICU_PTR(str), ICU_LEN(str),
2648
+ collator, brkit, &status);
2649
+
2650
+ if( options != Qnil && Qtrue == rb_hash_aref( options, ID2SYM(rb_intern("canonical"))) )
2651
+ usearch_setAttribute(search, USEARCH_CANONICAL_MATCH, USEARCH_ON, &status);
2652
+
2653
+ if( U_FAILURE(status) ) goto failure;
2654
+
2655
+ status = U_ZERO_ERROR;
2656
+ if( usearch_first(search, &status) == USEARCH_DONE) {
2657
+ usearch_close(search);
2658
+ ucol_close(collator);
2659
+ ubrk_close(brkit);
2660
+ return ret;
2661
+ }
2662
+
2663
+ do {
2664
+ if( U_FAILURE(status) ) goto failure;
2665
+
2666
+ start = usearch_getMatchedStart(search);
2667
+ len = usearch_getMatchedLength(search);
2668
+ rb_ary_push(ret, rb_range_new(LONG2NUM(start), LONG2NUM(start+len-1), 0));
2669
+
2670
+ status = U_ZERO_ERROR;
2671
+ count += 1;
2672
+ if (lim > 0 && count >= lim) break;
2673
+ } while (USEARCH_DONE != usearch_next(search, &status));
2674
+ usearch_close( search);
2675
+ ucol_close(collator);
2676
+ ubrk_close(brkit);
2677
+ return ret;
2678
+
2679
+ failure:
2680
+ usearch_close( search);
2681
+ ucol_close(collator);
2682
+ ubrk_close(brkit);
2683
+
2684
+ rb_raise(rb_eArgError, u_errorName(status));
2685
+ return Qnil;
2686
+ }
2687
+ /**
2688
+ * call-seq:
2689
+ * str.conv_unit_range(unit_range) => code_point_range
2690
+ *
2691
+ * Converts <b>code unit</b> range to <b>code point</b> range.
2692
+ * If your chars don't use multiple UTF16 codeunits, range will be the same.
2693
+ */
2694
+ VALUE icu_ustr_convert_unit_range(str, range)
2695
+ VALUE str, range;
2696
+ {
2697
+ long cu_start, cu_len, cur_pos, cp_len ;
2698
+ if( rb_range_beg_len(range, &cu_start, &cu_len, ICU_LEN(str), 0) != Qtrue)
2699
+ return Qnil;
2700
+
2701
+ cur_pos = u_countChar32( ICU_PTR(str), cu_start );
2702
+ if( cu_start+cu_len > ICU_LEN(str)) --cu_len;
2703
+ cp_len = u_countChar32( ICU_PTR(str) + cu_start , cu_len);
2704
+ return rb_range_new(LONG2NUM(cur_pos), LONG2NUM(cur_pos + cp_len-1), 0);
2705
+ }
2706
+ /**
2707
+ * call-seq:
2708
+ * str.conv_point_range(point_range) => code_unit_range
2709
+ *
2710
+ * Converts <b>code point</b> range to <b>code unit</b> range.
2711
+ * (inversion of #conv_unit_range)
2712
+ * If your chars don't use multiple UTF16 codeuints, range will be the same.
2713
+ */
2714
+ VALUE icu_ustr_convert_point_range(str, range)
2715
+ VALUE str, range;
2716
+ {
2717
+ long cp_start, cu_start, cu_end, cp_len, str_cp_len;
2718
+ str_cp_len = u_countChar32( ICU_PTR(str), ICU_LEN(str));
2719
+ if( Qtrue != rb_range_beg_len(range, &cp_start, &cp_len, str_cp_len, 0) ) return Qnil;
2720
+
2721
+ cu_start = 0;
2722
+ U16_FWD_N(ICU_PTR(str), cu_start, ICU_LEN(str), cp_start); /* care sur */
2723
+ cu_end = cu_start;
2724
+ U16_FWD_N(ICU_PTR(str), cu_end, ICU_LEN(str), cp_len); /* care sur */
2725
+
2726
+ return rb_range_new(LONG2NUM(cu_start), LONG2NUM(cu_end-1), 0);
2727
+ }
2728
+ /**
2729
+ * call-seq:
2730
+ * str.unit_count
2731
+ *
2732
+ * returns number of code units in string.
2733
+ *
2734
+ */
2735
+ VALUE icu_ustr_unit_count(VALUE str){
2736
+ return LONG2NUM(ICU_LEN(str));
2737
+ }
2738
+ /**
2739
+ * call-seq:
2740
+ * str.point_count
2741
+ *
2742
+ * returns number of code points in string.
2743
+ *
2744
+ */
2745
+ VALUE icu_ustr_point_count(VALUE str){
2746
+ return LONG2NUM(u_countChar32(ICU_PTR(str), ICU_LEN(str)));
2747
+ }
2748
+
2749
+ UChar icu_uchar_at(int32_t offset, void * context)
2750
+ {
2751
+ return ((UChar*)context)[offset];
2752
+ }
2753
+ /**
2754
+ * call-seq:
2755
+ * str.unescape => new_str
2756
+ *
2757
+ * Unescape a string of characters.
2758
+ *
2759
+ * The following escape sequences are recognized:
2760
+ * \uhhhh 4 hex digits; h in [0-9A-Fa-f]
2761
+ * \Uhhhhhhhh 8 hex digits
2762
+ * \xhh 1-2 hex digits \x{h...} 1-8 hex digits
2763
+ * \ooo 1-3 octal digits; o in [0-7]
2764
+ * \cX control-X; X is masked with 0x1F
2765
+ *
2766
+ * as well as the standard ANSI C escapes:
2767
+ * \a => U+0007, \b => U+0008, \t => U+0009, \n => U+000A, \v => U+000B, \f => U+000C, \r => U+000D, \e => U+001B, \" => U+0022, \' => U+0027, \? => U+003F, \\ => U+005C
2768
+ *
2769
+ * If escape sequence is invalid, it is ignored.
2770
+ *
2771
+ * "\\u044D\\u043A\\u0440\\u0430\\u043D\\u0438\\u0440\\u043E\\u0432\\u0430\\u043D\\u0438\\u0435".u.unescape => "экранирование"
2772
+ *
2773
+ **/
2774
+
2775
+ VALUE icu_ustr_unescape(str)
2776
+ VALUE str;
2777
+ {
2778
+ UChar32 c32;
2779
+ int32_t offset, leng, i, segment_start;
2780
+ UChar * ptr;
2781
+ UChar buf[3];
2782
+ VALUE ret;
2783
+ offset = 0;
2784
+ segment_start = 0;
2785
+ leng = ICU_LEN(str);
2786
+ ptr = ICU_PTR(str);
2787
+ ret = icu_ustr_new(0, 0);
2788
+ while(offset < leng) {
2789
+ if( ptr[offset] == '\\' ) {
2790
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, ptr+segment_start, offset-segment_start);
2791
+ ++offset;
2792
+ c32 = u_unescapeAt(icu_uchar_at, &offset, leng, ICU_PTR(str));
2793
+ // append this char
2794
+ if( 0xFFFFFFFF == c32) continue;
2795
+ i = 0;
2796
+ U16_APPEND_UNSAFE(buf, i, c32);
2797
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, buf, U16_LENGTH(c32));
2798
+ segment_start = offset;
2799
+ } else {
2800
+ ++offset;
2801
+ }
2802
+ }
2803
+ if( segment_start < offset)
2804
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, ptr+segment_start, offset-segment_start);
2805
+
2806
+ return ret;
2807
+ }
2808
+
2809
+
2810
+
2811
+ /* transliteration */
2812
+ extern VALUE icu_transliterate(UChar * str, int32_t str_len, UChar * id, int32_t id_len, UChar * rules, int32_t rule_len);
2813
+ /**
2814
+ * call-seq:
2815
+ * str.translit(id, [rules])
2816
+ *
2817
+ * Performs {transliteration}[http://icu.sourceforge.net/userguide/Transformations.html],
2818
+ * of this string, using given transform +id+ and +rules+
2819
+ *
2820
+ * "yukihiro matsumoto".u.translit("Latin-Hiragana".u) # => ゆきひろ まつもと
2821
+ * "hello".u.translit("null".u, ":: upper();".u) # => HELLO
2822
+ **/
2823
+ VALUE icu_ustr_translit(argc, argv, str)
2824
+ int argc;
2825
+ VALUE * argv ;
2826
+ VALUE str;
2827
+ {
2828
+ VALUE id, rules ;
2829
+ if(rb_scan_args(argc, argv, "11", &id, &rules) == 2) {
2830
+ Check_Class(rules, rb_cUString);
2831
+ } else rules = Qnil;
2832
+
2833
+ Check_Class(str, rb_cUString);
2834
+ Check_Class(id, rb_cUString);
2835
+ if( rules == Qnil) {
2836
+ return icu_transliterate(ICU_PTR(str), ICU_LEN(str), ICU_PTR(id), ICU_LEN(id), NULL, 0);
2837
+ } else {
2838
+ return icu_transliterate(ICU_PTR(str), ICU_LEN(str), ICU_PTR(id), ICU_LEN(id),
2839
+ ICU_PTR(rules), ICU_LEN(rules));
2840
+ }
2841
+ }
2842
+ void
2843
+ initialize_ustring(void)
2844
+ {
2845
+ UErrorCode status = U_ZERO_ERROR;
2846
+ u_init(&status);
2847
+ if( U_FAILURE(status) ){
2848
+ rb_raise(rb_eRuntimeError, "Can't initialize : %s", u_errorName(status));
2849
+ }
2850
+ s_UCA_collator = ucol_open("", &status);
2851
+ if( U_FAILURE(status) ){
2852
+ rb_raise(rb_eRuntimeError, "Can't initialize : %s", u_errorName(status));
2853
+ }
2854
+ s_case_UCA_collator = ucol_open("", &status);
2855
+ if( U_FAILURE(status) ){
2856
+ rb_raise(rb_eRuntimeError, "Can't initialize : %s", u_errorName(status));
2857
+ }
2858
+ ucol_setStrength(s_case_UCA_collator, UCOL_SECONDARY);
2859
+
2860
+ /*
2861
+
2862
+ Document-class: UString
2863
+
2864
+ UString is a string class that stores Unicode characters directly and provides
2865
+ similar functionality as the Ruby String class.
2866
+
2867
+ An UString string consists of 16-bit Unicode code units. A Unicode character
2868
+ may be stored with either one code unit which is the most common case or with a matched
2869
+ pair of special code units ("surrogates").
2870
+
2871
+ For single-character handling, a Unicode character code point is a value in the
2872
+ range 0..0x10ffff.
2873
+
2874
+ Indexes and offsets into and lengths of strings always count code units, not code points.
2875
+ This is the same as with multi-byte char* strings in traditional string handling.
2876
+ Operations on partial strings typically do not test for code point boundaries.
2877
+
2878
+ In order to use the collation, text boundary analysis, formatting and other ICU APIs,
2879
+ Unicode strings must be used. In order to get Unicode strings from your native codepage,
2880
+ you can use the conversion API.
2881
+
2882
+ UString class is also point for access to several ICU services, instead of
2883
+ mirroring ICU class hierarchy.
2884
+
2885
+ ==== Methods by category:
2886
+
2887
+ - concat and modify: + , * , << , #concat , #replace
2888
+
2889
+ - element reference, insert, replace: [] , #slice , []= , #slice! , #insert , #char_span
2890
+
2891
+ - comparisons: <=> , == , #casecmp , #strcoll
2892
+
2893
+ - size and positions: #length , #point_count , #clear , #empty? , #conv_unit_range , #conv_point_range
2894
+
2895
+ - index/search methods: #index , #rindex , #include? , #search
2896
+
2897
+ - regexps, matching and replacing: =~ , #match , #scan , #split , #sub , #sub! , #gsub , #gsub!
2898
+
2899
+ - conversion String/UString: #to_s, Kernel#u, String#to_u
2900
+
2901
+ - iterators: #each_line_break , #each_word , #each_char , #each_sentence
2902
+
2903
+ - split to chars/codepoints: #chars , #codepoints , Array#to_u
2904
+
2905
+ - character case: #upcase , #upcase! , #downcase , #downcase!
2906
+
2907
+ - stripping spaces: #strip , #lstrip , #rstrip , #strip! , #lstrip! , #rstrip!
2908
+
2909
+ - formatting and parsing: #format , #parse_date , #to_f
2910
+
2911
+ - UNICODE normalization: #norm_C , #norm_D , #norm_KC , #norm_KD , #norm_FCD
2912
+
2913
+ - utilities: #unescape , #hash , #inspect , #inspect_names , #translit
2914
+
2915
+ - ICU avalable info: #list_coll , #list_locales , #list_translits
2916
+ */
2917
+ rb_cUString = rb_define_class("UString", rb_cObject);
2918
+ rb_include_module(rb_cUString, rb_mComparable);
2919
+
2920
+ /* initializations */
2921
+ rb_define_alloc_func(rb_cUString, icu_ustr_alloc);
2922
+ rb_define_method(rb_cUString, "initialize", icu_ustr_init, -1);
2923
+ rb_define_method(rb_cUString, "initialize_copy", icu_ustr_replace, 1);
2924
+ rb_define_method(rb_cUString, "replace", icu_ustr_replace, 1);
2925
+
2926
+ /* comparisons */
2927
+ rb_define_method(rb_cUString, "<=>", icu_ustr_cmp_m, 1);
2928
+ rb_define_method(rb_cUString, "==", icu_ustr_equal, 1);
2929
+ rb_define_method(rb_cUString, "casecmp", icu_ustr_casecmp, 1);
2930
+ rb_define_singleton_method(rb_cUString, "strcoll", icu_ustr_coll, -1);
2931
+
2932
+ /* ICU avalable info */
2933
+ rb_define_singleton_method(rb_cUString, "list_coll", icu_ustr_list_coll, 0);
2934
+ rb_define_singleton_method(rb_cUString, "list_locales", icu_ustr_list_locales, 0);
2935
+ rb_define_singleton_method(rb_cUString, "list_translits", icu_ustr_list_translits, 0);
2936
+
2937
+ /* hash code */
2938
+ rb_define_method(rb_cUString, "hash", icu_ustr_hash_m, 0);
2939
+
2940
+ /* inspect */
2941
+ rb_define_method(rb_cUString, "inspect", icu_ustr_inspect, 0);
2942
+ rb_define_method(rb_cUString, "inspect_names", icu_ustr_inspect_names, 0);
2943
+
2944
+ /* size */
2945
+ rb_define_method(rb_cUString, "length", icu_ustr_length, 0);
2946
+ rb_define_alias (rb_cUString, "size", "length");
2947
+ rb_define_method(rb_cUString, "unit_count", icu_ustr_unit_count, 0);
2948
+ rb_define_method(rb_cUString, "point_count", icu_ustr_point_count, 0);
2949
+ rb_define_method(rb_cUString, "clear", icu_ustr_clear, 0);
2950
+ rb_define_method(rb_cUString, "empty?", icu_ustr_empty, 0);
2951
+
2952
+ /* UNICODE normalization */
2953
+ rb_define_method(rb_cUString, "norm_C", icu_ustr_normalize_C, 0);
2954
+ rb_define_method(rb_cUString, "norm_D", icu_ustr_normalize_D, 0);
2955
+ rb_define_method(rb_cUString, "norm_KC", icu_ustr_normalize_KC, 0);
2956
+ rb_define_method(rb_cUString, "norm_KD", icu_ustr_normalize_KD, 0);
2957
+ rb_define_method(rb_cUString, "norm_FCD", icu_ustr_normalize_FCD, 0);
2958
+
2959
+ /* iterators */
2960
+ rb_define_method(rb_cUString, "each_line_break", icu_ustr_each_line, -1);
2961
+ rb_define_method(rb_cUString, "each_word", icu_ustr_each_word, -1);
2962
+ rb_define_method(rb_cUString, "each_char", icu_ustr_each_char, -1);
2963
+ rb_define_method(rb_cUString, "each_sentence", icu_ustr_each_sentence, -1);
2964
+ rb_define_alias(rb_cUString, "each", "each_line_break");
2965
+
2966
+ /* split to chars/codepoints */
2967
+ rb_define_method(rb_cUString, "chars", icu_ustr_chars_m, -1);
2968
+ rb_define_method(rb_cUString, "char_span", icu_ustr_char_span, -1);
2969
+ rb_define_method(rb_cUString, "codepoints", icu_ustr_points, 0);
2970
+
2971
+ /* concat operations */
2972
+ rb_define_method(rb_cUString, "+", icu_ustr_plus, 1);
2973
+ rb_define_method(rb_cUString, "*", icu_ustr_times, 1);
2974
+ rb_define_method(rb_cUString, "concat", icu_ustr_concat, 1);
2975
+ rb_define_alias( rb_cUString, "<<", "concat");
2976
+
2977
+ /* character case */
2978
+ rb_define_method(rb_cUString, "upcase", icu_ustr_upcase, -1);
2979
+ rb_define_method(rb_cUString, "upcase!", icu_ustr_upcase_bang, -1);
2980
+ rb_define_method(rb_cUString, "downcase", icu_ustr_downcase, -1);
2981
+ rb_define_method(rb_cUString, "downcase!", icu_ustr_downcase_bang, -1);
2982
+ rb_define_method(rb_cUString, "foldcase", icu_ustr_foldcase, 0);
2983
+
2984
+ /* stripping spaces */
2985
+ rb_define_method(rb_cUString, "strip", icu_ustr_strip, 0);
2986
+ rb_define_method(rb_cUString, "lstrip", icu_ustr_lstrip, 0);
2987
+ rb_define_method(rb_cUString, "rstrip", icu_ustr_rstrip, 0);
2988
+
2989
+ rb_define_method(rb_cUString, "strip!", icu_ustr_strip_bang, 0);
2990
+ rb_define_method(rb_cUString, "lstrip!", icu_ustr_lstrip_bang, 0);
2991
+ rb_define_method(rb_cUString, "rstrip!", icu_ustr_rstrip_bang, 0);
2992
+
2993
+ /* index/search methods */
2994
+ rb_define_method(rb_cUString, "index", icu_ustr_index_m, -1);
2995
+ rb_define_method(rb_cUString, "rindex", icu_ustr_rindex_m, -1);
2996
+ rb_define_method(rb_cUString, "include?", icu_ustr_include, 1);
2997
+ rb_define_method(rb_cUString, "search", icu_ustr_search, -1);
2998
+
2999
+ /* element reference */
3000
+ rb_define_method(rb_cUString, "[]", icu_ustr_aref_m, -1);
3001
+ rb_define_alias(rb_cUString, "slice", "[]");
3002
+
3003
+ /* codeunit/codepoint conversion */
3004
+ rb_define_method(rb_cUString, "conv_unit_range", icu_ustr_convert_unit_range, 1);
3005
+ rb_define_method(rb_cUString, "conv_point_range", icu_ustr_convert_point_range, 1);
3006
+
3007
+ /* insert/replace */
3008
+ rb_define_method(rb_cUString, "[]=", icu_ustr_aset_m, -1);
3009
+ rb_define_method(rb_cUString, "slice!", icu_ustr_slice_bang, -1);
3010
+ rb_define_method(rb_cUString, "insert", icu_ustr_insert, 2);
3011
+
3012
+ /* conversion to String from UString */
3013
+ rb_define_method(rb_cUString, "to_u", icu_ustr_to_ustr, -1);
3014
+ rb_define_method(rb_cUString, "to_s", icu_ustr_to_rstr, -1);
3015
+ rb_define_alias(rb_cUString, "to_str", "to_s");
3016
+
3017
+ /* formatting messages */
3018
+ rb_define_method(rb_cUString, "format", icu_ustr_format, -2);
3019
+ rb_define_alias( rb_cUString, "fmt", "format");
3020
+
3021
+ /* parsing */
3022
+ rb_define_method(rb_cUString, "parse_date", icu_ustr_parse_date, 2);
3023
+ rb_define_method(rb_cUString, "to_f", icu_ustr_parse_double, -1);
3024
+
3025
+ /* transliteration */
3026
+ rb_define_method(rb_cUString, "translit", icu_ustr_translit, -1);
3027
+
3028
+ /* unescaping */
3029
+ rb_define_method(rb_cUString, "unescape", icu_ustr_unescape, 0);
3030
+
3031
+ /* regexp matching and replacing */
3032
+ rb_define_method(rb_cUString, "=~", icu_ustr_match, 1);
3033
+ rb_define_method(rb_cUString, "match", icu_ustr_match_m, 1);
3034
+ rb_define_method(rb_cUString, "scan", icu_ustr_scan, 1);
3035
+ rb_define_method(rb_cUString, "split", icu_ustr_split_m, -1);
3036
+ rb_define_method(rb_cUString, "sub", icu_ustr_sub, -1);
3037
+ rb_define_method(rb_cUString, "sub!", icu_ustr_sub_bang, -1);
3038
+ rb_define_method(rb_cUString, "gsub", icu_ustr_gsub, -1);
3039
+ rb_define_method(rb_cUString, "gsub!", icu_ustr_gsub_bang, -1);
3040
+
3041
+ }
3042
+