icu4r_19 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,168 @@
1
+ #include "icu_common.h"
2
+ extern VALUE rb_cUString;
3
+ extern VALUE icu_ustr_new_set(const UChar * str, long len, long capa);
4
+
5
+ /**
6
+ * call-seq:
7
+ * ary.to_u => anUString
8
+ *
9
+ * Creates UString from array of fixnums, representing Unicode codepoints.
10
+ * (inversion of UString#codepoints)
11
+ *
12
+ * a = "поддержка".to_u.codepoints # => [1087, 1086, 1076, 1076, 1077, 1088, 1078, 1082, 1072]
13
+ * a.to_u # => "поддержка"
14
+ *
15
+ */
16
+ VALUE icu_ustr_from_array(obj)
17
+ VALUE obj;
18
+ {
19
+ int i, n;
20
+ VALUE *p;
21
+ VALUE ret, temp;
22
+ UChar32 * src , *pos, chr;
23
+ UChar * buf;
24
+ int32_t len, capa;
25
+ UErrorCode status = U_ZERO_ERROR;
26
+
27
+ n = RARRAY_LEN(obj);
28
+ p = RARRAY_PTR(obj);
29
+
30
+ src = ALLOC_N(UChar32, n);
31
+ pos = src;
32
+ for ( i = 0; i < n; i++){
33
+ temp = p[i];
34
+ if(TYPE(temp) != T_FIXNUM) {
35
+ free(src);
36
+ rb_raise(rb_eTypeError, "Can't convert from %s", rb_class2name(CLASS_OF(temp)));
37
+ }
38
+ chr = (UChar32) FIX2INT(temp);
39
+ // invalid codepoints are converted to U+FFFD
40
+ if( ! (U_IS_UNICODE_CHAR(chr)) ) {
41
+ chr = 0xFFFD;
42
+ }
43
+ *pos = chr;
44
+ pos ++;
45
+ }
46
+ capa = n+1;
47
+ buf = ALLOC_N(UChar, capa);
48
+ u_strFromUTF32(buf, capa, &len, src, n, &status);
49
+ if( U_BUFFER_OVERFLOW_ERROR == status ){
50
+ capa = len+1;
51
+ REALLOC_N(buf, UChar, capa);
52
+ status = U_ZERO_ERROR;
53
+ u_strFromUTF32(buf, capa, &len, src, n, &status);
54
+ }
55
+ if (U_FAILURE(status) ) {
56
+ free(src);
57
+ free(buf);
58
+ rb_raise(rb_eRuntimeError, u_errorName(status));
59
+ }
60
+ if( capa <= len ){
61
+ ++capa;
62
+ REALLOC_N(buf, UChar, capa);
63
+ }
64
+ ret = icu_ustr_new_set(buf, len, capa);
65
+ free(src);
66
+ return ret;
67
+ }
68
+
69
+ /**
70
+ * call-seq:
71
+ * str.to_u(encoding = 'utf8') => String
72
+ *
73
+ * Converts String value in given encoding to UString.
74
+ * When no encoding is given, utf8 is assumed. If string is not valid UTF8,
75
+ * and no encoding is given, exception is raised.
76
+ *
77
+ * When explicit encoding is given, converter will replace incorrect codepoints
78
+ * with <U+FFFD> - replacement character.
79
+ */
80
+ VALUE
81
+ icu_from_rstr(argc, argv, str)
82
+ int argc;
83
+ VALUE *argv,
84
+ str;
85
+ {
86
+ VALUE enc;
87
+ char *encoding = 0; /* default */
88
+ UErrorCode error = 0;
89
+ int32_t capa, len;
90
+ VALUE s;
91
+ UChar * buf;
92
+ UConverter * conv;
93
+ if (rb_scan_args(argc, argv, "01", &enc) == 1) {
94
+ Check_Type(enc, T_STRING);
95
+ encoding = RSTRING_PTR(enc);
96
+ }
97
+ capa = RSTRING_LEN(str) + 1;
98
+ buf = ALLOC_N(UChar, capa);
99
+
100
+ if(! encoding || !strncmp(encoding, "utf8", 4) ) {
101
+ /* from UTF8 */
102
+ u_strFromUTF8(buf, capa-1, &len, RSTRING_PTR(str), RSTRING_LEN(str), &error);
103
+ if( U_FAILURE(error)) {
104
+ free(buf);
105
+ rb_raise(rb_eArgError, u_errorName(error));
106
+ }
107
+ s = icu_ustr_new_set(buf, len, capa);
108
+ } else {
109
+ conv = ucnv_open(encoding, &error);
110
+ if (U_FAILURE(error)) {
111
+ ucnv_close(conv);
112
+ rb_raise(rb_eArgError, u_errorName(error));
113
+ }
114
+ len = ucnv_toUChars(conv, buf, capa-1, RSTRING_PTR(str),
115
+ RSTRING_LEN(str), &error);
116
+ if (U_BUFFER_OVERFLOW_ERROR == error) {
117
+ capa = len+1;
118
+ REALLOC_N(buf, UChar, capa);
119
+ error = 0;
120
+ len = ucnv_toUChars(conv, buf, capa-1, RSTRING_PTR(str),
121
+ RSTRING_LEN(str), &error);
122
+ if (U_FAILURE(error)) {
123
+ free(buf);
124
+ rb_raise(rb_eArgError, u_errorName(error));
125
+ }
126
+
127
+ }
128
+ s = icu_ustr_new_set(buf, len, capa);
129
+ ucnv_close(conv);
130
+ }
131
+ return s;
132
+ }
133
+
134
+ /**
135
+ * call-seq:
136
+ * u(str, enc = 'utf8') => UString
137
+ *
138
+ * Global function to convert from String to UString
139
+ */
140
+ VALUE
141
+ icu_f_rb_str(argc, argv, obj)
142
+ int argc;
143
+ VALUE *argv;
144
+ VALUE obj;
145
+ {
146
+ VALUE enc;
147
+ VALUE str;
148
+ if (rb_scan_args(argc, argv, "11", &str, &enc) == 2) {
149
+ Check_Type(enc, T_STRING);
150
+ Check_Type(str, T_STRING);
151
+ return icu_from_rstr(1, &enc, str);
152
+ } else {
153
+ Check_Type(str, T_STRING);
154
+ return icu_from_rstr(0, NULL, str);
155
+ }
156
+
157
+ }
158
+
159
+ void initialize_ucore_ext(void)
160
+ {
161
+ /* conversion from String to UString */
162
+ rb_define_method(rb_cString, "to_u", icu_from_rstr, -1);
163
+ rb_define_alias(rb_cString, "u", "to_u");
164
+ rb_define_global_function("u", icu_f_rb_str, -1);
165
+
166
+ /* conversion from Array to UString */
167
+ rb_define_method(rb_cArray, "to_u", icu_ustr_from_array, 0);
168
+ }
@@ -0,0 +1,697 @@
1
+
2
+ /**
3
+ * Document-class: URegexp
4
+ *
5
+ * See [docs/UNICODE_REGEXPS] for details of patterns.
6
+ *
7
+ *
8
+ * Replacement Text
9
+ *
10
+ * The replacement text for find-and-replace operations may contain references to
11
+ * capture-group text from the find. References are of the form $n, where n is the
12
+ * number of the capture group.
13
+ *
14
+ * Character Descriptions
15
+ * $n The text of capture group n will be substituted for $n. n must be >= 0 and not
16
+ * greater than the number of capture groups. A $ not followed by a digit has no special meaning,
17
+ * and will appear in the substitution text as itself, a $.
18
+ * \ Treat the following character as a literal, suppressing any special meaning. Backslash escaping in
19
+ * substitution text is only required for '$' and '\', but may be used on any other character without bad effects.
20
+ *
21
+ *
22
+ * Valid URegexp options are: COMMENTS, MULTILINE, DOTALL, IGNORECASE, which can be OR'ed.
23
+ */
24
+
25
+ #include "icu_common.h"
26
+ extern VALUE rb_cURegexp;
27
+ extern VALUE rb_cUString;
28
+ extern VALUE rb_cUMatch;
29
+ VALUE icu_umatch_aref(VALUE match, VALUE idx);
30
+ VALUE icu_umatch_new (VALUE re);
31
+ extern VALUE icu_ustr_new(const UChar * ptr, long len);
32
+ extern VALUE icu_ustr_new2(const UChar * ptr);
33
+ extern void ustr_splice_units(ICUString * str, long start, long del_len, const UChar * replacement, long repl_len);
34
+ extern VALUE icu_from_rstr(int, VALUE *, VALUE);
35
+
36
+ /* --------- regular expressions */
37
+ void icu_regex_free( ICURegexp *ptr)
38
+ {
39
+ if (ptr->pattern)
40
+ uregex_close(ptr->pattern);
41
+ ptr->pattern = 0;
42
+ free(ptr);
43
+ }
44
+
45
+ VALUE
46
+ icu_reg_s_alloc(klass)
47
+ VALUE klass;
48
+ {
49
+ ICURegexp *ptr = ALLOC_N(ICURegexp, 1);
50
+ ptr->pattern = 0;
51
+ return Data_Wrap_Struct(klass, 0, icu_regex_free, ptr);
52
+ }
53
+
54
+ void
55
+ icu_reg_initialize(obj, s, len, options)
56
+ VALUE obj;
57
+ const UChar *s;
58
+ long len;
59
+ int options;
60
+ {
61
+ UParseError pe;
62
+ UErrorCode status = 0;
63
+ ICURegexp *re = UREGEX(obj);
64
+
65
+ if (re->pattern)
66
+ uregex_close(re->pattern);
67
+ re->pattern = uregex_open(s, len, options, &pe, &status);
68
+ re->options = options;
69
+
70
+ if (U_FAILURE(status))
71
+ rb_raise(rb_eArgError,
72
+ "Wrong regexp: %s line %d column %d flags %d",
73
+ u_errorName(status), pe.line, pe.offset, options);
74
+
75
+ }
76
+
77
+ const UChar *
78
+ icu_reg_get_pattern(ptr, len)
79
+ ICURegexp *ptr;
80
+ int32_t *len;
81
+ {
82
+ UErrorCode error = U_ZERO_ERROR;
83
+ *len = 0;
84
+ return uregex_pattern(ptr->pattern, len, &error);
85
+ }
86
+
87
+ /**
88
+ * call-seq:
89
+ * URegexp.new(str [,options])
90
+ * URegexp.new(regexp)
91
+ *
92
+ * Constructs a new regular expression from <i>pattern</i>, which can be either
93
+ * a <code>UString</code> or a <code>URegexp</code>.
94
+ * */
95
+ VALUE
96
+ icu_reg_initialize_m(argc, argv, self)
97
+ int argc;
98
+ VALUE *argv;
99
+ VALUE self;
100
+ {
101
+ const UChar *s;
102
+ int32_t len = 0;
103
+ int flags = 0;
104
+
105
+ if (argc == 0 || argc > 2) {
106
+ rb_raise(rb_eArgError, "wrong number of arguments");
107
+ }
108
+ if (CLASS_OF(argv[0]) == rb_cURegexp) {
109
+ if (argc > 1) {
110
+ rb_warn("flags ignored");
111
+ }
112
+ flags = UREGEX(argv[0])->options;
113
+ s = icu_reg_get_pattern(UREGEX(argv[0]), &len);
114
+ } else {
115
+ Check_Class(argv[0], rb_cUString);
116
+ if (argc == 2) {
117
+ if (FIXNUM_P(argv[1]))
118
+ flags = FIX2INT(argv[1]);
119
+ else if (RTEST(argv[1]))
120
+ flags = UREGEX_CASE_INSENSITIVE;
121
+ }
122
+ s = ICU_PTR(argv[0]);
123
+ len = ICU_LEN(argv[0]);
124
+ }
125
+ icu_reg_initialize(self, s, len, flags);
126
+ return self;
127
+ }
128
+
129
+ VALUE
130
+ icu_reg_new(s, len, options)
131
+ const UChar *s;
132
+ long len;
133
+ int options;
134
+ {
135
+ VALUE re = icu_reg_s_alloc(rb_cURegexp);
136
+ icu_reg_initialize(re, s, len, options);
137
+ return (VALUE) re;
138
+ }
139
+
140
+ VALUE
141
+ icu_reg_clone(obj)
142
+ VALUE obj;
143
+ {
144
+ ICURegexp *regex = UREGEX(obj);
145
+ URegularExpression *old_pattern = UREGEX(obj)->pattern;
146
+ VALUE ret ;
147
+ UErrorCode status = U_ZERO_ERROR;
148
+ URegularExpression * new_pattern = uregex_clone(regex->pattern, &status);
149
+ if(U_FAILURE(status) ){
150
+ rb_raise(rb_eArgError, u_errorName(status));
151
+ }
152
+ ret = icu_reg_s_alloc(rb_cURegexp);
153
+ regex = UREGEX(ret);
154
+ regex->pattern = old_pattern;
155
+ UREGEX(obj)->pattern = new_pattern;
156
+ return ret;
157
+ }
158
+ VALUE
159
+ icu_reg_comp(str)
160
+ VALUE str;
161
+ {
162
+ return icu_reg_new(USTRING(str)->ptr, USTRING(str)->len, 0);
163
+ }
164
+
165
+ /**
166
+ * call-seq:
167
+ * regexp.to_u => URegexp
168
+ *
169
+ * Converts Ruby Regexp to unicode URegexp, assuming it is in UTF8 encoding.
170
+ * $KCODE must be set to 'u' to work reliably
171
+ */
172
+ VALUE icu_reg_from_rb_reg(re)
173
+ VALUE re;
174
+ {
175
+ return icu_reg_comp(icu_from_rstr(0, NULL, rb_funcall(re, rb_intern("to_s"), 0)));
176
+ }
177
+
178
+ /**
179
+ * call-seq:
180
+ * uregex.to_u
181
+ *
182
+ * Returns UString of this URegexp pattern.
183
+ * */
184
+ VALUE
185
+ icu_reg_to_u(self)
186
+ VALUE self;
187
+ {
188
+ int32_t len = 0;
189
+ const UChar *s = icu_reg_get_pattern(UREGEX(self), &len);
190
+ return icu_ustr_new(s, len);
191
+ }
192
+
193
+ /**
194
+ * call-seq:
195
+ * uregex.split(str, limit)
196
+ *
197
+ * Divides <i>str</i> into substrings based on a regexp pattern,
198
+ * returning an array of these substrings. <i>str</i> is divided where the
199
+ * pattern matches.
200
+ * */
201
+ VALUE
202
+ icu_reg_split(self, str, limit)
203
+ VALUE self,
204
+ str,
205
+ limit;
206
+ {
207
+ VALUE splits;
208
+ URegularExpression *theRegEx = UREGEX(self)->pattern;
209
+ UErrorCode error = U_ZERO_ERROR;
210
+ UChar * dest_buf, **dest_fields;
211
+ int32_t limt, req_cap, total, i;
212
+ Check_Class(str, rb_cUString);
213
+ if (limit != Qnil)
214
+ Check_Type(limit, T_FIXNUM);
215
+ dest_buf = ALLOC_N(UChar, USTRING(str)->len * 2 + 2);
216
+ limt = (limit == Qnil ? USTRING(str)->len + 1 : FIX2INT(limit));
217
+ dest_fields = ALLOC_N(UChar *, limt);
218
+ uregex_setText(theRegEx, USTRING(str)->ptr, USTRING(str)->len, &error);
219
+ if (U_FAILURE(error)) {
220
+ free(dest_buf);
221
+ free(dest_fields);
222
+ rb_raise(rb_eArgError, u_errorName(error));
223
+ }
224
+ req_cap = 0;
225
+ total =
226
+ uregex_split(theRegEx, dest_buf, USTRING(str)->len * 2, &req_cap,
227
+ dest_fields, limt, &error);
228
+ if (U_BUFFER_OVERFLOW_ERROR == error) {
229
+ error = U_ZERO_ERROR;
230
+ REALLOC_N( dest_buf, UChar, req_cap);
231
+ total = uregex_split(theRegEx, dest_buf, req_cap, &req_cap, dest_fields, limt, &error);
232
+ }
233
+ if (U_FAILURE(error) ) {
234
+ free(dest_buf);
235
+ free(dest_fields);
236
+ rb_raise(rb_eArgError, u_errorName(error));
237
+ }
238
+ splits = rb_ary_new();
239
+ for (i = 0; i < total; i++)
240
+ rb_ary_push(splits, icu_ustr_new2(dest_fields[i]));
241
+
242
+ free(dest_buf);
243
+ free(dest_fields);
244
+
245
+ return splits;
246
+ }
247
+
248
+ long
249
+ icu_reg_search(re, str, pos, reverse)
250
+ VALUE re,
251
+ str;
252
+ long pos,
253
+ reverse;
254
+ {
255
+ UErrorCode error = U_ZERO_ERROR;
256
+ long cur_pos = 0;
257
+ long start,
258
+ last;
259
+
260
+ if (!reverse) {
261
+ start = pos;
262
+ } else {
263
+ start = 0;
264
+ }
265
+
266
+ uregex_setText(UREGEX(re)->pattern, USTRING(str)->ptr,
267
+ USTRING(str)->len, &error);
268
+ if (U_FAILURE(error))
269
+ rb_raise(rb_eArgError, u_errorName(error));
270
+ if (!uregex_find(UREGEX(re)->pattern, start, &error))
271
+ return -1;
272
+ if (U_FAILURE(error))
273
+ rb_raise(rb_eArgError, u_errorName(error));
274
+ cur_pos = uregex_start(UREGEX(re)->pattern, 0, &error);
275
+ if (reverse) {
276
+ while (uregex_findNext(UREGEX(re)->pattern, &error)) {
277
+ last = uregex_start(UREGEX(re)->pattern, 0, &error);
278
+ error = U_ZERO_ERROR;
279
+ if (reverse && last > pos)
280
+ break;
281
+ cur_pos = last;
282
+ }
283
+ }
284
+ if (reverse && cur_pos > pos)
285
+ return -1;
286
+ return cur_pos;
287
+ }
288
+ long
289
+ icu_group_count(re)
290
+ VALUE re;
291
+ {
292
+ UErrorCode error = U_ZERO_ERROR;
293
+ return uregex_groupCount(UREGEX(re)->pattern, &error);
294
+ }
295
+
296
+ VALUE
297
+ icu_reg_nth_match(re, nth)
298
+ VALUE re;
299
+ long nth;
300
+ {
301
+ URegularExpression *the_expr = UREGEX(re)->pattern;
302
+ UErrorCode error = U_ZERO_ERROR;
303
+ long start, end;
304
+ int32_t len;
305
+ if( nth < 0 ) {
306
+ nth += icu_group_count(re) + 1;
307
+ if(nth<=0) return Qnil;
308
+ }
309
+ start = uregex_start(the_expr, nth, &error);
310
+ if (U_FAILURE(error)) {
311
+ return Qnil;
312
+ }
313
+ end = uregex_end(the_expr, nth, &error);
314
+ len = 0;
315
+ return icu_ustr_new(uregex_getText(the_expr, &len, &error) + start,
316
+ end - start);
317
+ }
318
+
319
+ VALUE
320
+ icu_reg_range(re, nth, start, end)
321
+ VALUE re;
322
+ int nth;
323
+ long *start;
324
+ long *end;
325
+ {
326
+ URegularExpression *the_expr = UREGEX(re)->pattern;
327
+ UErrorCode error = U_ZERO_ERROR;
328
+ if(nth < 0) {
329
+ nth += icu_group_count(re) + 1;
330
+ if(nth <= 0) return Qnil;
331
+ }
332
+ *start = uregex_start(the_expr, nth, &error);
333
+ if (U_FAILURE(error))
334
+ return Qnil;
335
+ *end = uregex_end(the_expr, nth, &error);
336
+ return Qtrue;
337
+ }
338
+
339
+ /**
340
+ * call-seq:
341
+ * uregex.match(str) => matchdata or nil
342
+ * uregex =~ (str) => matchdata or nil
343
+ *
344
+ * Returns a <code>UMatch</code> object describing the match, or
345
+ * <code>nil</code> if there was no match.
346
+ *
347
+ * ure("(.)(.)(.)").match("abc".u)[2] #=> "b"
348
+ */
349
+ VALUE
350
+ icu_reg_match(re, str)
351
+ VALUE re,
352
+ str;
353
+ {
354
+ UErrorCode error = U_ZERO_ERROR;
355
+ Check_Class(str, rb_cUString);
356
+ uregex_setText(UREGEX(re)->pattern, USTRING(str)->ptr,
357
+ USTRING(str)->len, &error);
358
+ if (U_FAILURE(error))
359
+ rb_raise(rb_eArgError, u_errorName(error));
360
+ if (uregex_find(UREGEX(re)->pattern, 0, &error)) {
361
+ return icu_umatch_new(re);
362
+ }
363
+ return Qnil;
364
+ }
365
+
366
+ /**
367
+ * call-seq:
368
+ * rxp === str => true or false
369
+ *
370
+ * Case Equality---Synonym for <code>URegexp#=~</code> used in case statements.
371
+ *
372
+ * a = "HELLO".u
373
+ * case a
374
+ * when ure("^[a-z]*$"); print "Lower case\n"
375
+ * when ure("^[A-Z]*$"); print "Upper case\n"
376
+ * else; print "Mixed case\n"
377
+ * end
378
+ *
379
+ * <em>produces:</em>
380
+ *
381
+ * Upper case
382
+ */
383
+ VALUE
384
+ icu_reg_eqq(re, str)
385
+ VALUE re,
386
+ str;
387
+ {
388
+ long start;
389
+ Check_Class(str, rb_cUString);
390
+ start = icu_reg_search(re, str, 0, 0);
391
+ return start < 0 ? Qfalse : Qtrue;
392
+ }
393
+
394
+
395
+
396
+
397
+ int
398
+ icu_reg_find_next(pat)
399
+ VALUE pat;
400
+ {
401
+ URegularExpression *the_expr = UREGEX(pat)->pattern;
402
+ UErrorCode error = U_ZERO_ERROR;
403
+ return uregex_findNext(the_expr, &error);
404
+ }
405
+
406
+ static const UChar BACKSLASH = 0x5c;
407
+ static const UChar DOLLARSIGN = 0x24;
408
+
409
+ VALUE
410
+ icu_reg_get_replacement(pat, repl_text, prev_end)
411
+ VALUE pat,
412
+ repl_text;
413
+ long prev_end;
414
+ {
415
+ UErrorCode error = U_ZERO_ERROR;
416
+ URegularExpression *the_expr = UREGEX(pat)->pattern;
417
+ VALUE ret = icu_ustr_new(0, 0);
418
+
419
+ /* scan the replacement text, looking for substitutions ($n) and \escapes. */
420
+ int32_t replIdx = 0;
421
+ int32_t replacementLength = ICU_LEN(repl_text);
422
+ UChar *replacementText = ICU_PTR(repl_text);
423
+ int32_t numDigits = 0;
424
+ int32_t groupNum = 0, g_start, g_end;
425
+ UChar32 digitC;
426
+ int32_t len;
427
+ /* following code is rewritten version of code found */
428
+ /* in ICU sources : i18n/regexp.cpp */
429
+ while (replIdx < replacementLength) {
430
+ UChar c = replacementText[replIdx];
431
+ replIdx++;
432
+ if (c != DOLLARSIGN && c != BACKSLASH) {
433
+ /* Common case, no substitution, no escaping, */
434
+ /* just copy the char to the dest buf. */
435
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, replacementText+replIdx-1, 1);
436
+ continue;
437
+ }
438
+
439
+ if (c == BACKSLASH) {
440
+ /* Backslash Escape. Copy the following char out without further checks. */
441
+ /* Note: Surrogate pairs don't need any special handling */
442
+ /* The second half wont be a '$' or a '\', and */
443
+ /* will move to the dest normally on the next */
444
+ /* loop iteration. */
445
+ if (replIdx >= replacementLength) {
446
+ break;
447
+ }
448
+ /* ICU4R : \uxxxx case is removed for simplicity : if (c==0x55 || c==0x75) { */
449
+
450
+ /* Plain backslash escape. Just put out the escaped character. */
451
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, replacementText+replIdx, 1);
452
+ replIdx++;
453
+ continue;
454
+ }
455
+
456
+ /* We've got a $. Pick up a capture group number if one follows. */
457
+ /* Consume at most the number of digits necessary for the largest capture */
458
+ /* number that is valid for this pattern. */
459
+ numDigits = 0;
460
+ groupNum = 0;
461
+
462
+ for (;;) {
463
+ if (replIdx >= replacementLength) {
464
+ break;
465
+ }
466
+ U16_GET(replacementText, 0, replIdx, replacementLength, digitC); /* care surrogates */
467
+ if (u_isdigit(digitC) == FALSE) {
468
+ break;
469
+ }
470
+
471
+ U16_FWD_1(replacementText, replIdx, replacementLength); /* care surrogates */
472
+ groupNum=groupNum*10 + u_charDigitValue(digitC);
473
+ numDigits++;
474
+ if (numDigits >= 3) { /* limit 999 groups */
475
+ break;
476
+ }
477
+ }
478
+
479
+ if (numDigits == 0) {
480
+ /* The $ didn't introduce a group number at all. */
481
+ /* Treat it as just part of the substitution text. */
482
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, &DOLLARSIGN, 1);
483
+ continue;
484
+ }
485
+
486
+ /* Finally, append the capture group data to the destination. */
487
+ error = U_ZERO_ERROR;
488
+ g_start = uregex_start(the_expr, groupNum, &error);
489
+ g_end = uregex_end (the_expr, groupNum, &error);
490
+ if(U_SUCCESS(error) && g_start != -1 ) {
491
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0,
492
+ uregex_getText(the_expr, &len, &error) + g_start, g_end - g_start);
493
+ }
494
+
495
+ }
496
+ return ret;
497
+ }
498
+
499
+ VALUE
500
+ icu_reg_get_prematch(pat, prev_end)
501
+ VALUE pat;
502
+ long prev_end;
503
+ {
504
+ URegularExpression *the_expr = UREGEX(pat)->pattern;
505
+ UErrorCode error = U_ZERO_ERROR;
506
+ int32_t len = 0;
507
+ int32_t cur_start = uregex_start(the_expr, 0, &error);
508
+ const UChar *temp = uregex_getText(the_expr, &len, &error);
509
+ VALUE pm =
510
+ icu_ustr_new(temp + prev_end, cur_start - prev_end);
511
+ return pm;
512
+ }
513
+
514
+ VALUE
515
+ icu_reg_get_tail(pat, prev_end)
516
+ VALUE pat;
517
+ long prev_end;
518
+ {
519
+ UErrorCode error = U_ZERO_ERROR;
520
+ URegularExpression *the_expr = UREGEX(pat)->pattern;
521
+ int32_t len = 0;
522
+ const UChar *temp = uregex_getText(the_expr, &len, &error);
523
+ VALUE pm = icu_ustr_new(temp + prev_end, len - prev_end);
524
+ return pm;
525
+ }
526
+
527
+ /**
528
+ * call-seq:
529
+ * ure(str[, options]) => URegexp
530
+ *
531
+ * Creates URegexp object from UString.
532
+ * */
533
+ VALUE
534
+ icu_reg_from_rb_str(argc, argv, obj)
535
+ int argc;
536
+ VALUE *argv;
537
+ VALUE obj;
538
+ {
539
+ VALUE pat,
540
+ options = Qnil;
541
+ int reg_opts = 0;
542
+ if (rb_scan_args(argc, argv, "11", &pat, &options) == 1) {
543
+ reg_opts = 0;
544
+ } else {
545
+ if (options != Qnil) {
546
+ Check_Type(options, T_FIXNUM);
547
+ reg_opts = FIX2INT(options);
548
+ }
549
+ }
550
+ if (TYPE(pat) == T_STRING)
551
+ pat = icu_from_rstr(0, NULL, pat);
552
+ if (CLASS_OF(pat) != rb_cUString)
553
+ rb_raise(rb_eArgError, "Expected String or UString");
554
+ return icu_reg_new(ICU_PTR(pat), ICU_LEN(pat), reg_opts);
555
+ }
556
+
557
+ /**
558
+ * call-seq:
559
+ * umatch[idx] => string
560
+ *
561
+ * Returns capture group. Group 0 is for full match.
562
+ * */
563
+ VALUE
564
+ icu_umatch_aref(match, index)
565
+ VALUE match,
566
+ index;
567
+ {
568
+ long idx;
569
+ VALUE cg;
570
+ Check_Type(index, T_FIXNUM);
571
+ idx = FIX2LONG(index);
572
+ cg = rb_iv_get(match, "@cg");
573
+ return rb_ary_entry(cg, idx);
574
+ }
575
+
576
+ /**
577
+ * call-seq:
578
+ * umatch.range(idx) => range
579
+ *
580
+ * Returns range (start, end) of capture group. Group 0 is for full match.
581
+ *
582
+ * NOTE: this method returns <b>code unit</b> indexes. To convert this range
583
+ * to <b>code point</b> range use UString#conv_unit_range. If your chars don't
584
+ * require surrogate UTF16 pairs, range will be the same.
585
+ * */
586
+ VALUE
587
+ icu_umatch_range(match, index)
588
+ VALUE match,
589
+ index;
590
+ {
591
+ long idx;
592
+ VALUE cg;
593
+ Check_Type(index, T_FIXNUM);
594
+ idx = FIX2LONG(index);
595
+ cg = rb_iv_get(match, "@ranges");
596
+ return rb_ary_entry(cg, idx);
597
+ }
598
+
599
+
600
+ /**
601
+ * call-seq:
602
+ * umatch.size => fixnum
603
+ *
604
+ * Returns number of capture groups.
605
+ * */
606
+ VALUE
607
+ icu_umatch_size(match)
608
+ VALUE match;
609
+ {
610
+ VALUE cg = rb_iv_get(match, "@cg");
611
+ return LONG2NUM(RARRAY_LEN(cg) - 1);
612
+ }
613
+
614
+
615
+ VALUE
616
+ icu_umatch_init( self, re)
617
+ VALUE self, re;
618
+ {
619
+ UErrorCode status = U_ZERO_ERROR;
620
+ long count, i, cu_start, cu_end;
621
+ URegularExpression * the_regex;
622
+ VALUE obj, groups, ranges;
623
+
624
+ Check_Class(re, rb_cURegexp);
625
+ the_regex = UREGEX(re)->pattern;
626
+ count = icu_group_count(re);
627
+ if (U_FAILURE(status)) {
628
+ rb_raise(rb_eArgError, u_errorName(status));
629
+ }
630
+ groups = rb_ary_new2(count);
631
+ rb_iv_set(self, "@cg", groups);
632
+ for (i = 0; i <= count; i++) {
633
+ obj = icu_reg_nth_match(re, i);
634
+ rb_obj_freeze(obj);
635
+ rb_ary_store(groups, i, obj);
636
+ }
637
+
638
+ ranges = rb_ary_new2(count);
639
+ for ( i = 0; i <= count; i++){
640
+ cu_start = uregex_start(the_regex, i, &status);
641
+ cu_end = uregex_end(the_regex, i, &status);
642
+ if( cu_start == -1) rb_ary_store(ranges, i, Qnil);
643
+ else rb_ary_store(ranges, i, rb_range_new(LONG2NUM(cu_start), LONG2NUM(cu_end-1), 0));
644
+ }
645
+ rb_iv_set(self, "@ranges", ranges);
646
+ return self;
647
+ }
648
+ VALUE icu_umatch_new(re)
649
+ VALUE re;
650
+ {
651
+ return icu_umatch_init(rb_class_new_instance(0, NULL, rb_cUMatch), re);
652
+ }
653
+
654
+
655
+
656
+
657
+ void initialize_uregexp (void)
658
+ {
659
+ /* regular expressions */
660
+ rb_cURegexp = rb_define_class("URegexp", rb_cObject);
661
+ rb_define_alloc_func(rb_cURegexp, icu_reg_s_alloc);
662
+ rb_define_method(rb_cURegexp, "initialize", icu_reg_initialize_m, -1);
663
+ rb_define_method(rb_cURegexp, "to_u", icu_reg_to_u, 0);
664
+ rb_define_method(rb_cURegexp, "match", icu_reg_match, 1);
665
+ rb_define_method(rb_cURegexp, "split", icu_reg_split, 2);
666
+ rb_define_method(rb_cURegexp, "=~", icu_reg_match, 1);
667
+ rb_define_method(rb_cURegexp, "===", icu_reg_eqq, 1);
668
+
669
+ /* Enable case insensitive matching. */
670
+ rb_define_const(rb_cURegexp, "IGNORECASE", INT2FIX(UREGEX_CASE_INSENSITIVE));
671
+ /* Allow white space and comments within patterns */
672
+ rb_define_const(rb_cURegexp, "COMMENTS", INT2FIX(UREGEX_COMMENTS));
673
+ /* Control behavior of "$" and "^" If set, recognize line terminators within string, otherwise, match only at start and end of input string. */
674
+ rb_define_const(rb_cURegexp, "MULTILINE", INT2FIX(UREGEX_MULTILINE));
675
+ /* If set, '.' matches line terminators, otherwise '.' matching stops at line end. */
676
+ rb_define_const(rb_cURegexp, "DOTALL", INT2FIX(UREGEX_DOTALL));
677
+
678
+
679
+ rb_define_global_function("ure", icu_reg_from_rb_str, -1);
680
+
681
+ /**
682
+ * Document-class: UMatch
683
+ *
684
+ * Class to store information about capturing
685
+ * groups. Used in UString#sub, UString#gsub methods, as parameter to
686
+ * passed block.
687
+ */
688
+ rb_cUMatch = rb_define_class("UMatch", rb_cObject);
689
+ rb_define_method(rb_cUMatch, "[]", icu_umatch_aref, 1);
690
+ rb_define_method(rb_cUMatch, "size", icu_umatch_size, 0);
691
+ rb_define_method(rb_cUMatch, "range", icu_umatch_range, 1);
692
+
693
+ rb_define_method(rb_cRegexp, "to_u", icu_reg_from_rb_reg, 0);
694
+ rb_define_alias (rb_cRegexp, "U", "to_u");
695
+ rb_define_alias (rb_cRegexp, "ur", "to_u");
696
+
697
+ }