icu4r_19 1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,168 @@
1
+ #include "icu_common.h"
2
+ extern VALUE rb_cUString;
3
+ extern VALUE icu_ustr_new_set(const UChar * str, long len, long capa);
4
+
5
+ /**
6
+ * call-seq:
7
+ * ary.to_u => anUString
8
+ *
9
+ * Creates UString from array of fixnums, representing Unicode codepoints.
10
+ * (inversion of UString#codepoints)
11
+ *
12
+ * a = "поддержка".to_u.codepoints # => [1087, 1086, 1076, 1076, 1077, 1088, 1078, 1082, 1072]
13
+ * a.to_u # => "поддержка"
14
+ *
15
+ */
16
+ VALUE icu_ustr_from_array(obj)
17
+ VALUE obj;
18
+ {
19
+ int i, n;
20
+ VALUE *p;
21
+ VALUE ret, temp;
22
+ UChar32 * src , *pos, chr;
23
+ UChar * buf;
24
+ int32_t len, capa;
25
+ UErrorCode status = U_ZERO_ERROR;
26
+
27
+ n = RARRAY_LEN(obj);
28
+ p = RARRAY_PTR(obj);
29
+
30
+ src = ALLOC_N(UChar32, n);
31
+ pos = src;
32
+ for ( i = 0; i < n; i++){
33
+ temp = p[i];
34
+ if(TYPE(temp) != T_FIXNUM) {
35
+ free(src);
36
+ rb_raise(rb_eTypeError, "Can't convert from %s", rb_class2name(CLASS_OF(temp)));
37
+ }
38
+ chr = (UChar32) FIX2INT(temp);
39
+ // invalid codepoints are converted to U+FFFD
40
+ if( ! (U_IS_UNICODE_CHAR(chr)) ) {
41
+ chr = 0xFFFD;
42
+ }
43
+ *pos = chr;
44
+ pos ++;
45
+ }
46
+ capa = n+1;
47
+ buf = ALLOC_N(UChar, capa);
48
+ u_strFromUTF32(buf, capa, &len, src, n, &status);
49
+ if( U_BUFFER_OVERFLOW_ERROR == status ){
50
+ capa = len+1;
51
+ REALLOC_N(buf, UChar, capa);
52
+ status = U_ZERO_ERROR;
53
+ u_strFromUTF32(buf, capa, &len, src, n, &status);
54
+ }
55
+ if (U_FAILURE(status) ) {
56
+ free(src);
57
+ free(buf);
58
+ rb_raise(rb_eRuntimeError, u_errorName(status));
59
+ }
60
+ if( capa <= len ){
61
+ ++capa;
62
+ REALLOC_N(buf, UChar, capa);
63
+ }
64
+ ret = icu_ustr_new_set(buf, len, capa);
65
+ free(src);
66
+ return ret;
67
+ }
68
+
69
+ /**
70
+ * call-seq:
71
+ * str.to_u(encoding = 'utf8') => String
72
+ *
73
+ * Converts String value in given encoding to UString.
74
+ * When no encoding is given, utf8 is assumed. If string is not valid UTF8,
75
+ * and no encoding is given, exception is raised.
76
+ *
77
+ * When explicit encoding is given, converter will replace incorrect codepoints
78
+ * with <U+FFFD> - replacement character.
79
+ */
80
+ VALUE
81
+ icu_from_rstr(argc, argv, str)
82
+ int argc;
83
+ VALUE *argv,
84
+ str;
85
+ {
86
+ VALUE enc;
87
+ char *encoding = 0; /* default */
88
+ UErrorCode error = 0;
89
+ int32_t capa, len;
90
+ VALUE s;
91
+ UChar * buf;
92
+ UConverter * conv;
93
+ if (rb_scan_args(argc, argv, "01", &enc) == 1) {
94
+ Check_Type(enc, T_STRING);
95
+ encoding = RSTRING_PTR(enc);
96
+ }
97
+ capa = RSTRING_LEN(str) + 1;
98
+ buf = ALLOC_N(UChar, capa);
99
+
100
+ if(! encoding || !strncmp(encoding, "utf8", 4) ) {
101
+ /* from UTF8 */
102
+ u_strFromUTF8(buf, capa-1, &len, RSTRING_PTR(str), RSTRING_LEN(str), &error);
103
+ if( U_FAILURE(error)) {
104
+ free(buf);
105
+ rb_raise(rb_eArgError, u_errorName(error));
106
+ }
107
+ s = icu_ustr_new_set(buf, len, capa);
108
+ } else {
109
+ conv = ucnv_open(encoding, &error);
110
+ if (U_FAILURE(error)) {
111
+ ucnv_close(conv);
112
+ rb_raise(rb_eArgError, u_errorName(error));
113
+ }
114
+ len = ucnv_toUChars(conv, buf, capa-1, RSTRING_PTR(str),
115
+ RSTRING_LEN(str), &error);
116
+ if (U_BUFFER_OVERFLOW_ERROR == error) {
117
+ capa = len+1;
118
+ REALLOC_N(buf, UChar, capa);
119
+ error = 0;
120
+ len = ucnv_toUChars(conv, buf, capa-1, RSTRING_PTR(str),
121
+ RSTRING_LEN(str), &error);
122
+ if (U_FAILURE(error)) {
123
+ free(buf);
124
+ rb_raise(rb_eArgError, u_errorName(error));
125
+ }
126
+
127
+ }
128
+ s = icu_ustr_new_set(buf, len, capa);
129
+ ucnv_close(conv);
130
+ }
131
+ return s;
132
+ }
133
+
134
+ /**
135
+ * call-seq:
136
+ * u(str, enc = 'utf8') => UString
137
+ *
138
+ * Global function to convert from String to UString
139
+ */
140
+ VALUE
141
+ icu_f_rb_str(argc, argv, obj)
142
+ int argc;
143
+ VALUE *argv;
144
+ VALUE obj;
145
+ {
146
+ VALUE enc;
147
+ VALUE str;
148
+ if (rb_scan_args(argc, argv, "11", &str, &enc) == 2) {
149
+ Check_Type(enc, T_STRING);
150
+ Check_Type(str, T_STRING);
151
+ return icu_from_rstr(1, &enc, str);
152
+ } else {
153
+ Check_Type(str, T_STRING);
154
+ return icu_from_rstr(0, NULL, str);
155
+ }
156
+
157
+ }
158
+
159
+ void initialize_ucore_ext(void)
160
+ {
161
+ /* conversion from String to UString */
162
+ rb_define_method(rb_cString, "to_u", icu_from_rstr, -1);
163
+ rb_define_alias(rb_cString, "u", "to_u");
164
+ rb_define_global_function("u", icu_f_rb_str, -1);
165
+
166
+ /* conversion from Array to UString */
167
+ rb_define_method(rb_cArray, "to_u", icu_ustr_from_array, 0);
168
+ }
@@ -0,0 +1,697 @@
1
+
2
+ /**
3
+ * Document-class: URegexp
4
+ *
5
+ * See [docs/UNICODE_REGEXPS] for details of patterns.
6
+ *
7
+ *
8
+ * Replacement Text
9
+ *
10
+ * The replacement text for find-and-replace operations may contain references to
11
+ * capture-group text from the find. References are of the form $n, where n is the
12
+ * number of the capture group.
13
+ *
14
+ * Character Descriptions
15
+ * $n The text of capture group n will be substituted for $n. n must be >= 0 and not
16
+ * greater than the number of capture groups. A $ not followed by a digit has no special meaning,
17
+ * and will appear in the substitution text as itself, a $.
18
+ * \ Treat the following character as a literal, suppressing any special meaning. Backslash escaping in
19
+ * substitution text is only required for '$' and '\', but may be used on any other character without bad effects.
20
+ *
21
+ *
22
+ * Valid URegexp options are: COMMENTS, MULTILINE, DOTALL, IGNORECASE, which can be OR'ed.
23
+ */
24
+
25
+ #include "icu_common.h"
26
+ extern VALUE rb_cURegexp;
27
+ extern VALUE rb_cUString;
28
+ extern VALUE rb_cUMatch;
29
+ VALUE icu_umatch_aref(VALUE match, VALUE idx);
30
+ VALUE icu_umatch_new (VALUE re);
31
+ extern VALUE icu_ustr_new(const UChar * ptr, long len);
32
+ extern VALUE icu_ustr_new2(const UChar * ptr);
33
+ extern void ustr_splice_units(ICUString * str, long start, long del_len, const UChar * replacement, long repl_len);
34
+ extern VALUE icu_from_rstr(int, VALUE *, VALUE);
35
+
36
+ /* --------- regular expressions */
37
+ void icu_regex_free( ICURegexp *ptr)
38
+ {
39
+ if (ptr->pattern)
40
+ uregex_close(ptr->pattern);
41
+ ptr->pattern = 0;
42
+ free(ptr);
43
+ }
44
+
45
+ VALUE
46
+ icu_reg_s_alloc(klass)
47
+ VALUE klass;
48
+ {
49
+ ICURegexp *ptr = ALLOC_N(ICURegexp, 1);
50
+ ptr->pattern = 0;
51
+ return Data_Wrap_Struct(klass, 0, icu_regex_free, ptr);
52
+ }
53
+
54
+ void
55
+ icu_reg_initialize(obj, s, len, options)
56
+ VALUE obj;
57
+ const UChar *s;
58
+ long len;
59
+ int options;
60
+ {
61
+ UParseError pe;
62
+ UErrorCode status = 0;
63
+ ICURegexp *re = UREGEX(obj);
64
+
65
+ if (re->pattern)
66
+ uregex_close(re->pattern);
67
+ re->pattern = uregex_open(s, len, options, &pe, &status);
68
+ re->options = options;
69
+
70
+ if (U_FAILURE(status))
71
+ rb_raise(rb_eArgError,
72
+ "Wrong regexp: %s line %d column %d flags %d",
73
+ u_errorName(status), pe.line, pe.offset, options);
74
+
75
+ }
76
+
77
+ const UChar *
78
+ icu_reg_get_pattern(ptr, len)
79
+ ICURegexp *ptr;
80
+ int32_t *len;
81
+ {
82
+ UErrorCode error = U_ZERO_ERROR;
83
+ *len = 0;
84
+ return uregex_pattern(ptr->pattern, len, &error);
85
+ }
86
+
87
+ /**
88
+ * call-seq:
89
+ * URegexp.new(str [,options])
90
+ * URegexp.new(regexp)
91
+ *
92
+ * Constructs a new regular expression from <i>pattern</i>, which can be either
93
+ * a <code>UString</code> or a <code>URegexp</code>.
94
+ * */
95
+ VALUE
96
+ icu_reg_initialize_m(argc, argv, self)
97
+ int argc;
98
+ VALUE *argv;
99
+ VALUE self;
100
+ {
101
+ const UChar *s;
102
+ int32_t len = 0;
103
+ int flags = 0;
104
+
105
+ if (argc == 0 || argc > 2) {
106
+ rb_raise(rb_eArgError, "wrong number of arguments");
107
+ }
108
+ if (CLASS_OF(argv[0]) == rb_cURegexp) {
109
+ if (argc > 1) {
110
+ rb_warn("flags ignored");
111
+ }
112
+ flags = UREGEX(argv[0])->options;
113
+ s = icu_reg_get_pattern(UREGEX(argv[0]), &len);
114
+ } else {
115
+ Check_Class(argv[0], rb_cUString);
116
+ if (argc == 2) {
117
+ if (FIXNUM_P(argv[1]))
118
+ flags = FIX2INT(argv[1]);
119
+ else if (RTEST(argv[1]))
120
+ flags = UREGEX_CASE_INSENSITIVE;
121
+ }
122
+ s = ICU_PTR(argv[0]);
123
+ len = ICU_LEN(argv[0]);
124
+ }
125
+ icu_reg_initialize(self, s, len, flags);
126
+ return self;
127
+ }
128
+
129
+ VALUE
130
+ icu_reg_new(s, len, options)
131
+ const UChar *s;
132
+ long len;
133
+ int options;
134
+ {
135
+ VALUE re = icu_reg_s_alloc(rb_cURegexp);
136
+ icu_reg_initialize(re, s, len, options);
137
+ return (VALUE) re;
138
+ }
139
+
140
+ VALUE
141
+ icu_reg_clone(obj)
142
+ VALUE obj;
143
+ {
144
+ ICURegexp *regex = UREGEX(obj);
145
+ URegularExpression *old_pattern = UREGEX(obj)->pattern;
146
+ VALUE ret ;
147
+ UErrorCode status = U_ZERO_ERROR;
148
+ URegularExpression * new_pattern = uregex_clone(regex->pattern, &status);
149
+ if(U_FAILURE(status) ){
150
+ rb_raise(rb_eArgError, u_errorName(status));
151
+ }
152
+ ret = icu_reg_s_alloc(rb_cURegexp);
153
+ regex = UREGEX(ret);
154
+ regex->pattern = old_pattern;
155
+ UREGEX(obj)->pattern = new_pattern;
156
+ return ret;
157
+ }
158
+ VALUE
159
+ icu_reg_comp(str)
160
+ VALUE str;
161
+ {
162
+ return icu_reg_new(USTRING(str)->ptr, USTRING(str)->len, 0);
163
+ }
164
+
165
+ /**
166
+ * call-seq:
167
+ * regexp.to_u => URegexp
168
+ *
169
+ * Converts Ruby Regexp to unicode URegexp, assuming it is in UTF8 encoding.
170
+ * $KCODE must be set to 'u' to work reliably
171
+ */
172
+ VALUE icu_reg_from_rb_reg(re)
173
+ VALUE re;
174
+ {
175
+ return icu_reg_comp(icu_from_rstr(0, NULL, rb_funcall(re, rb_intern("to_s"), 0)));
176
+ }
177
+
178
+ /**
179
+ * call-seq:
180
+ * uregex.to_u
181
+ *
182
+ * Returns UString of this URegexp pattern.
183
+ * */
184
+ VALUE
185
+ icu_reg_to_u(self)
186
+ VALUE self;
187
+ {
188
+ int32_t len = 0;
189
+ const UChar *s = icu_reg_get_pattern(UREGEX(self), &len);
190
+ return icu_ustr_new(s, len);
191
+ }
192
+
193
+ /**
194
+ * call-seq:
195
+ * uregex.split(str, limit)
196
+ *
197
+ * Divides <i>str</i> into substrings based on a regexp pattern,
198
+ * returning an array of these substrings. <i>str</i> is divided where the
199
+ * pattern matches.
200
+ * */
201
+ VALUE
202
+ icu_reg_split(self, str, limit)
203
+ VALUE self,
204
+ str,
205
+ limit;
206
+ {
207
+ VALUE splits;
208
+ URegularExpression *theRegEx = UREGEX(self)->pattern;
209
+ UErrorCode error = U_ZERO_ERROR;
210
+ UChar * dest_buf, **dest_fields;
211
+ int32_t limt, req_cap, total, i;
212
+ Check_Class(str, rb_cUString);
213
+ if (limit != Qnil)
214
+ Check_Type(limit, T_FIXNUM);
215
+ dest_buf = ALLOC_N(UChar, USTRING(str)->len * 2 + 2);
216
+ limt = (limit == Qnil ? USTRING(str)->len + 1 : FIX2INT(limit));
217
+ dest_fields = ALLOC_N(UChar *, limt);
218
+ uregex_setText(theRegEx, USTRING(str)->ptr, USTRING(str)->len, &error);
219
+ if (U_FAILURE(error)) {
220
+ free(dest_buf);
221
+ free(dest_fields);
222
+ rb_raise(rb_eArgError, u_errorName(error));
223
+ }
224
+ req_cap = 0;
225
+ total =
226
+ uregex_split(theRegEx, dest_buf, USTRING(str)->len * 2, &req_cap,
227
+ dest_fields, limt, &error);
228
+ if (U_BUFFER_OVERFLOW_ERROR == error) {
229
+ error = U_ZERO_ERROR;
230
+ REALLOC_N( dest_buf, UChar, req_cap);
231
+ total = uregex_split(theRegEx, dest_buf, req_cap, &req_cap, dest_fields, limt, &error);
232
+ }
233
+ if (U_FAILURE(error) ) {
234
+ free(dest_buf);
235
+ free(dest_fields);
236
+ rb_raise(rb_eArgError, u_errorName(error));
237
+ }
238
+ splits = rb_ary_new();
239
+ for (i = 0; i < total; i++)
240
+ rb_ary_push(splits, icu_ustr_new2(dest_fields[i]));
241
+
242
+ free(dest_buf);
243
+ free(dest_fields);
244
+
245
+ return splits;
246
+ }
247
+
248
+ long
249
+ icu_reg_search(re, str, pos, reverse)
250
+ VALUE re,
251
+ str;
252
+ long pos,
253
+ reverse;
254
+ {
255
+ UErrorCode error = U_ZERO_ERROR;
256
+ long cur_pos = 0;
257
+ long start,
258
+ last;
259
+
260
+ if (!reverse) {
261
+ start = pos;
262
+ } else {
263
+ start = 0;
264
+ }
265
+
266
+ uregex_setText(UREGEX(re)->pattern, USTRING(str)->ptr,
267
+ USTRING(str)->len, &error);
268
+ if (U_FAILURE(error))
269
+ rb_raise(rb_eArgError, u_errorName(error));
270
+ if (!uregex_find(UREGEX(re)->pattern, start, &error))
271
+ return -1;
272
+ if (U_FAILURE(error))
273
+ rb_raise(rb_eArgError, u_errorName(error));
274
+ cur_pos = uregex_start(UREGEX(re)->pattern, 0, &error);
275
+ if (reverse) {
276
+ while (uregex_findNext(UREGEX(re)->pattern, &error)) {
277
+ last = uregex_start(UREGEX(re)->pattern, 0, &error);
278
+ error = U_ZERO_ERROR;
279
+ if (reverse && last > pos)
280
+ break;
281
+ cur_pos = last;
282
+ }
283
+ }
284
+ if (reverse && cur_pos > pos)
285
+ return -1;
286
+ return cur_pos;
287
+ }
288
+ long
289
+ icu_group_count(re)
290
+ VALUE re;
291
+ {
292
+ UErrorCode error = U_ZERO_ERROR;
293
+ return uregex_groupCount(UREGEX(re)->pattern, &error);
294
+ }
295
+
296
+ VALUE
297
+ icu_reg_nth_match(re, nth)
298
+ VALUE re;
299
+ long nth;
300
+ {
301
+ URegularExpression *the_expr = UREGEX(re)->pattern;
302
+ UErrorCode error = U_ZERO_ERROR;
303
+ long start, end;
304
+ int32_t len;
305
+ if( nth < 0 ) {
306
+ nth += icu_group_count(re) + 1;
307
+ if(nth<=0) return Qnil;
308
+ }
309
+ start = uregex_start(the_expr, nth, &error);
310
+ if (U_FAILURE(error)) {
311
+ return Qnil;
312
+ }
313
+ end = uregex_end(the_expr, nth, &error);
314
+ len = 0;
315
+ return icu_ustr_new(uregex_getText(the_expr, &len, &error) + start,
316
+ end - start);
317
+ }
318
+
319
+ VALUE
320
+ icu_reg_range(re, nth, start, end)
321
+ VALUE re;
322
+ int nth;
323
+ long *start;
324
+ long *end;
325
+ {
326
+ URegularExpression *the_expr = UREGEX(re)->pattern;
327
+ UErrorCode error = U_ZERO_ERROR;
328
+ if(nth < 0) {
329
+ nth += icu_group_count(re) + 1;
330
+ if(nth <= 0) return Qnil;
331
+ }
332
+ *start = uregex_start(the_expr, nth, &error);
333
+ if (U_FAILURE(error))
334
+ return Qnil;
335
+ *end = uregex_end(the_expr, nth, &error);
336
+ return Qtrue;
337
+ }
338
+
339
+ /**
340
+ * call-seq:
341
+ * uregex.match(str) => matchdata or nil
342
+ * uregex =~ (str) => matchdata or nil
343
+ *
344
+ * Returns a <code>UMatch</code> object describing the match, or
345
+ * <code>nil</code> if there was no match.
346
+ *
347
+ * ure("(.)(.)(.)").match("abc".u)[2] #=> "b"
348
+ */
349
+ VALUE
350
+ icu_reg_match(re, str)
351
+ VALUE re,
352
+ str;
353
+ {
354
+ UErrorCode error = U_ZERO_ERROR;
355
+ Check_Class(str, rb_cUString);
356
+ uregex_setText(UREGEX(re)->pattern, USTRING(str)->ptr,
357
+ USTRING(str)->len, &error);
358
+ if (U_FAILURE(error))
359
+ rb_raise(rb_eArgError, u_errorName(error));
360
+ if (uregex_find(UREGEX(re)->pattern, 0, &error)) {
361
+ return icu_umatch_new(re);
362
+ }
363
+ return Qnil;
364
+ }
365
+
366
+ /**
367
+ * call-seq:
368
+ * rxp === str => true or false
369
+ *
370
+ * Case Equality---Synonym for <code>URegexp#=~</code> used in case statements.
371
+ *
372
+ * a = "HELLO".u
373
+ * case a
374
+ * when ure("^[a-z]*$"); print "Lower case\n"
375
+ * when ure("^[A-Z]*$"); print "Upper case\n"
376
+ * else; print "Mixed case\n"
377
+ * end
378
+ *
379
+ * <em>produces:</em>
380
+ *
381
+ * Upper case
382
+ */
383
+ VALUE
384
+ icu_reg_eqq(re, str)
385
+ VALUE re,
386
+ str;
387
+ {
388
+ long start;
389
+ Check_Class(str, rb_cUString);
390
+ start = icu_reg_search(re, str, 0, 0);
391
+ return start < 0 ? Qfalse : Qtrue;
392
+ }
393
+
394
+
395
+
396
+
397
+ int
398
+ icu_reg_find_next(pat)
399
+ VALUE pat;
400
+ {
401
+ URegularExpression *the_expr = UREGEX(pat)->pattern;
402
+ UErrorCode error = U_ZERO_ERROR;
403
+ return uregex_findNext(the_expr, &error);
404
+ }
405
+
406
+ static const UChar BACKSLASH = 0x5c;
407
+ static const UChar DOLLARSIGN = 0x24;
408
+
409
+ VALUE
410
+ icu_reg_get_replacement(pat, repl_text, prev_end)
411
+ VALUE pat,
412
+ repl_text;
413
+ long prev_end;
414
+ {
415
+ UErrorCode error = U_ZERO_ERROR;
416
+ URegularExpression *the_expr = UREGEX(pat)->pattern;
417
+ VALUE ret = icu_ustr_new(0, 0);
418
+
419
+ /* scan the replacement text, looking for substitutions ($n) and \escapes. */
420
+ int32_t replIdx = 0;
421
+ int32_t replacementLength = ICU_LEN(repl_text);
422
+ UChar *replacementText = ICU_PTR(repl_text);
423
+ int32_t numDigits = 0;
424
+ int32_t groupNum = 0, g_start, g_end;
425
+ UChar32 digitC;
426
+ int32_t len;
427
+ /* following code is rewritten version of code found */
428
+ /* in ICU sources : i18n/regexp.cpp */
429
+ while (replIdx < replacementLength) {
430
+ UChar c = replacementText[replIdx];
431
+ replIdx++;
432
+ if (c != DOLLARSIGN && c != BACKSLASH) {
433
+ /* Common case, no substitution, no escaping, */
434
+ /* just copy the char to the dest buf. */
435
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, replacementText+replIdx-1, 1);
436
+ continue;
437
+ }
438
+
439
+ if (c == BACKSLASH) {
440
+ /* Backslash Escape. Copy the following char out without further checks. */
441
+ /* Note: Surrogate pairs don't need any special handling */
442
+ /* The second half wont be a '$' or a '\', and */
443
+ /* will move to the dest normally on the next */
444
+ /* loop iteration. */
445
+ if (replIdx >= replacementLength) {
446
+ break;
447
+ }
448
+ /* ICU4R : \uxxxx case is removed for simplicity : if (c==0x55 || c==0x75) { */
449
+
450
+ /* Plain backslash escape. Just put out the escaped character. */
451
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, replacementText+replIdx, 1);
452
+ replIdx++;
453
+ continue;
454
+ }
455
+
456
+ /* We've got a $. Pick up a capture group number if one follows. */
457
+ /* Consume at most the number of digits necessary for the largest capture */
458
+ /* number that is valid for this pattern. */
459
+ numDigits = 0;
460
+ groupNum = 0;
461
+
462
+ for (;;) {
463
+ if (replIdx >= replacementLength) {
464
+ break;
465
+ }
466
+ U16_GET(replacementText, 0, replIdx, replacementLength, digitC); /* care surrogates */
467
+ if (u_isdigit(digitC) == FALSE) {
468
+ break;
469
+ }
470
+
471
+ U16_FWD_1(replacementText, replIdx, replacementLength); /* care surrogates */
472
+ groupNum=groupNum*10 + u_charDigitValue(digitC);
473
+ numDigits++;
474
+ if (numDigits >= 3) { /* limit 999 groups */
475
+ break;
476
+ }
477
+ }
478
+
479
+ if (numDigits == 0) {
480
+ /* The $ didn't introduce a group number at all. */
481
+ /* Treat it as just part of the substitution text. */
482
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, &DOLLARSIGN, 1);
483
+ continue;
484
+ }
485
+
486
+ /* Finally, append the capture group data to the destination. */
487
+ error = U_ZERO_ERROR;
488
+ g_start = uregex_start(the_expr, groupNum, &error);
489
+ g_end = uregex_end (the_expr, groupNum, &error);
490
+ if(U_SUCCESS(error) && g_start != -1 ) {
491
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0,
492
+ uregex_getText(the_expr, &len, &error) + g_start, g_end - g_start);
493
+ }
494
+
495
+ }
496
+ return ret;
497
+ }
498
+
499
+ VALUE
500
+ icu_reg_get_prematch(pat, prev_end)
501
+ VALUE pat;
502
+ long prev_end;
503
+ {
504
+ URegularExpression *the_expr = UREGEX(pat)->pattern;
505
+ UErrorCode error = U_ZERO_ERROR;
506
+ int32_t len = 0;
507
+ int32_t cur_start = uregex_start(the_expr, 0, &error);
508
+ const UChar *temp = uregex_getText(the_expr, &len, &error);
509
+ VALUE pm =
510
+ icu_ustr_new(temp + prev_end, cur_start - prev_end);
511
+ return pm;
512
+ }
513
+
514
+ VALUE
515
+ icu_reg_get_tail(pat, prev_end)
516
+ VALUE pat;
517
+ long prev_end;
518
+ {
519
+ UErrorCode error = U_ZERO_ERROR;
520
+ URegularExpression *the_expr = UREGEX(pat)->pattern;
521
+ int32_t len = 0;
522
+ const UChar *temp = uregex_getText(the_expr, &len, &error);
523
+ VALUE pm = icu_ustr_new(temp + prev_end, len - prev_end);
524
+ return pm;
525
+ }
526
+
527
+ /**
528
+ * call-seq:
529
+ * ure(str[, options]) => URegexp
530
+ *
531
+ * Creates URegexp object from UString.
532
+ * */
533
+ VALUE
534
+ icu_reg_from_rb_str(argc, argv, obj)
535
+ int argc;
536
+ VALUE *argv;
537
+ VALUE obj;
538
+ {
539
+ VALUE pat,
540
+ options = Qnil;
541
+ int reg_opts = 0;
542
+ if (rb_scan_args(argc, argv, "11", &pat, &options) == 1) {
543
+ reg_opts = 0;
544
+ } else {
545
+ if (options != Qnil) {
546
+ Check_Type(options, T_FIXNUM);
547
+ reg_opts = FIX2INT(options);
548
+ }
549
+ }
550
+ if (TYPE(pat) == T_STRING)
551
+ pat = icu_from_rstr(0, NULL, pat);
552
+ if (CLASS_OF(pat) != rb_cUString)
553
+ rb_raise(rb_eArgError, "Expected String or UString");
554
+ return icu_reg_new(ICU_PTR(pat), ICU_LEN(pat), reg_opts);
555
+ }
556
+
557
+ /**
558
+ * call-seq:
559
+ * umatch[idx] => string
560
+ *
561
+ * Returns capture group. Group 0 is for full match.
562
+ * */
563
+ VALUE
564
+ icu_umatch_aref(match, index)
565
+ VALUE match,
566
+ index;
567
+ {
568
+ long idx;
569
+ VALUE cg;
570
+ Check_Type(index, T_FIXNUM);
571
+ idx = FIX2LONG(index);
572
+ cg = rb_iv_get(match, "@cg");
573
+ return rb_ary_entry(cg, idx);
574
+ }
575
+
576
+ /**
577
+ * call-seq:
578
+ * umatch.range(idx) => range
579
+ *
580
+ * Returns range (start, end) of capture group. Group 0 is for full match.
581
+ *
582
+ * NOTE: this method returns <b>code unit</b> indexes. To convert this range
583
+ * to <b>code point</b> range use UString#conv_unit_range. If your chars don't
584
+ * require surrogate UTF16 pairs, range will be the same.
585
+ * */
586
+ VALUE
587
+ icu_umatch_range(match, index)
588
+ VALUE match,
589
+ index;
590
+ {
591
+ long idx;
592
+ VALUE cg;
593
+ Check_Type(index, T_FIXNUM);
594
+ idx = FIX2LONG(index);
595
+ cg = rb_iv_get(match, "@ranges");
596
+ return rb_ary_entry(cg, idx);
597
+ }
598
+
599
+
600
+ /**
601
+ * call-seq:
602
+ * umatch.size => fixnum
603
+ *
604
+ * Returns number of capture groups.
605
+ * */
606
+ VALUE
607
+ icu_umatch_size(match)
608
+ VALUE match;
609
+ {
610
+ VALUE cg = rb_iv_get(match, "@cg");
611
+ return LONG2NUM(RARRAY_LEN(cg) - 1);
612
+ }
613
+
614
+
615
+ VALUE
616
+ icu_umatch_init( self, re)
617
+ VALUE self, re;
618
+ {
619
+ UErrorCode status = U_ZERO_ERROR;
620
+ long count, i, cu_start, cu_end;
621
+ URegularExpression * the_regex;
622
+ VALUE obj, groups, ranges;
623
+
624
+ Check_Class(re, rb_cURegexp);
625
+ the_regex = UREGEX(re)->pattern;
626
+ count = icu_group_count(re);
627
+ if (U_FAILURE(status)) {
628
+ rb_raise(rb_eArgError, u_errorName(status));
629
+ }
630
+ groups = rb_ary_new2(count);
631
+ rb_iv_set(self, "@cg", groups);
632
+ for (i = 0; i <= count; i++) {
633
+ obj = icu_reg_nth_match(re, i);
634
+ rb_obj_freeze(obj);
635
+ rb_ary_store(groups, i, obj);
636
+ }
637
+
638
+ ranges = rb_ary_new2(count);
639
+ for ( i = 0; i <= count; i++){
640
+ cu_start = uregex_start(the_regex, i, &status);
641
+ cu_end = uregex_end(the_regex, i, &status);
642
+ if( cu_start == -1) rb_ary_store(ranges, i, Qnil);
643
+ else rb_ary_store(ranges, i, rb_range_new(LONG2NUM(cu_start), LONG2NUM(cu_end-1), 0));
644
+ }
645
+ rb_iv_set(self, "@ranges", ranges);
646
+ return self;
647
+ }
648
+ VALUE icu_umatch_new(re)
649
+ VALUE re;
650
+ {
651
+ return icu_umatch_init(rb_class_new_instance(0, NULL, rb_cUMatch), re);
652
+ }
653
+
654
+
655
+
656
+
657
+ void initialize_uregexp (void)
658
+ {
659
+ /* regular expressions */
660
+ rb_cURegexp = rb_define_class("URegexp", rb_cObject);
661
+ rb_define_alloc_func(rb_cURegexp, icu_reg_s_alloc);
662
+ rb_define_method(rb_cURegexp, "initialize", icu_reg_initialize_m, -1);
663
+ rb_define_method(rb_cURegexp, "to_u", icu_reg_to_u, 0);
664
+ rb_define_method(rb_cURegexp, "match", icu_reg_match, 1);
665
+ rb_define_method(rb_cURegexp, "split", icu_reg_split, 2);
666
+ rb_define_method(rb_cURegexp, "=~", icu_reg_match, 1);
667
+ rb_define_method(rb_cURegexp, "===", icu_reg_eqq, 1);
668
+
669
+ /* Enable case insensitive matching. */
670
+ rb_define_const(rb_cURegexp, "IGNORECASE", INT2FIX(UREGEX_CASE_INSENSITIVE));
671
+ /* Allow white space and comments within patterns */
672
+ rb_define_const(rb_cURegexp, "COMMENTS", INT2FIX(UREGEX_COMMENTS));
673
+ /* Control behavior of "$" and "^" If set, recognize line terminators within string, otherwise, match only at start and end of input string. */
674
+ rb_define_const(rb_cURegexp, "MULTILINE", INT2FIX(UREGEX_MULTILINE));
675
+ /* If set, '.' matches line terminators, otherwise '.' matching stops at line end. */
676
+ rb_define_const(rb_cURegexp, "DOTALL", INT2FIX(UREGEX_DOTALL));
677
+
678
+
679
+ rb_define_global_function("ure", icu_reg_from_rb_str, -1);
680
+
681
+ /**
682
+ * Document-class: UMatch
683
+ *
684
+ * Class to store information about capturing
685
+ * groups. Used in UString#sub, UString#gsub methods, as parameter to
686
+ * passed block.
687
+ */
688
+ rb_cUMatch = rb_define_class("UMatch", rb_cObject);
689
+ rb_define_method(rb_cUMatch, "[]", icu_umatch_aref, 1);
690
+ rb_define_method(rb_cUMatch, "size", icu_umatch_size, 0);
691
+ rb_define_method(rb_cUMatch, "range", icu_umatch_range, 1);
692
+
693
+ rb_define_method(rb_cRegexp, "to_u", icu_reg_from_rb_reg, 0);
694
+ rb_define_alias (rb_cRegexp, "U", "to_u");
695
+ rb_define_alias (rb_cRegexp, "ur", "to_u");
696
+
697
+ }