icu4r 0.1.3.2006.01.26

Sign up to get free protection for your applications and to get access to all the features.
data/uregex.c ADDED
@@ -0,0 +1,673 @@
1
+
2
+ /**
3
+ * Document-class: URegexp
4
+ *
5
+ * See [docs/UNICODE_REGEXPS] for details of patterns.
6
+ *
7
+ *
8
+ * Replacement Text
9
+ *
10
+ * The replacement text for find-and-replace operations may contain references to
11
+ * capture-group text from the find. References are of the form $n, where n is the
12
+ * number of the capture group.
13
+ *
14
+ * Character Descriptions
15
+ * $n The text of capture group n will be substituted for $n. n must be >= 0 and not
16
+ * greater than the number of capture groups. A $ not followed by a digit has no special meaning,
17
+ * and will appear in the substitution text as itself, a $.
18
+ * \ Treat the following character as a literal, suppressing any special meaning. Backslash escaping in
19
+ * substitution text is only required for '$' and '\', but may be used on any other character without bad effects.
20
+ *
21
+ *
22
+ * Valid URegexp options are: COMMENTS, MULTILINE, DOTALL, IGNORECASE, which can be OR'ed.
23
+ */
24
+
25
+ #include "icu_common.h"
26
+ extern VALUE rb_cURegexp;
27
+ extern VALUE rb_cUString;
28
+ extern VALUE rb_cUMatch;
29
+ VALUE icu_umatch_aref(VALUE match, VALUE idx);
30
+ VALUE icu_umatch_new (VALUE re);
31
+ extern VALUE icu_ustr_new(const UChar * ptr, long len);
32
+ extern VALUE icu_ustr_new2(const UChar * ptr);
33
+ extern void ustr_splice_units(ICUString * str, long start, long del_len, const UChar * replacement, long repl_len);
34
+ extern VALUE icu_from_rstr(int, VALUE *, VALUE);
35
+
36
+ /* --------- regular expressions */
37
+ void icu_regex_free( ICURegexp *ptr)
38
+ {
39
+ if (ptr->pattern)
40
+ uregex_close(ptr->pattern);
41
+ ptr->pattern = 0;
42
+ free(ptr);
43
+ }
44
+
45
+ VALUE
46
+ icu_reg_s_alloc(klass)
47
+ VALUE klass;
48
+ {
49
+ ICURegexp *ptr = ALLOC_N(ICURegexp, 1);
50
+ ptr->pattern = 0;
51
+ return Data_Wrap_Struct(klass, 0, icu_regex_free, ptr);
52
+ }
53
+
54
+ void
55
+ icu_reg_initialize(obj, s, len, options)
56
+ VALUE obj;
57
+ const UChar *s;
58
+ long len;
59
+ int options;
60
+ {
61
+ UParseError pe;
62
+ UErrorCode status = 0;
63
+ ICURegexp *re = UREGEX(obj);
64
+
65
+ if (re->pattern)
66
+ uregex_close(re->pattern);
67
+ re->pattern = uregex_open(s, len, options, &pe, &status);
68
+ re->options = options;
69
+
70
+ if (U_FAILURE(status))
71
+ rb_raise(rb_eArgError,
72
+ "Wrong regexp: %s line %d column %d flags %d",
73
+ u_errorName(status), pe.line, pe.offset, options);
74
+
75
+ }
76
+
77
+ const UChar *
78
+ icu_reg_get_pattern(ptr, len)
79
+ ICURegexp *ptr;
80
+ int32_t *len;
81
+ {
82
+ UErrorCode error = 0;
83
+ *len = 0;
84
+ return uregex_pattern(ptr->pattern, len, &error);
85
+ }
86
+
87
+ /**
88
+ * call-seq:
89
+ * URegexp.new(str [,options])
90
+ * URegexp.new(regexp)
91
+ *
92
+ * Constructs a new regular expression from <i>pattern</i>, which can be either
93
+ * a <code>UString</code> or a <code>URegexp</code>.
94
+ * */
95
+ VALUE
96
+ icu_reg_initialize_m(argc, argv, self)
97
+ int argc;
98
+ VALUE *argv;
99
+ VALUE self;
100
+ {
101
+ const UChar *s;
102
+ int32_t len = 0;
103
+ int flags = 0;
104
+
105
+ if (argc == 0 || argc > 2) {
106
+ rb_raise(rb_eArgError, "wrong number of arguments");
107
+ }
108
+ if (CLASS_OF(argv[0]) == rb_cURegexp) {
109
+ if (argc > 1) {
110
+ rb_warn("flags ignored");
111
+ }
112
+ flags = UREGEX(argv[0])->options;
113
+ s = icu_reg_get_pattern(UREGEX(argv[0]), &len);
114
+ } else {
115
+ Check_Class(argv[0], rb_cUString);
116
+ if (argc == 2) {
117
+ if (FIXNUM_P(argv[1]))
118
+ flags = FIX2INT(argv[1]);
119
+ else if (RTEST(argv[1]))
120
+ flags = UREGEX_CASE_INSENSITIVE;
121
+ }
122
+ s = ICU_PTR(argv[0]);
123
+ len = ICU_LEN(argv[0]);
124
+ }
125
+ icu_reg_initialize(self, s, len, flags);
126
+ return self;
127
+ }
128
+
129
+ VALUE
130
+ icu_reg_new(s, len, options)
131
+ const UChar *s;
132
+ long len;
133
+ int options;
134
+ {
135
+ VALUE re = icu_reg_s_alloc(rb_cURegexp);
136
+ icu_reg_initialize(re, s, len, options);
137
+ return (VALUE) re;
138
+ }
139
+
140
+ VALUE
141
+ icu_reg_clone(obj)
142
+ VALUE obj;
143
+ {
144
+ ICURegexp *regex = UREGEX(obj);
145
+ URegularExpression *old_pattern = UREGEX(obj)->pattern;
146
+ VALUE ret ;
147
+ UErrorCode status = U_ZERO_ERROR;
148
+ URegularExpression * new_pattern = uregex_clone(regex->pattern, &status);
149
+ if(U_FAILURE(status) ){
150
+ rb_raise(rb_eArgError, u_errorName(status));
151
+ }
152
+ ret = icu_reg_s_alloc(rb_cURegexp);
153
+ regex = UREGEX(ret);
154
+ regex->pattern = old_pattern;
155
+ UREGEX(obj)->pattern = new_pattern;
156
+ return ret;
157
+ }
158
+ VALUE
159
+ icu_reg_comp(str)
160
+ VALUE str;
161
+ {
162
+ return icu_reg_new(USTRING(str)->ptr, USTRING(str)->len, 0);
163
+ }
164
+
165
+ /**
166
+ * call-seq:
167
+ * regexp.to_u => URegexp
168
+ *
169
+ * Converts Ruby Regexp to unicode URegexp, assuming it is in UTF8 encoding.
170
+ * $KCODE must be set to 'u' to work reliably
171
+ */
172
+ VALUE icu_reg_from_rb_reg(re)
173
+ VALUE re;
174
+ {
175
+ return icu_reg_comp(icu_from_rstr(0, NULL, rb_funcall(re, rb_intern("to_s"), 0)));
176
+ }
177
+
178
+ /**
179
+ * call-seq:
180
+ * uregex.to_u
181
+ *
182
+ * Returns UString of this URegexp pattern.
183
+ * */
184
+ VALUE
185
+ icu_reg_to_u(self)
186
+ VALUE self;
187
+ {
188
+ int32_t len = 0;
189
+ const UChar *s = icu_reg_get_pattern(UREGEX(self), &len);
190
+ return icu_ustr_new(s, len);
191
+ }
192
+
193
+ /**
194
+ * call-seq:
195
+ * uregex.split(str, limit)
196
+ *
197
+ * Divides <i>str</i> into substrings based on a regexp pattern,
198
+ * returning an array of these substrings. <i>str</i> is divided where the
199
+ * pattern matches.
200
+ * */
201
+ VALUE
202
+ icu_reg_split(self, str, limit)
203
+ VALUE self,
204
+ str,
205
+ limit;
206
+ {
207
+ VALUE splits;
208
+ URegularExpression *theRegEx = UREGEX(self)->pattern;
209
+ UErrorCode error = 0;
210
+ UChar * dest_buf, **dest_fields;
211
+ int32_t limt, req_cap, total, i;
212
+ Check_Class(str, rb_cUString);
213
+ if (limit != Qnil)
214
+ Check_Type(limit, T_FIXNUM);
215
+ splits = rb_ary_new();
216
+ dest_buf = ALLOCA_N(UChar, USTRING(str)->len * 2);
217
+ dest_fields = ALLOCA_N(UChar *, USTRING(str)->len);
218
+ limt = limit == Qnil ? USTRING(str)->len : FIX2INT(limit);
219
+ uregex_setText(theRegEx, USTRING(str)->ptr, USTRING(str)->len, &error);
220
+ if (U_FAILURE(error))
221
+ rb_raise(rb_eArgError, u_errorName(error));
222
+ req_cap = 0;
223
+ total =
224
+ uregex_split(theRegEx, dest_buf, USTRING(str)->len * 2, &req_cap,
225
+ dest_fields, limt, &error);
226
+ if (U_FAILURE(error))
227
+ rb_raise(rb_eArgError, u_errorName(error));
228
+
229
+ for (i = 0; i < total; i++)
230
+ rb_ary_push(splits, icu_ustr_new2(dest_fields[i]));
231
+ return splits;
232
+ }
233
+
234
+ long
235
+ icu_reg_search(re, str, pos, reverse)
236
+ VALUE re,
237
+ str;
238
+ long pos,
239
+ reverse;
240
+ {
241
+ UErrorCode error = 0;
242
+ long cur_pos = 0;
243
+ long start,
244
+ last;
245
+
246
+ if (!reverse) {
247
+ start = pos;
248
+ } else {
249
+ start = 0;
250
+ }
251
+
252
+ uregex_setText(UREGEX(re)->pattern, USTRING(str)->ptr,
253
+ USTRING(str)->len, &error);
254
+ if (U_FAILURE(error))
255
+ rb_raise(rb_eArgError, u_errorName(error));
256
+ if (!uregex_find(UREGEX(re)->pattern, start, &error))
257
+ return -1;
258
+ if (U_FAILURE(error))
259
+ rb_raise(rb_eArgError, u_errorName(error));
260
+ cur_pos = uregex_start(UREGEX(re)->pattern, 0, &error);
261
+ if (reverse) {
262
+ while (uregex_findNext(UREGEX(re)->pattern, &error)) {
263
+ last = uregex_start(UREGEX(re)->pattern, 0, &error);
264
+ error = 0;
265
+ if (reverse && last > pos)
266
+ break;
267
+ cur_pos = last;
268
+ }
269
+ }
270
+ if (reverse && cur_pos > pos)
271
+ return -1;
272
+ return cur_pos;
273
+ }
274
+
275
+ VALUE
276
+ icu_reg_nth_match(re, nth)
277
+ VALUE re;
278
+ long nth;
279
+ {
280
+ URegularExpression *the_expr = UREGEX(re)->pattern;
281
+ UErrorCode error = 0;
282
+ long start = uregex_start(the_expr, nth, &error), end;
283
+ int32_t len;
284
+ if (U_FAILURE(error)) {
285
+ return Qnil;
286
+ }
287
+ end = uregex_end(the_expr, nth, &error);
288
+ len = 0;
289
+ return icu_ustr_new(uregex_getText(the_expr, &len, &error) + start,
290
+ end - start);
291
+ }
292
+
293
+ VALUE
294
+ icu_reg_range(re, nth, start, end)
295
+ VALUE re;
296
+ int nth;
297
+ long *start;
298
+ long *end;
299
+ {
300
+ URegularExpression *the_expr = UREGEX(re)->pattern;
301
+ UErrorCode error = 0;
302
+ *start = uregex_start(the_expr, nth, &error);
303
+ if (U_FAILURE(error))
304
+ return Qnil;
305
+ *end = uregex_end(the_expr, nth, &error);
306
+ return Qtrue;
307
+ }
308
+
309
+ /**
310
+ * call-seq:
311
+ * uregex.match(str) => matchdata or nil
312
+ * uregex =~ (str) => matchdata or nil
313
+ *
314
+ * Returns a <code>UMatch</code> object describing the match, or
315
+ * <code>nil</code> if there was no match.
316
+ *
317
+ * ure("(.)(.)(.)").match("abc".u)[2] #=> "b"
318
+ */
319
+ VALUE
320
+ icu_reg_match(re, str)
321
+ VALUE re,
322
+ str;
323
+ {
324
+ UErrorCode error = 0;
325
+ Check_Class(str, rb_cUString);
326
+ uregex_setText(UREGEX(re)->pattern, USTRING(str)->ptr,
327
+ USTRING(str)->len, &error);
328
+ if (U_FAILURE(error))
329
+ rb_raise(rb_eArgError, u_errorName(error));
330
+ if (uregex_find(UREGEX(re)->pattern, 0, &error)) {
331
+ return icu_umatch_new(re);
332
+ }
333
+ return Qnil;
334
+ }
335
+
336
+ /**
337
+ * call-seq:
338
+ * rxp === str => true or false
339
+ *
340
+ * Case Equality---Synonym for <code>URegexp#=~</code> used in case statements.
341
+ *
342
+ * a = "HELLO".u
343
+ * case a
344
+ * when ure("^[a-z]*$"); print "Lower case\n"
345
+ * when ure("^[A-Z]*$"); print "Upper case\n"
346
+ * else; print "Mixed case\n"
347
+ * end
348
+ *
349
+ * <em>produces:</em>
350
+ *
351
+ * Upper case
352
+ */
353
+ VALUE
354
+ icu_reg_eqq(re, str)
355
+ VALUE re,
356
+ str;
357
+ {
358
+ long start;
359
+ Check_Class(str, rb_cUString);
360
+ start = icu_reg_search(re, str, 0, 0);
361
+ return start < 0 ? Qfalse : Qtrue;
362
+ }
363
+
364
+
365
+ long
366
+ icu_group_count(re)
367
+ VALUE re;
368
+ {
369
+ UErrorCode error = 0;
370
+ return uregex_groupCount(UREGEX(re)->pattern, &error);
371
+ }
372
+
373
+ int
374
+ icu_reg_find_next(pat)
375
+ VALUE pat;
376
+ {
377
+ URegularExpression *the_expr = UREGEX(pat)->pattern;
378
+ UErrorCode error = 0;
379
+ return uregex_findNext(the_expr, &error);
380
+ }
381
+
382
+ static const UChar BACKSLASH = 0x5c;
383
+ static const UChar DOLLARSIGN = 0x24;
384
+
385
+ VALUE
386
+ icu_reg_get_replacement(pat, repl_text, prev_end)
387
+ VALUE pat,
388
+ repl_text;
389
+ long prev_end;
390
+ {
391
+ UErrorCode error = U_ZERO_ERROR;
392
+ URegularExpression *the_expr = UREGEX(pat)->pattern;
393
+ VALUE ret = icu_ustr_new(0, 0);
394
+
395
+ /* scan the replacement text, looking for substitutions ($n) and \escapes. */
396
+ int32_t replIdx = 0;
397
+ int32_t replacementLength = ICU_LEN(repl_text);
398
+ UChar *replacementText = ICU_PTR(repl_text);
399
+ int32_t numDigits = 0;
400
+ int32_t groupNum = 0, g_start, g_end;
401
+ UChar32 digitC;
402
+ int32_t len;
403
+ /* following code is rewritten version of code found */
404
+ /* in ICU sources : i18n/regexp.cpp */
405
+ while (replIdx < replacementLength) {
406
+ UChar c = replacementText[replIdx];
407
+ replIdx++;
408
+ if (c != DOLLARSIGN && c != BACKSLASH) {
409
+ /* Common case, no substitution, no escaping, */
410
+ /* just copy the char to the dest buf. */
411
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, replacementText+replIdx-1, 1);
412
+ continue;
413
+ }
414
+
415
+ if (c == BACKSLASH) {
416
+ /* Backslash Escape. Copy the following char out without further checks. */
417
+ /* Note: Surrogate pairs don't need any special handling */
418
+ /* The second half wont be a '$' or a '\', and */
419
+ /* will move to the dest normally on the next */
420
+ /* loop iteration. */
421
+ if (replIdx >= replacementLength) {
422
+ break;
423
+ }
424
+ /* ICU4R : \uxxxx case is removed for simplicity : if (c==0x55 || c==0x75) { */
425
+
426
+ /* Plain backslash escape. Just put out the escaped character. */
427
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, replacementText+replIdx, 1);
428
+ replIdx++;
429
+ continue;
430
+ }
431
+
432
+ /* We've got a $. Pick up a capture group number if one follows. */
433
+ /* Consume at most the number of digits necessary for the largest capture */
434
+ /* number that is valid for this pattern. */
435
+ numDigits = 0;
436
+ groupNum = 0;
437
+
438
+ for (;;) {
439
+ if (replIdx >= replacementLength) {
440
+ break;
441
+ }
442
+ U16_GET(replacementText, 0, replIdx, replacementLength, digitC); /* care surrogates */
443
+ if (u_isdigit(digitC) == FALSE) {
444
+ break;
445
+ }
446
+
447
+ U16_FWD_1(replacementText, replIdx, replacementLength); /* care surrogates */
448
+ groupNum=groupNum*10 + u_charDigitValue(digitC);
449
+ numDigits++;
450
+ if (numDigits >= 3) { /* limit 999 groups */
451
+ break;
452
+ }
453
+ }
454
+
455
+ if (numDigits == 0) {
456
+ /* The $ didn't introduce a group number at all. */
457
+ /* Treat it as just part of the substitution text. */
458
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, &DOLLARSIGN, 1);
459
+ continue;
460
+ }
461
+
462
+ /* Finally, append the capture group data to the destination. */
463
+ error = U_ZERO_ERROR;
464
+ g_start = uregex_start(the_expr, groupNum, &error);
465
+ g_end = uregex_end (the_expr, groupNum, &error);
466
+ if(U_SUCCESS(error) && g_start != -1 ) {
467
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0,
468
+ uregex_getText(the_expr, &len, &error) + g_start, g_end - g_start);
469
+ }
470
+
471
+ }
472
+ return ret;
473
+ }
474
+
475
+ VALUE
476
+ icu_reg_get_prematch(pat, prev_end)
477
+ VALUE pat;
478
+ long prev_end;
479
+ {
480
+ URegularExpression *the_expr = UREGEX(pat)->pattern;
481
+ UErrorCode error = 0;
482
+ int32_t len = 0;
483
+ int32_t cur_start = uregex_start(the_expr, 0, &error);
484
+ const UChar *temp = uregex_getText(the_expr, &len, &error);
485
+ VALUE pm =
486
+ icu_ustr_new(temp + prev_end, cur_start - prev_end);
487
+ return pm;
488
+ }
489
+
490
+ VALUE
491
+ icu_reg_get_tail(pat, prev_end)
492
+ VALUE pat;
493
+ long prev_end;
494
+ {
495
+ UErrorCode error = U_ZERO_ERROR;
496
+ URegularExpression *the_expr = UREGEX(pat)->pattern;
497
+ int32_t len = 0;
498
+ const UChar *temp = uregex_getText(the_expr, &len, &error);
499
+ VALUE pm = icu_ustr_new(temp + prev_end, len - prev_end);
500
+ return pm;
501
+ }
502
+
503
+ /**
504
+ * call-seq:
505
+ * ure(str[, options]) => URegexp
506
+ *
507
+ * Creates URegexp object from UString.
508
+ * */
509
+ VALUE
510
+ icu_reg_from_rb_str(argc, argv, obj)
511
+ int argc;
512
+ VALUE *argv;
513
+ VALUE obj;
514
+ {
515
+ VALUE pat,
516
+ options = Qnil;
517
+ int reg_opts = 0;
518
+ if (rb_scan_args(argc, argv, "11", &pat, &options) == 1) {
519
+ reg_opts = 0;
520
+ } else {
521
+ if (options != Qnil) {
522
+ Check_Type(options, T_FIXNUM);
523
+ reg_opts = FIX2INT(options);
524
+ }
525
+ }
526
+ if (TYPE(pat) == T_STRING)
527
+ pat = icu_from_rstr(0, NULL, pat);
528
+ if (CLASS_OF(pat) != rb_cUString)
529
+ rb_raise(rb_eArgError, "Expected String or UString");
530
+ return icu_reg_new(ICU_PTR(pat), ICU_LEN(pat), reg_opts);
531
+ }
532
+
533
+ /**
534
+ * call-seq:
535
+ * umatch[idx] => string
536
+ *
537
+ * Returns capture group. Group 0 is for full match.
538
+ * */
539
+ VALUE
540
+ icu_umatch_aref(match, index)
541
+ VALUE match,
542
+ index;
543
+ {
544
+ long idx;
545
+ VALUE cg;
546
+ Check_Type(index, T_FIXNUM);
547
+ idx = FIX2LONG(index);
548
+ cg = rb_iv_get(match, "@cg");
549
+ return rb_ary_entry(cg, idx);
550
+ }
551
+
552
+ /**
553
+ * call-seq:
554
+ * umatch.range(idx) => range
555
+ *
556
+ * Returns range (start, end) of capture group. Group 0 is for full match.
557
+ *
558
+ * NOTE: this method returns <b>code unit</b> indexes. To convert this range
559
+ * to <b>code point</b> range use UString#conv_unit_range. If your chars don't
560
+ * require surrogate UTF16 pairs, range will be the same.
561
+ * */
562
+ VALUE
563
+ icu_umatch_range(match, index)
564
+ VALUE match,
565
+ index;
566
+ {
567
+ long idx;
568
+ VALUE cg;
569
+ Check_Type(index, T_FIXNUM);
570
+ idx = FIX2LONG(index);
571
+ cg = rb_iv_get(match, "@ranges");
572
+ return rb_ary_entry(cg, idx);
573
+ }
574
+
575
+
576
+ /**
577
+ * call-seq:
578
+ * umatch.size => fixnum
579
+ *
580
+ * Returns number of capture groups.
581
+ * */
582
+ VALUE
583
+ icu_umatch_size(match)
584
+ VALUE match;
585
+ {
586
+ VALUE cg = rb_iv_get(match, "@cg");
587
+ return LONG2NUM(RARRAY(cg)->len - 1);
588
+ }
589
+
590
+
591
+ VALUE
592
+ icu_umatch_init( self, re)
593
+ VALUE self, re;
594
+ {
595
+ UErrorCode status = U_ZERO_ERROR;
596
+ long count, i, cu_start, cu_end;
597
+ URegularExpression * the_regex;
598
+ VALUE obj, groups, ranges;
599
+
600
+ Check_Class(re, rb_cURegexp);
601
+ the_regex = UREGEX(re)->pattern;
602
+ count = uregex_groupCount(the_regex, &status);
603
+ if (U_FAILURE(status)) {
604
+ rb_raise(rb_eArgError, u_errorName(status));
605
+ }
606
+ groups = rb_ary_new2(count);
607
+ rb_iv_set(self, "@cg", groups);
608
+ for (i = 0; i <= count; i++) {
609
+ obj = icu_reg_nth_match(re, i);
610
+ rb_obj_freeze(obj);
611
+ rb_ary_store(groups, i, obj);
612
+ }
613
+
614
+ ranges = rb_ary_new2(count);
615
+ for ( i = 0; i <= count; i++){
616
+ cu_start = uregex_start(the_regex, i, &status);
617
+ cu_end = uregex_end(the_regex, i, &status);
618
+ if( cu_start == -1) rb_ary_store(ranges, i, Qnil);
619
+ else rb_ary_store(ranges, i, rb_range_new(LONG2NUM(cu_start), LONG2NUM(cu_end-1), 0));
620
+ }
621
+ rb_iv_set(self, "@ranges", ranges);
622
+ return self;
623
+ }
624
+ VALUE icu_umatch_new(re)
625
+ VALUE re;
626
+ {
627
+ return icu_umatch_init(rb_class_new_instance(0, NULL, rb_cUMatch), re);
628
+ }
629
+
630
+
631
+
632
+
633
+ void initialize_uregexp (void)
634
+ {
635
+ /* regular expressions */
636
+ rb_cURegexp = rb_define_class("URegexp", rb_cObject);
637
+ rb_define_alloc_func(rb_cURegexp, icu_reg_s_alloc);
638
+ rb_define_method(rb_cURegexp, "initialize", icu_reg_initialize_m, -1);
639
+ rb_define_method(rb_cURegexp, "to_u", icu_reg_to_u, 0);
640
+ rb_define_method(rb_cURegexp, "match", icu_reg_match, 1);
641
+ rb_define_method(rb_cURegexp, "split", icu_reg_split, 2);
642
+ rb_define_method(rb_cURegexp, "=~", icu_reg_match, 1);
643
+ rb_define_method(rb_cURegexp, "===", icu_reg_eqq, 1);
644
+
645
+ /* Enable case insensitive matching. */
646
+ rb_define_const(rb_cURegexp, "IGNORECASE", INT2FIX(UREGEX_CASE_INSENSITIVE));
647
+ /* Allow white space and comments within patterns */
648
+ rb_define_const(rb_cURegexp, "COMMENTS", INT2FIX(UREGEX_COMMENTS));
649
+ /* Control behavior of "$" and "^" If set, recognize line terminators within string, otherwise, match only at start and end of input string. */
650
+ rb_define_const(rb_cURegexp, "MULTILINE", INT2FIX(UREGEX_MULTILINE));
651
+ /* If set, '.' matches line terminators, otherwise '.' matching stops at line end. */
652
+ rb_define_const(rb_cURegexp, "DOTALL", INT2FIX(UREGEX_DOTALL));
653
+
654
+
655
+ rb_define_global_function("ure", icu_reg_from_rb_str, -1);
656
+
657
+ /**
658
+ * Document-class: UMatch
659
+ *
660
+ * Class to store information about capturing
661
+ * groups. Used in UString#sub, UString#gsub methods, as parameter to
662
+ * passed block.
663
+ */
664
+ rb_cUMatch = rb_define_class("UMatch", rb_cObject);
665
+ rb_define_method(rb_cUMatch, "[]", icu_umatch_aref, 1);
666
+ rb_define_method(rb_cUMatch, "size", icu_umatch_size, 0);
667
+ rb_define_method(rb_cUMatch, "range", icu_umatch_range, 1);
668
+
669
+ rb_define_method(rb_cRegexp, "to_u", icu_reg_from_rb_reg, 0);
670
+ rb_define_alias (rb_cRegexp, "U", "to_u");
671
+ rb_define_alias (rb_cRegexp, "ur", "to_u");
672
+
673
+ }
data/uregex.h ADDED
@@ -0,0 +1,27 @@
1
+ extern void icu_regex_free (ICURegexp *ptr);
2
+ extern VALUE icu_reg_s_alloc (VALUE klass);
3
+ extern VALUE icu_reg_initialize_m (int argc, VALUE *argv, VALUE self);
4
+ extern VALUE icu_reg_new (UChar *s, long len, int options) ;
5
+ extern VALUE icu_reg_clone (VALUE obj);
6
+ extern VALUE icu_reg_comp (VALUE str);
7
+ extern VALUE icu_reg_from_rb_reg (VALUE re);
8
+ extern VALUE icu_reg_to_u (VALUE self);
9
+ extern VALUE icu_reg_split (VALUE self, VALUE str, VALUE limit);
10
+ extern VALUE icu_reg_nth_match (VALUE re, long nth);
11
+ extern VALUE icu_reg_range (VALUE re, int nth, long *start, long *end);
12
+ extern VALUE icu_reg_match (VALUE re, VALUE str);
13
+ extern VALUE icu_reg_eqq (VALUE re, VALUE str);
14
+ extern int icu_reg_find_next (VALUE pat);
15
+ extern VALUE icu_reg_get_replacement (VALUE pat, VALUE repl_text, long prev_end);
16
+ extern VALUE icu_reg_get_prematch (VALUE pat, long prev_end);
17
+ extern VALUE icu_reg_get_tail (VALUE pat, long prev_end);
18
+ extern VALUE icu_reg_from_rb_str (int argc, VALUE *argv, VALUE obj);
19
+ extern VALUE icu_umatch_range (VALUE match, VALUE index);
20
+ extern VALUE icu_umatch_size (VALUE match);
21
+ extern VALUE icu_umatch_init (VALUE self, VALUE re);
22
+ extern VALUE icu_umatch_aref (VALUE match, VALUE idx);
23
+ extern VALUE icu_umatch_new (VALUE re);
24
+ extern long icu_group_count(VALUE re);
25
+ extern long icu_reg_search(VALUE re, VALUE str, int pos, int reverse);
26
+
27
+ extern void initialize_uregexp (void);