icu4r 0.1.3.2006.01.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/uregex.c ADDED
@@ -0,0 +1,673 @@
1
+
2
+ /**
3
+ * Document-class: URegexp
4
+ *
5
+ * See [docs/UNICODE_REGEXPS] for details of patterns.
6
+ *
7
+ *
8
+ * Replacement Text
9
+ *
10
+ * The replacement text for find-and-replace operations may contain references to
11
+ * capture-group text from the find. References are of the form $n, where n is the
12
+ * number of the capture group.
13
+ *
14
+ * Character Descriptions
15
+ * $n The text of capture group n will be substituted for $n. n must be >= 0 and not
16
+ * greater than the number of capture groups. A $ not followed by a digit has no special meaning,
17
+ * and will appear in the substitution text as itself, a $.
18
+ * \ Treat the following character as a literal, suppressing any special meaning. Backslash escaping in
19
+ * substitution text is only required for '$' and '\', but may be used on any other character without bad effects.
20
+ *
21
+ *
22
+ * Valid URegexp options are: COMMENTS, MULTILINE, DOTALL, IGNORECASE, which can be OR'ed.
23
+ */
24
+
25
+ #include "icu_common.h"
26
+ extern VALUE rb_cURegexp;
27
+ extern VALUE rb_cUString;
28
+ extern VALUE rb_cUMatch;
29
+ VALUE icu_umatch_aref(VALUE match, VALUE idx);
30
+ VALUE icu_umatch_new (VALUE re);
31
+ extern VALUE icu_ustr_new(const UChar * ptr, long len);
32
+ extern VALUE icu_ustr_new2(const UChar * ptr);
33
+ extern void ustr_splice_units(ICUString * str, long start, long del_len, const UChar * replacement, long repl_len);
34
+ extern VALUE icu_from_rstr(int, VALUE *, VALUE);
35
+
36
+ /* --------- regular expressions */
37
+ void icu_regex_free( ICURegexp *ptr)
38
+ {
39
+ if (ptr->pattern)
40
+ uregex_close(ptr->pattern);
41
+ ptr->pattern = 0;
42
+ free(ptr);
43
+ }
44
+
45
+ VALUE
46
+ icu_reg_s_alloc(klass)
47
+ VALUE klass;
48
+ {
49
+ ICURegexp *ptr = ALLOC_N(ICURegexp, 1);
50
+ ptr->pattern = 0;
51
+ return Data_Wrap_Struct(klass, 0, icu_regex_free, ptr);
52
+ }
53
+
54
+ void
55
+ icu_reg_initialize(obj, s, len, options)
56
+ VALUE obj;
57
+ const UChar *s;
58
+ long len;
59
+ int options;
60
+ {
61
+ UParseError pe;
62
+ UErrorCode status = 0;
63
+ ICURegexp *re = UREGEX(obj);
64
+
65
+ if (re->pattern)
66
+ uregex_close(re->pattern);
67
+ re->pattern = uregex_open(s, len, options, &pe, &status);
68
+ re->options = options;
69
+
70
+ if (U_FAILURE(status))
71
+ rb_raise(rb_eArgError,
72
+ "Wrong regexp: %s line %d column %d flags %d",
73
+ u_errorName(status), pe.line, pe.offset, options);
74
+
75
+ }
76
+
77
+ const UChar *
78
+ icu_reg_get_pattern(ptr, len)
79
+ ICURegexp *ptr;
80
+ int32_t *len;
81
+ {
82
+ UErrorCode error = 0;
83
+ *len = 0;
84
+ return uregex_pattern(ptr->pattern, len, &error);
85
+ }
86
+
87
+ /**
88
+ * call-seq:
89
+ * URegexp.new(str [,options])
90
+ * URegexp.new(regexp)
91
+ *
92
+ * Constructs a new regular expression from <i>pattern</i>, which can be either
93
+ * a <code>UString</code> or a <code>URegexp</code>.
94
+ * */
95
+ VALUE
96
+ icu_reg_initialize_m(argc, argv, self)
97
+ int argc;
98
+ VALUE *argv;
99
+ VALUE self;
100
+ {
101
+ const UChar *s;
102
+ int32_t len = 0;
103
+ int flags = 0;
104
+
105
+ if (argc == 0 || argc > 2) {
106
+ rb_raise(rb_eArgError, "wrong number of arguments");
107
+ }
108
+ if (CLASS_OF(argv[0]) == rb_cURegexp) {
109
+ if (argc > 1) {
110
+ rb_warn("flags ignored");
111
+ }
112
+ flags = UREGEX(argv[0])->options;
113
+ s = icu_reg_get_pattern(UREGEX(argv[0]), &len);
114
+ } else {
115
+ Check_Class(argv[0], rb_cUString);
116
+ if (argc == 2) {
117
+ if (FIXNUM_P(argv[1]))
118
+ flags = FIX2INT(argv[1]);
119
+ else if (RTEST(argv[1]))
120
+ flags = UREGEX_CASE_INSENSITIVE;
121
+ }
122
+ s = ICU_PTR(argv[0]);
123
+ len = ICU_LEN(argv[0]);
124
+ }
125
+ icu_reg_initialize(self, s, len, flags);
126
+ return self;
127
+ }
128
+
129
+ VALUE
130
+ icu_reg_new(s, len, options)
131
+ const UChar *s;
132
+ long len;
133
+ int options;
134
+ {
135
+ VALUE re = icu_reg_s_alloc(rb_cURegexp);
136
+ icu_reg_initialize(re, s, len, options);
137
+ return (VALUE) re;
138
+ }
139
+
140
+ VALUE
141
+ icu_reg_clone(obj)
142
+ VALUE obj;
143
+ {
144
+ ICURegexp *regex = UREGEX(obj);
145
+ URegularExpression *old_pattern = UREGEX(obj)->pattern;
146
+ VALUE ret ;
147
+ UErrorCode status = U_ZERO_ERROR;
148
+ URegularExpression * new_pattern = uregex_clone(regex->pattern, &status);
149
+ if(U_FAILURE(status) ){
150
+ rb_raise(rb_eArgError, u_errorName(status));
151
+ }
152
+ ret = icu_reg_s_alloc(rb_cURegexp);
153
+ regex = UREGEX(ret);
154
+ regex->pattern = old_pattern;
155
+ UREGEX(obj)->pattern = new_pattern;
156
+ return ret;
157
+ }
158
+ VALUE
159
+ icu_reg_comp(str)
160
+ VALUE str;
161
+ {
162
+ return icu_reg_new(USTRING(str)->ptr, USTRING(str)->len, 0);
163
+ }
164
+
165
+ /**
166
+ * call-seq:
167
+ * regexp.to_u => URegexp
168
+ *
169
+ * Converts Ruby Regexp to unicode URegexp, assuming it is in UTF8 encoding.
170
+ * $KCODE must be set to 'u' to work reliably
171
+ */
172
+ VALUE icu_reg_from_rb_reg(re)
173
+ VALUE re;
174
+ {
175
+ return icu_reg_comp(icu_from_rstr(0, NULL, rb_funcall(re, rb_intern("to_s"), 0)));
176
+ }
177
+
178
+ /**
179
+ * call-seq:
180
+ * uregex.to_u
181
+ *
182
+ * Returns UString of this URegexp pattern.
183
+ * */
184
+ VALUE
185
+ icu_reg_to_u(self)
186
+ VALUE self;
187
+ {
188
+ int32_t len = 0;
189
+ const UChar *s = icu_reg_get_pattern(UREGEX(self), &len);
190
+ return icu_ustr_new(s, len);
191
+ }
192
+
193
+ /**
194
+ * call-seq:
195
+ * uregex.split(str, limit)
196
+ *
197
+ * Divides <i>str</i> into substrings based on a regexp pattern,
198
+ * returning an array of these substrings. <i>str</i> is divided where the
199
+ * pattern matches.
200
+ * */
201
+ VALUE
202
+ icu_reg_split(self, str, limit)
203
+ VALUE self,
204
+ str,
205
+ limit;
206
+ {
207
+ VALUE splits;
208
+ URegularExpression *theRegEx = UREGEX(self)->pattern;
209
+ UErrorCode error = 0;
210
+ UChar * dest_buf, **dest_fields;
211
+ int32_t limt, req_cap, total, i;
212
+ Check_Class(str, rb_cUString);
213
+ if (limit != Qnil)
214
+ Check_Type(limit, T_FIXNUM);
215
+ splits = rb_ary_new();
216
+ dest_buf = ALLOCA_N(UChar, USTRING(str)->len * 2);
217
+ dest_fields = ALLOCA_N(UChar *, USTRING(str)->len);
218
+ limt = limit == Qnil ? USTRING(str)->len : FIX2INT(limit);
219
+ uregex_setText(theRegEx, USTRING(str)->ptr, USTRING(str)->len, &error);
220
+ if (U_FAILURE(error))
221
+ rb_raise(rb_eArgError, u_errorName(error));
222
+ req_cap = 0;
223
+ total =
224
+ uregex_split(theRegEx, dest_buf, USTRING(str)->len * 2, &req_cap,
225
+ dest_fields, limt, &error);
226
+ if (U_FAILURE(error))
227
+ rb_raise(rb_eArgError, u_errorName(error));
228
+
229
+ for (i = 0; i < total; i++)
230
+ rb_ary_push(splits, icu_ustr_new2(dest_fields[i]));
231
+ return splits;
232
+ }
233
+
234
+ long
235
+ icu_reg_search(re, str, pos, reverse)
236
+ VALUE re,
237
+ str;
238
+ long pos,
239
+ reverse;
240
+ {
241
+ UErrorCode error = 0;
242
+ long cur_pos = 0;
243
+ long start,
244
+ last;
245
+
246
+ if (!reverse) {
247
+ start = pos;
248
+ } else {
249
+ start = 0;
250
+ }
251
+
252
+ uregex_setText(UREGEX(re)->pattern, USTRING(str)->ptr,
253
+ USTRING(str)->len, &error);
254
+ if (U_FAILURE(error))
255
+ rb_raise(rb_eArgError, u_errorName(error));
256
+ if (!uregex_find(UREGEX(re)->pattern, start, &error))
257
+ return -1;
258
+ if (U_FAILURE(error))
259
+ rb_raise(rb_eArgError, u_errorName(error));
260
+ cur_pos = uregex_start(UREGEX(re)->pattern, 0, &error);
261
+ if (reverse) {
262
+ while (uregex_findNext(UREGEX(re)->pattern, &error)) {
263
+ last = uregex_start(UREGEX(re)->pattern, 0, &error);
264
+ error = 0;
265
+ if (reverse && last > pos)
266
+ break;
267
+ cur_pos = last;
268
+ }
269
+ }
270
+ if (reverse && cur_pos > pos)
271
+ return -1;
272
+ return cur_pos;
273
+ }
274
+
275
+ VALUE
276
+ icu_reg_nth_match(re, nth)
277
+ VALUE re;
278
+ long nth;
279
+ {
280
+ URegularExpression *the_expr = UREGEX(re)->pattern;
281
+ UErrorCode error = 0;
282
+ long start = uregex_start(the_expr, nth, &error), end;
283
+ int32_t len;
284
+ if (U_FAILURE(error)) {
285
+ return Qnil;
286
+ }
287
+ end = uregex_end(the_expr, nth, &error);
288
+ len = 0;
289
+ return icu_ustr_new(uregex_getText(the_expr, &len, &error) + start,
290
+ end - start);
291
+ }
292
+
293
+ VALUE
294
+ icu_reg_range(re, nth, start, end)
295
+ VALUE re;
296
+ int nth;
297
+ long *start;
298
+ long *end;
299
+ {
300
+ URegularExpression *the_expr = UREGEX(re)->pattern;
301
+ UErrorCode error = 0;
302
+ *start = uregex_start(the_expr, nth, &error);
303
+ if (U_FAILURE(error))
304
+ return Qnil;
305
+ *end = uregex_end(the_expr, nth, &error);
306
+ return Qtrue;
307
+ }
308
+
309
+ /**
310
+ * call-seq:
311
+ * uregex.match(str) => matchdata or nil
312
+ * uregex =~ (str) => matchdata or nil
313
+ *
314
+ * Returns a <code>UMatch</code> object describing the match, or
315
+ * <code>nil</code> if there was no match.
316
+ *
317
+ * ure("(.)(.)(.)").match("abc".u)[2] #=> "b"
318
+ */
319
+ VALUE
320
+ icu_reg_match(re, str)
321
+ VALUE re,
322
+ str;
323
+ {
324
+ UErrorCode error = 0;
325
+ Check_Class(str, rb_cUString);
326
+ uregex_setText(UREGEX(re)->pattern, USTRING(str)->ptr,
327
+ USTRING(str)->len, &error);
328
+ if (U_FAILURE(error))
329
+ rb_raise(rb_eArgError, u_errorName(error));
330
+ if (uregex_find(UREGEX(re)->pattern, 0, &error)) {
331
+ return icu_umatch_new(re);
332
+ }
333
+ return Qnil;
334
+ }
335
+
336
+ /**
337
+ * call-seq:
338
+ * rxp === str => true or false
339
+ *
340
+ * Case Equality---Synonym for <code>URegexp#=~</code> used in case statements.
341
+ *
342
+ * a = "HELLO".u
343
+ * case a
344
+ * when ure("^[a-z]*$"); print "Lower case\n"
345
+ * when ure("^[A-Z]*$"); print "Upper case\n"
346
+ * else; print "Mixed case\n"
347
+ * end
348
+ *
349
+ * <em>produces:</em>
350
+ *
351
+ * Upper case
352
+ */
353
+ VALUE
354
+ icu_reg_eqq(re, str)
355
+ VALUE re,
356
+ str;
357
+ {
358
+ long start;
359
+ Check_Class(str, rb_cUString);
360
+ start = icu_reg_search(re, str, 0, 0);
361
+ return start < 0 ? Qfalse : Qtrue;
362
+ }
363
+
364
+
365
+ long
366
+ icu_group_count(re)
367
+ VALUE re;
368
+ {
369
+ UErrorCode error = 0;
370
+ return uregex_groupCount(UREGEX(re)->pattern, &error);
371
+ }
372
+
373
+ int
374
+ icu_reg_find_next(pat)
375
+ VALUE pat;
376
+ {
377
+ URegularExpression *the_expr = UREGEX(pat)->pattern;
378
+ UErrorCode error = 0;
379
+ return uregex_findNext(the_expr, &error);
380
+ }
381
+
382
+ static const UChar BACKSLASH = 0x5c;
383
+ static const UChar DOLLARSIGN = 0x24;
384
+
385
+ VALUE
386
+ icu_reg_get_replacement(pat, repl_text, prev_end)
387
+ VALUE pat,
388
+ repl_text;
389
+ long prev_end;
390
+ {
391
+ UErrorCode error = U_ZERO_ERROR;
392
+ URegularExpression *the_expr = UREGEX(pat)->pattern;
393
+ VALUE ret = icu_ustr_new(0, 0);
394
+
395
+ /* scan the replacement text, looking for substitutions ($n) and \escapes. */
396
+ int32_t replIdx = 0;
397
+ int32_t replacementLength = ICU_LEN(repl_text);
398
+ UChar *replacementText = ICU_PTR(repl_text);
399
+ int32_t numDigits = 0;
400
+ int32_t groupNum = 0, g_start, g_end;
401
+ UChar32 digitC;
402
+ int32_t len;
403
+ /* following code is rewritten version of code found */
404
+ /* in ICU sources : i18n/regexp.cpp */
405
+ while (replIdx < replacementLength) {
406
+ UChar c = replacementText[replIdx];
407
+ replIdx++;
408
+ if (c != DOLLARSIGN && c != BACKSLASH) {
409
+ /* Common case, no substitution, no escaping, */
410
+ /* just copy the char to the dest buf. */
411
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, replacementText+replIdx-1, 1);
412
+ continue;
413
+ }
414
+
415
+ if (c == BACKSLASH) {
416
+ /* Backslash Escape. Copy the following char out without further checks. */
417
+ /* Note: Surrogate pairs don't need any special handling */
418
+ /* The second half wont be a '$' or a '\', and */
419
+ /* will move to the dest normally on the next */
420
+ /* loop iteration. */
421
+ if (replIdx >= replacementLength) {
422
+ break;
423
+ }
424
+ /* ICU4R : \uxxxx case is removed for simplicity : if (c==0x55 || c==0x75) { */
425
+
426
+ /* Plain backslash escape. Just put out the escaped character. */
427
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, replacementText+replIdx, 1);
428
+ replIdx++;
429
+ continue;
430
+ }
431
+
432
+ /* We've got a $. Pick up a capture group number if one follows. */
433
+ /* Consume at most the number of digits necessary for the largest capture */
434
+ /* number that is valid for this pattern. */
435
+ numDigits = 0;
436
+ groupNum = 0;
437
+
438
+ for (;;) {
439
+ if (replIdx >= replacementLength) {
440
+ break;
441
+ }
442
+ U16_GET(replacementText, 0, replIdx, replacementLength, digitC); /* care surrogates */
443
+ if (u_isdigit(digitC) == FALSE) {
444
+ break;
445
+ }
446
+
447
+ U16_FWD_1(replacementText, replIdx, replacementLength); /* care surrogates */
448
+ groupNum=groupNum*10 + u_charDigitValue(digitC);
449
+ numDigits++;
450
+ if (numDigits >= 3) { /* limit 999 groups */
451
+ break;
452
+ }
453
+ }
454
+
455
+ if (numDigits == 0) {
456
+ /* The $ didn't introduce a group number at all. */
457
+ /* Treat it as just part of the substitution text. */
458
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, &DOLLARSIGN, 1);
459
+ continue;
460
+ }
461
+
462
+ /* Finally, append the capture group data to the destination. */
463
+ error = U_ZERO_ERROR;
464
+ g_start = uregex_start(the_expr, groupNum, &error);
465
+ g_end = uregex_end (the_expr, groupNum, &error);
466
+ if(U_SUCCESS(error) && g_start != -1 ) {
467
+ ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0,
468
+ uregex_getText(the_expr, &len, &error) + g_start, g_end - g_start);
469
+ }
470
+
471
+ }
472
+ return ret;
473
+ }
474
+
475
+ VALUE
476
+ icu_reg_get_prematch(pat, prev_end)
477
+ VALUE pat;
478
+ long prev_end;
479
+ {
480
+ URegularExpression *the_expr = UREGEX(pat)->pattern;
481
+ UErrorCode error = 0;
482
+ int32_t len = 0;
483
+ int32_t cur_start = uregex_start(the_expr, 0, &error);
484
+ const UChar *temp = uregex_getText(the_expr, &len, &error);
485
+ VALUE pm =
486
+ icu_ustr_new(temp + prev_end, cur_start - prev_end);
487
+ return pm;
488
+ }
489
+
490
+ VALUE
491
+ icu_reg_get_tail(pat, prev_end)
492
+ VALUE pat;
493
+ long prev_end;
494
+ {
495
+ UErrorCode error = U_ZERO_ERROR;
496
+ URegularExpression *the_expr = UREGEX(pat)->pattern;
497
+ int32_t len = 0;
498
+ const UChar *temp = uregex_getText(the_expr, &len, &error);
499
+ VALUE pm = icu_ustr_new(temp + prev_end, len - prev_end);
500
+ return pm;
501
+ }
502
+
503
+ /**
504
+ * call-seq:
505
+ * ure(str[, options]) => URegexp
506
+ *
507
+ * Creates URegexp object from UString.
508
+ * */
509
+ VALUE
510
+ icu_reg_from_rb_str(argc, argv, obj)
511
+ int argc;
512
+ VALUE *argv;
513
+ VALUE obj;
514
+ {
515
+ VALUE pat,
516
+ options = Qnil;
517
+ int reg_opts = 0;
518
+ if (rb_scan_args(argc, argv, "11", &pat, &options) == 1) {
519
+ reg_opts = 0;
520
+ } else {
521
+ if (options != Qnil) {
522
+ Check_Type(options, T_FIXNUM);
523
+ reg_opts = FIX2INT(options);
524
+ }
525
+ }
526
+ if (TYPE(pat) == T_STRING)
527
+ pat = icu_from_rstr(0, NULL, pat);
528
+ if (CLASS_OF(pat) != rb_cUString)
529
+ rb_raise(rb_eArgError, "Expected String or UString");
530
+ return icu_reg_new(ICU_PTR(pat), ICU_LEN(pat), reg_opts);
531
+ }
532
+
533
+ /**
534
+ * call-seq:
535
+ * umatch[idx] => string
536
+ *
537
+ * Returns capture group. Group 0 is for full match.
538
+ * */
539
+ VALUE
540
+ icu_umatch_aref(match, index)
541
+ VALUE match,
542
+ index;
543
+ {
544
+ long idx;
545
+ VALUE cg;
546
+ Check_Type(index, T_FIXNUM);
547
+ idx = FIX2LONG(index);
548
+ cg = rb_iv_get(match, "@cg");
549
+ return rb_ary_entry(cg, idx);
550
+ }
551
+
552
+ /**
553
+ * call-seq:
554
+ * umatch.range(idx) => range
555
+ *
556
+ * Returns range (start, end) of capture group. Group 0 is for full match.
557
+ *
558
+ * NOTE: this method returns <b>code unit</b> indexes. To convert this range
559
+ * to <b>code point</b> range use UString#conv_unit_range. If your chars don't
560
+ * require surrogate UTF16 pairs, range will be the same.
561
+ * */
562
+ VALUE
563
+ icu_umatch_range(match, index)
564
+ VALUE match,
565
+ index;
566
+ {
567
+ long idx;
568
+ VALUE cg;
569
+ Check_Type(index, T_FIXNUM);
570
+ idx = FIX2LONG(index);
571
+ cg = rb_iv_get(match, "@ranges");
572
+ return rb_ary_entry(cg, idx);
573
+ }
574
+
575
+
576
+ /**
577
+ * call-seq:
578
+ * umatch.size => fixnum
579
+ *
580
+ * Returns number of capture groups.
581
+ * */
582
+ VALUE
583
+ icu_umatch_size(match)
584
+ VALUE match;
585
+ {
586
+ VALUE cg = rb_iv_get(match, "@cg");
587
+ return LONG2NUM(RARRAY(cg)->len - 1);
588
+ }
589
+
590
+
591
+ VALUE
592
+ icu_umatch_init( self, re)
593
+ VALUE self, re;
594
+ {
595
+ UErrorCode status = U_ZERO_ERROR;
596
+ long count, i, cu_start, cu_end;
597
+ URegularExpression * the_regex;
598
+ VALUE obj, groups, ranges;
599
+
600
+ Check_Class(re, rb_cURegexp);
601
+ the_regex = UREGEX(re)->pattern;
602
+ count = uregex_groupCount(the_regex, &status);
603
+ if (U_FAILURE(status)) {
604
+ rb_raise(rb_eArgError, u_errorName(status));
605
+ }
606
+ groups = rb_ary_new2(count);
607
+ rb_iv_set(self, "@cg", groups);
608
+ for (i = 0; i <= count; i++) {
609
+ obj = icu_reg_nth_match(re, i);
610
+ rb_obj_freeze(obj);
611
+ rb_ary_store(groups, i, obj);
612
+ }
613
+
614
+ ranges = rb_ary_new2(count);
615
+ for ( i = 0; i <= count; i++){
616
+ cu_start = uregex_start(the_regex, i, &status);
617
+ cu_end = uregex_end(the_regex, i, &status);
618
+ if( cu_start == -1) rb_ary_store(ranges, i, Qnil);
619
+ else rb_ary_store(ranges, i, rb_range_new(LONG2NUM(cu_start), LONG2NUM(cu_end-1), 0));
620
+ }
621
+ rb_iv_set(self, "@ranges", ranges);
622
+ return self;
623
+ }
624
+ VALUE icu_umatch_new(re)
625
+ VALUE re;
626
+ {
627
+ return icu_umatch_init(rb_class_new_instance(0, NULL, rb_cUMatch), re);
628
+ }
629
+
630
+
631
+
632
+
633
+ void initialize_uregexp (void)
634
+ {
635
+ /* regular expressions */
636
+ rb_cURegexp = rb_define_class("URegexp", rb_cObject);
637
+ rb_define_alloc_func(rb_cURegexp, icu_reg_s_alloc);
638
+ rb_define_method(rb_cURegexp, "initialize", icu_reg_initialize_m, -1);
639
+ rb_define_method(rb_cURegexp, "to_u", icu_reg_to_u, 0);
640
+ rb_define_method(rb_cURegexp, "match", icu_reg_match, 1);
641
+ rb_define_method(rb_cURegexp, "split", icu_reg_split, 2);
642
+ rb_define_method(rb_cURegexp, "=~", icu_reg_match, 1);
643
+ rb_define_method(rb_cURegexp, "===", icu_reg_eqq, 1);
644
+
645
+ /* Enable case insensitive matching. */
646
+ rb_define_const(rb_cURegexp, "IGNORECASE", INT2FIX(UREGEX_CASE_INSENSITIVE));
647
+ /* Allow white space and comments within patterns */
648
+ rb_define_const(rb_cURegexp, "COMMENTS", INT2FIX(UREGEX_COMMENTS));
649
+ /* Control behavior of "$" and "^" If set, recognize line terminators within string, otherwise, match only at start and end of input string. */
650
+ rb_define_const(rb_cURegexp, "MULTILINE", INT2FIX(UREGEX_MULTILINE));
651
+ /* If set, '.' matches line terminators, otherwise '.' matching stops at line end. */
652
+ rb_define_const(rb_cURegexp, "DOTALL", INT2FIX(UREGEX_DOTALL));
653
+
654
+
655
+ rb_define_global_function("ure", icu_reg_from_rb_str, -1);
656
+
657
+ /**
658
+ * Document-class: UMatch
659
+ *
660
+ * Class to store information about capturing
661
+ * groups. Used in UString#sub, UString#gsub methods, as parameter to
662
+ * passed block.
663
+ */
664
+ rb_cUMatch = rb_define_class("UMatch", rb_cObject);
665
+ rb_define_method(rb_cUMatch, "[]", icu_umatch_aref, 1);
666
+ rb_define_method(rb_cUMatch, "size", icu_umatch_size, 0);
667
+ rb_define_method(rb_cUMatch, "range", icu_umatch_range, 1);
668
+
669
+ rb_define_method(rb_cRegexp, "to_u", icu_reg_from_rb_reg, 0);
670
+ rb_define_alias (rb_cRegexp, "U", "to_u");
671
+ rb_define_alias (rb_cRegexp, "ur", "to_u");
672
+
673
+ }
data/uregex.h ADDED
@@ -0,0 +1,27 @@
1
+ extern void icu_regex_free (ICURegexp *ptr);
2
+ extern VALUE icu_reg_s_alloc (VALUE klass);
3
+ extern VALUE icu_reg_initialize_m (int argc, VALUE *argv, VALUE self);
4
+ extern VALUE icu_reg_new (UChar *s, long len, int options) ;
5
+ extern VALUE icu_reg_clone (VALUE obj);
6
+ extern VALUE icu_reg_comp (VALUE str);
7
+ extern VALUE icu_reg_from_rb_reg (VALUE re);
8
+ extern VALUE icu_reg_to_u (VALUE self);
9
+ extern VALUE icu_reg_split (VALUE self, VALUE str, VALUE limit);
10
+ extern VALUE icu_reg_nth_match (VALUE re, long nth);
11
+ extern VALUE icu_reg_range (VALUE re, int nth, long *start, long *end);
12
+ extern VALUE icu_reg_match (VALUE re, VALUE str);
13
+ extern VALUE icu_reg_eqq (VALUE re, VALUE str);
14
+ extern int icu_reg_find_next (VALUE pat);
15
+ extern VALUE icu_reg_get_replacement (VALUE pat, VALUE repl_text, long prev_end);
16
+ extern VALUE icu_reg_get_prematch (VALUE pat, long prev_end);
17
+ extern VALUE icu_reg_get_tail (VALUE pat, long prev_end);
18
+ extern VALUE icu_reg_from_rb_str (int argc, VALUE *argv, VALUE obj);
19
+ extern VALUE icu_umatch_range (VALUE match, VALUE index);
20
+ extern VALUE icu_umatch_size (VALUE match);
21
+ extern VALUE icu_umatch_init (VALUE self, VALUE re);
22
+ extern VALUE icu_umatch_aref (VALUE match, VALUE idx);
23
+ extern VALUE icu_umatch_new (VALUE re);
24
+ extern long icu_group_count(VALUE re);
25
+ extern long icu_reg_search(VALUE re, VALUE str, int pos, int reverse);
26
+
27
+ extern void initialize_uregexp (void);