icu4r 0.1.3.2006.01.26
Sign up to get free protection for your applications and to get access to all the features.
- data/MIT-LICENSE +20 -0
- data/README +153 -0
- data/calendar.c +576 -0
- data/docs/FORMATTING +131 -0
- data/docs/UNICODE_REGEXPS +204 -0
- data/extconf.rb +15 -0
- data/fmt.cpp +150 -0
- data/icu4r.c +14 -0
- data/icu_common.h +45 -0
- data/samples/demo_each.rb +23 -0
- data/samples/demo_locales.rb +16 -0
- data/samples/demo_regexp.rb +11 -0
- data/samples/resbundle/appmsg/root.res +0 -0
- data/samples/resbundle/appmsg/ru.res +0 -0
- data/samples/resbundle/demo_bundle.rb +4 -0
- data/samples/resbundle/mkres.sh +4 -0
- data/samples/resbundle/root.txt +10 -0
- data/samples/resbundle/ru.txt +4 -0
- data/test/test_calendar.rb +109 -0
- data/test/test_ustring.rb +381 -0
- data/tools/doc.sh +2 -0
- data/tools/km.rb +425 -0
- data/ubundle.c +209 -0
- data/ucore_ext.c +168 -0
- data/uregex.c +673 -0
- data/uregex.h +27 -0
- data/ustring.c +3042 -0
- metadata +81 -0
data/uregex.c
ADDED
@@ -0,0 +1,673 @@
|
|
1
|
+
|
2
|
+
/**
|
3
|
+
* Document-class: URegexp
|
4
|
+
*
|
5
|
+
* See [docs/UNICODE_REGEXPS] for details of patterns.
|
6
|
+
*
|
7
|
+
*
|
8
|
+
* Replacement Text
|
9
|
+
*
|
10
|
+
* The replacement text for find-and-replace operations may contain references to
|
11
|
+
* capture-group text from the find. References are of the form $n, where n is the
|
12
|
+
* number of the capture group.
|
13
|
+
*
|
14
|
+
* Character Descriptions
|
15
|
+
* $n The text of capture group n will be substituted for $n. n must be >= 0 and not
|
16
|
+
* greater than the number of capture groups. A $ not followed by a digit has no special meaning,
|
17
|
+
* and will appear in the substitution text as itself, a $.
|
18
|
+
* \ Treat the following character as a literal, suppressing any special meaning. Backslash escaping in
|
19
|
+
* substitution text is only required for '$' and '\', but may be used on any other character without bad effects.
|
20
|
+
*
|
21
|
+
*
|
22
|
+
* Valid URegexp options are: COMMENTS, MULTILINE, DOTALL, IGNORECASE, which can be OR'ed.
|
23
|
+
*/
|
24
|
+
|
25
|
+
#include "icu_common.h"
|
26
|
+
extern VALUE rb_cURegexp;
|
27
|
+
extern VALUE rb_cUString;
|
28
|
+
extern VALUE rb_cUMatch;
|
29
|
+
VALUE icu_umatch_aref(VALUE match, VALUE idx);
|
30
|
+
VALUE icu_umatch_new (VALUE re);
|
31
|
+
extern VALUE icu_ustr_new(const UChar * ptr, long len);
|
32
|
+
extern VALUE icu_ustr_new2(const UChar * ptr);
|
33
|
+
extern void ustr_splice_units(ICUString * str, long start, long del_len, const UChar * replacement, long repl_len);
|
34
|
+
extern VALUE icu_from_rstr(int, VALUE *, VALUE);
|
35
|
+
|
36
|
+
/* --------- regular expressions */
|
37
|
+
/*
 * Free callback for URegexp objects (installed through Data_Wrap_Struct in
 * icu_reg_s_alloc).
 *
 * Closes the compiled ICU pattern, if there is one, and releases the wrapper
 * struct.  The struct is obtained with ALLOC_N, i.e. from Ruby's allocator,
 * so it is handed back with xfree() rather than the C library free().
 */
void
icu_regex_free(ICURegexp *ptr)
{
    if (ptr == NULL)
        return;

    if (ptr->pattern != NULL) {
        uregex_close(ptr->pattern);
        ptr->pattern = NULL;
    }
    xfree(ptr);
}
|
44
|
+
|
45
|
+
VALUE
|
46
|
+
icu_reg_s_alloc(klass)
|
47
|
+
VALUE klass;
|
48
|
+
{
|
49
|
+
ICURegexp *ptr = ALLOC_N(ICURegexp, 1);
|
50
|
+
ptr->pattern = 0;
|
51
|
+
return Data_Wrap_Struct(klass, 0, icu_regex_free, ptr);
|
52
|
+
}
|
53
|
+
|
54
|
+
void
|
55
|
+
icu_reg_initialize(obj, s, len, options)
|
56
|
+
VALUE obj;
|
57
|
+
const UChar *s;
|
58
|
+
long len;
|
59
|
+
int options;
|
60
|
+
{
|
61
|
+
UParseError pe;
|
62
|
+
UErrorCode status = 0;
|
63
|
+
ICURegexp *re = UREGEX(obj);
|
64
|
+
|
65
|
+
if (re->pattern)
|
66
|
+
uregex_close(re->pattern);
|
67
|
+
re->pattern = uregex_open(s, len, options, &pe, &status);
|
68
|
+
re->options = options;
|
69
|
+
|
70
|
+
if (U_FAILURE(status))
|
71
|
+
rb_raise(rb_eArgError,
|
72
|
+
"Wrong regexp: %s line %d column %d flags %d",
|
73
|
+
u_errorName(status), pe.line, pe.offset, options);
|
74
|
+
|
75
|
+
}
|
76
|
+
|
77
|
+
const UChar *
|
78
|
+
icu_reg_get_pattern(ptr, len)
|
79
|
+
ICURegexp *ptr;
|
80
|
+
int32_t *len;
|
81
|
+
{
|
82
|
+
UErrorCode error = 0;
|
83
|
+
*len = 0;
|
84
|
+
return uregex_pattern(ptr->pattern, len, &error);
|
85
|
+
}
|
86
|
+
|
87
|
+
/**
|
88
|
+
* call-seq:
|
89
|
+
* URegexp.new(str [,options])
|
90
|
+
* URegexp.new(regexp)
|
91
|
+
*
|
92
|
+
* Constructs a new regular expression from <i>pattern</i>, which can be either
|
93
|
+
* a <code>UString</code> or a <code>URegexp</code>.
|
94
|
+
* */
|
95
|
+
VALUE
|
96
|
+
icu_reg_initialize_m(argc, argv, self)
|
97
|
+
int argc;
|
98
|
+
VALUE *argv;
|
99
|
+
VALUE self;
|
100
|
+
{
|
101
|
+
const UChar *s;
|
102
|
+
int32_t len = 0;
|
103
|
+
int flags = 0;
|
104
|
+
|
105
|
+
if (argc == 0 || argc > 2) {
|
106
|
+
rb_raise(rb_eArgError, "wrong number of arguments");
|
107
|
+
}
|
108
|
+
if (CLASS_OF(argv[0]) == rb_cURegexp) {
|
109
|
+
if (argc > 1) {
|
110
|
+
rb_warn("flags ignored");
|
111
|
+
}
|
112
|
+
flags = UREGEX(argv[0])->options;
|
113
|
+
s = icu_reg_get_pattern(UREGEX(argv[0]), &len);
|
114
|
+
} else {
|
115
|
+
Check_Class(argv[0], rb_cUString);
|
116
|
+
if (argc == 2) {
|
117
|
+
if (FIXNUM_P(argv[1]))
|
118
|
+
flags = FIX2INT(argv[1]);
|
119
|
+
else if (RTEST(argv[1]))
|
120
|
+
flags = UREGEX_CASE_INSENSITIVE;
|
121
|
+
}
|
122
|
+
s = ICU_PTR(argv[0]);
|
123
|
+
len = ICU_LEN(argv[0]);
|
124
|
+
}
|
125
|
+
icu_reg_initialize(self, s, len, flags);
|
126
|
+
return self;
|
127
|
+
}
|
128
|
+
|
129
|
+
VALUE
|
130
|
+
icu_reg_new(s, len, options)
|
131
|
+
const UChar *s;
|
132
|
+
long len;
|
133
|
+
int options;
|
134
|
+
{
|
135
|
+
VALUE re = icu_reg_s_alloc(rb_cURegexp);
|
136
|
+
icu_reg_initialize(re, s, len, options);
|
137
|
+
return (VALUE) re;
|
138
|
+
}
|
139
|
+
|
140
|
+
VALUE
|
141
|
+
icu_reg_clone(obj)
|
142
|
+
VALUE obj;
|
143
|
+
{
|
144
|
+
ICURegexp *regex = UREGEX(obj);
|
145
|
+
URegularExpression *old_pattern = UREGEX(obj)->pattern;
|
146
|
+
VALUE ret ;
|
147
|
+
UErrorCode status = U_ZERO_ERROR;
|
148
|
+
URegularExpression * new_pattern = uregex_clone(regex->pattern, &status);
|
149
|
+
if(U_FAILURE(status) ){
|
150
|
+
rb_raise(rb_eArgError, u_errorName(status));
|
151
|
+
}
|
152
|
+
ret = icu_reg_s_alloc(rb_cURegexp);
|
153
|
+
regex = UREGEX(ret);
|
154
|
+
regex->pattern = old_pattern;
|
155
|
+
UREGEX(obj)->pattern = new_pattern;
|
156
|
+
return ret;
|
157
|
+
}
|
158
|
+
VALUE
|
159
|
+
icu_reg_comp(str)
|
160
|
+
VALUE str;
|
161
|
+
{
|
162
|
+
return icu_reg_new(USTRING(str)->ptr, USTRING(str)->len, 0);
|
163
|
+
}
|
164
|
+
|
165
|
+
/**
|
166
|
+
* call-seq:
|
167
|
+
* regexp.to_u => URegexp
|
168
|
+
*
|
169
|
+
* Converts Ruby Regexp to unicode URegexp, assuming it is in UTF8 encoding.
|
170
|
+
* $KCODE must be set to 'u' to work reliably
|
171
|
+
*/
|
172
|
+
VALUE icu_reg_from_rb_reg(re)
|
173
|
+
VALUE re;
|
174
|
+
{
|
175
|
+
return icu_reg_comp(icu_from_rstr(0, NULL, rb_funcall(re, rb_intern("to_s"), 0)));
|
176
|
+
}
|
177
|
+
|
178
|
+
/**
|
179
|
+
* call-seq:
|
180
|
+
* uregex.to_u
|
181
|
+
*
|
182
|
+
* Returns UString of this URegexp pattern.
|
183
|
+
* */
|
184
|
+
VALUE
|
185
|
+
icu_reg_to_u(self)
|
186
|
+
VALUE self;
|
187
|
+
{
|
188
|
+
int32_t len = 0;
|
189
|
+
const UChar *s = icu_reg_get_pattern(UREGEX(self), &len);
|
190
|
+
return icu_ustr_new(s, len);
|
191
|
+
}
|
192
|
+
|
193
|
+
/**
|
194
|
+
* call-seq:
|
195
|
+
* uregex.split(str, limit)
|
196
|
+
*
|
197
|
+
* Divides <i>str</i> into substrings based on a regexp pattern,
|
198
|
+
* returning an array of these substrings. <i>str</i> is divided where the
|
199
|
+
* pattern matches.
|
200
|
+
* */
|
201
|
+
VALUE
|
202
|
+
icu_reg_split(self, str, limit)
|
203
|
+
VALUE self,
|
204
|
+
str,
|
205
|
+
limit;
|
206
|
+
{
|
207
|
+
VALUE splits;
|
208
|
+
URegularExpression *theRegEx = UREGEX(self)->pattern;
|
209
|
+
UErrorCode error = 0;
|
210
|
+
UChar * dest_buf, **dest_fields;
|
211
|
+
int32_t limt, req_cap, total, i;
|
212
|
+
Check_Class(str, rb_cUString);
|
213
|
+
if (limit != Qnil)
|
214
|
+
Check_Type(limit, T_FIXNUM);
|
215
|
+
splits = rb_ary_new();
|
216
|
+
dest_buf = ALLOCA_N(UChar, USTRING(str)->len * 2);
|
217
|
+
dest_fields = ALLOCA_N(UChar *, USTRING(str)->len);
|
218
|
+
limt = limit == Qnil ? USTRING(str)->len : FIX2INT(limit);
|
219
|
+
uregex_setText(theRegEx, USTRING(str)->ptr, USTRING(str)->len, &error);
|
220
|
+
if (U_FAILURE(error))
|
221
|
+
rb_raise(rb_eArgError, u_errorName(error));
|
222
|
+
req_cap = 0;
|
223
|
+
total =
|
224
|
+
uregex_split(theRegEx, dest_buf, USTRING(str)->len * 2, &req_cap,
|
225
|
+
dest_fields, limt, &error);
|
226
|
+
if (U_FAILURE(error))
|
227
|
+
rb_raise(rb_eArgError, u_errorName(error));
|
228
|
+
|
229
|
+
for (i = 0; i < total; i++)
|
230
|
+
rb_ary_push(splits, icu_ustr_new2(dest_fields[i]));
|
231
|
+
return splits;
|
232
|
+
}
|
233
|
+
|
234
|
+
long
|
235
|
+
icu_reg_search(re, str, pos, reverse)
|
236
|
+
VALUE re,
|
237
|
+
str;
|
238
|
+
long pos,
|
239
|
+
reverse;
|
240
|
+
{
|
241
|
+
UErrorCode error = 0;
|
242
|
+
long cur_pos = 0;
|
243
|
+
long start,
|
244
|
+
last;
|
245
|
+
|
246
|
+
if (!reverse) {
|
247
|
+
start = pos;
|
248
|
+
} else {
|
249
|
+
start = 0;
|
250
|
+
}
|
251
|
+
|
252
|
+
uregex_setText(UREGEX(re)->pattern, USTRING(str)->ptr,
|
253
|
+
USTRING(str)->len, &error);
|
254
|
+
if (U_FAILURE(error))
|
255
|
+
rb_raise(rb_eArgError, u_errorName(error));
|
256
|
+
if (!uregex_find(UREGEX(re)->pattern, start, &error))
|
257
|
+
return -1;
|
258
|
+
if (U_FAILURE(error))
|
259
|
+
rb_raise(rb_eArgError, u_errorName(error));
|
260
|
+
cur_pos = uregex_start(UREGEX(re)->pattern, 0, &error);
|
261
|
+
if (reverse) {
|
262
|
+
while (uregex_findNext(UREGEX(re)->pattern, &error)) {
|
263
|
+
last = uregex_start(UREGEX(re)->pattern, 0, &error);
|
264
|
+
error = 0;
|
265
|
+
if (reverse && last > pos)
|
266
|
+
break;
|
267
|
+
cur_pos = last;
|
268
|
+
}
|
269
|
+
}
|
270
|
+
if (reverse && cur_pos > pos)
|
271
|
+
return -1;
|
272
|
+
return cur_pos;
|
273
|
+
}
|
274
|
+
|
275
|
+
VALUE
|
276
|
+
icu_reg_nth_match(re, nth)
|
277
|
+
VALUE re;
|
278
|
+
long nth;
|
279
|
+
{
|
280
|
+
URegularExpression *the_expr = UREGEX(re)->pattern;
|
281
|
+
UErrorCode error = 0;
|
282
|
+
long start = uregex_start(the_expr, nth, &error), end;
|
283
|
+
int32_t len;
|
284
|
+
if (U_FAILURE(error)) {
|
285
|
+
return Qnil;
|
286
|
+
}
|
287
|
+
end = uregex_end(the_expr, nth, &error);
|
288
|
+
len = 0;
|
289
|
+
return icu_ustr_new(uregex_getText(the_expr, &len, &error) + start,
|
290
|
+
end - start);
|
291
|
+
}
|
292
|
+
|
293
|
+
VALUE
|
294
|
+
icu_reg_range(re, nth, start, end)
|
295
|
+
VALUE re;
|
296
|
+
int nth;
|
297
|
+
long *start;
|
298
|
+
long *end;
|
299
|
+
{
|
300
|
+
URegularExpression *the_expr = UREGEX(re)->pattern;
|
301
|
+
UErrorCode error = 0;
|
302
|
+
*start = uregex_start(the_expr, nth, &error);
|
303
|
+
if (U_FAILURE(error))
|
304
|
+
return Qnil;
|
305
|
+
*end = uregex_end(the_expr, nth, &error);
|
306
|
+
return Qtrue;
|
307
|
+
}
|
308
|
+
|
309
|
+
/**
|
310
|
+
* call-seq:
|
311
|
+
* uregex.match(str) => matchdata or nil
|
312
|
+
* uregex =~ (str) => matchdata or nil
|
313
|
+
*
|
314
|
+
* Returns a <code>UMatch</code> object describing the match, or
|
315
|
+
* <code>nil</code> if there was no match.
|
316
|
+
*
|
317
|
+
* ure("(.)(.)(.)").match("abc".u)[2] #=> "b"
|
318
|
+
*/
|
319
|
+
VALUE
|
320
|
+
icu_reg_match(re, str)
|
321
|
+
VALUE re,
|
322
|
+
str;
|
323
|
+
{
|
324
|
+
UErrorCode error = 0;
|
325
|
+
Check_Class(str, rb_cUString);
|
326
|
+
uregex_setText(UREGEX(re)->pattern, USTRING(str)->ptr,
|
327
|
+
USTRING(str)->len, &error);
|
328
|
+
if (U_FAILURE(error))
|
329
|
+
rb_raise(rb_eArgError, u_errorName(error));
|
330
|
+
if (uregex_find(UREGEX(re)->pattern, 0, &error)) {
|
331
|
+
return icu_umatch_new(re);
|
332
|
+
}
|
333
|
+
return Qnil;
|
334
|
+
}
|
335
|
+
|
336
|
+
/**
|
337
|
+
* call-seq:
|
338
|
+
* rxp === str => true or false
|
339
|
+
*
|
340
|
+
* Case Equality---Synonym for <code>URegexp#=~</code> used in case statements.
|
341
|
+
*
|
342
|
+
* a = "HELLO".u
|
343
|
+
* case a
|
344
|
+
* when ure("^[a-z]*$"); print "Lower case\n"
|
345
|
+
* when ure("^[A-Z]*$"); print "Upper case\n"
|
346
|
+
* else; print "Mixed case\n"
|
347
|
+
* end
|
348
|
+
*
|
349
|
+
* <em>produces:</em>
|
350
|
+
*
|
351
|
+
* Upper case
|
352
|
+
*/
|
353
|
+
VALUE
|
354
|
+
icu_reg_eqq(re, str)
|
355
|
+
VALUE re,
|
356
|
+
str;
|
357
|
+
{
|
358
|
+
long start;
|
359
|
+
Check_Class(str, rb_cUString);
|
360
|
+
start = icu_reg_search(re, str, 0, 0);
|
361
|
+
return start < 0 ? Qfalse : Qtrue;
|
362
|
+
}
|
363
|
+
|
364
|
+
|
365
|
+
long
|
366
|
+
icu_group_count(re)
|
367
|
+
VALUE re;
|
368
|
+
{
|
369
|
+
UErrorCode error = 0;
|
370
|
+
return uregex_groupCount(UREGEX(re)->pattern, &error);
|
371
|
+
}
|
372
|
+
|
373
|
+
int
|
374
|
+
icu_reg_find_next(pat)
|
375
|
+
VALUE pat;
|
376
|
+
{
|
377
|
+
URegularExpression *the_expr = UREGEX(pat)->pattern;
|
378
|
+
UErrorCode error = 0;
|
379
|
+
return uregex_findNext(the_expr, &error);
|
380
|
+
}
|
381
|
+
|
382
|
+
static const UChar BACKSLASH = 0x5c;
|
383
|
+
static const UChar DOLLARSIGN = 0x24;
|
384
|
+
|
385
|
+
VALUE
|
386
|
+
icu_reg_get_replacement(pat, repl_text, prev_end)
|
387
|
+
VALUE pat,
|
388
|
+
repl_text;
|
389
|
+
long prev_end;
|
390
|
+
{
|
391
|
+
UErrorCode error = U_ZERO_ERROR;
|
392
|
+
URegularExpression *the_expr = UREGEX(pat)->pattern;
|
393
|
+
VALUE ret = icu_ustr_new(0, 0);
|
394
|
+
|
395
|
+
/* scan the replacement text, looking for substitutions ($n) and \escapes. */
|
396
|
+
int32_t replIdx = 0;
|
397
|
+
int32_t replacementLength = ICU_LEN(repl_text);
|
398
|
+
UChar *replacementText = ICU_PTR(repl_text);
|
399
|
+
int32_t numDigits = 0;
|
400
|
+
int32_t groupNum = 0, g_start, g_end;
|
401
|
+
UChar32 digitC;
|
402
|
+
int32_t len;
|
403
|
+
/* following code is rewritten version of code found */
|
404
|
+
/* in ICU sources : i18n/regexp.cpp */
|
405
|
+
while (replIdx < replacementLength) {
|
406
|
+
UChar c = replacementText[replIdx];
|
407
|
+
replIdx++;
|
408
|
+
if (c != DOLLARSIGN && c != BACKSLASH) {
|
409
|
+
/* Common case, no substitution, no escaping, */
|
410
|
+
/* just copy the char to the dest buf. */
|
411
|
+
ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, replacementText+replIdx-1, 1);
|
412
|
+
continue;
|
413
|
+
}
|
414
|
+
|
415
|
+
if (c == BACKSLASH) {
|
416
|
+
/* Backslash Escape. Copy the following char out without further checks. */
|
417
|
+
/* Note: Surrogate pairs don't need any special handling */
|
418
|
+
/* The second half wont be a '$' or a '\', and */
|
419
|
+
/* will move to the dest normally on the next */
|
420
|
+
/* loop iteration. */
|
421
|
+
if (replIdx >= replacementLength) {
|
422
|
+
break;
|
423
|
+
}
|
424
|
+
/* ICU4R : \uxxxx case is removed for simplicity : if (c==0x55 || c==0x75) { */
|
425
|
+
|
426
|
+
/* Plain backslash escape. Just put out the escaped character. */
|
427
|
+
ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, replacementText+replIdx, 1);
|
428
|
+
replIdx++;
|
429
|
+
continue;
|
430
|
+
}
|
431
|
+
|
432
|
+
/* We've got a $. Pick up a capture group number if one follows. */
|
433
|
+
/* Consume at most the number of digits necessary for the largest capture */
|
434
|
+
/* number that is valid for this pattern. */
|
435
|
+
numDigits = 0;
|
436
|
+
groupNum = 0;
|
437
|
+
|
438
|
+
for (;;) {
|
439
|
+
if (replIdx >= replacementLength) {
|
440
|
+
break;
|
441
|
+
}
|
442
|
+
U16_GET(replacementText, 0, replIdx, replacementLength, digitC); /* care surrogates */
|
443
|
+
if (u_isdigit(digitC) == FALSE) {
|
444
|
+
break;
|
445
|
+
}
|
446
|
+
|
447
|
+
U16_FWD_1(replacementText, replIdx, replacementLength); /* care surrogates */
|
448
|
+
groupNum=groupNum*10 + u_charDigitValue(digitC);
|
449
|
+
numDigits++;
|
450
|
+
if (numDigits >= 3) { /* limit 999 groups */
|
451
|
+
break;
|
452
|
+
}
|
453
|
+
}
|
454
|
+
|
455
|
+
if (numDigits == 0) {
|
456
|
+
/* The $ didn't introduce a group number at all. */
|
457
|
+
/* Treat it as just part of the substitution text. */
|
458
|
+
ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, &DOLLARSIGN, 1);
|
459
|
+
continue;
|
460
|
+
}
|
461
|
+
|
462
|
+
/* Finally, append the capture group data to the destination. */
|
463
|
+
error = U_ZERO_ERROR;
|
464
|
+
g_start = uregex_start(the_expr, groupNum, &error);
|
465
|
+
g_end = uregex_end (the_expr, groupNum, &error);
|
466
|
+
if(U_SUCCESS(error) && g_start != -1 ) {
|
467
|
+
ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0,
|
468
|
+
uregex_getText(the_expr, &len, &error) + g_start, g_end - g_start);
|
469
|
+
}
|
470
|
+
|
471
|
+
}
|
472
|
+
return ret;
|
473
|
+
}
|
474
|
+
|
475
|
+
VALUE
|
476
|
+
icu_reg_get_prematch(pat, prev_end)
|
477
|
+
VALUE pat;
|
478
|
+
long prev_end;
|
479
|
+
{
|
480
|
+
URegularExpression *the_expr = UREGEX(pat)->pattern;
|
481
|
+
UErrorCode error = 0;
|
482
|
+
int32_t len = 0;
|
483
|
+
int32_t cur_start = uregex_start(the_expr, 0, &error);
|
484
|
+
const UChar *temp = uregex_getText(the_expr, &len, &error);
|
485
|
+
VALUE pm =
|
486
|
+
icu_ustr_new(temp + prev_end, cur_start - prev_end);
|
487
|
+
return pm;
|
488
|
+
}
|
489
|
+
|
490
|
+
VALUE
|
491
|
+
icu_reg_get_tail(pat, prev_end)
|
492
|
+
VALUE pat;
|
493
|
+
long prev_end;
|
494
|
+
{
|
495
|
+
UErrorCode error = U_ZERO_ERROR;
|
496
|
+
URegularExpression *the_expr = UREGEX(pat)->pattern;
|
497
|
+
int32_t len = 0;
|
498
|
+
const UChar *temp = uregex_getText(the_expr, &len, &error);
|
499
|
+
VALUE pm = icu_ustr_new(temp + prev_end, len - prev_end);
|
500
|
+
return pm;
|
501
|
+
}
|
502
|
+
|
503
|
+
/**
|
504
|
+
* call-seq:
|
505
|
+
* ure(str[, options]) => URegexp
|
506
|
+
*
|
507
|
+
* Creates URegexp object from UString.
|
508
|
+
* */
|
509
|
+
VALUE
|
510
|
+
icu_reg_from_rb_str(argc, argv, obj)
|
511
|
+
int argc;
|
512
|
+
VALUE *argv;
|
513
|
+
VALUE obj;
|
514
|
+
{
|
515
|
+
VALUE pat,
|
516
|
+
options = Qnil;
|
517
|
+
int reg_opts = 0;
|
518
|
+
if (rb_scan_args(argc, argv, "11", &pat, &options) == 1) {
|
519
|
+
reg_opts = 0;
|
520
|
+
} else {
|
521
|
+
if (options != Qnil) {
|
522
|
+
Check_Type(options, T_FIXNUM);
|
523
|
+
reg_opts = FIX2INT(options);
|
524
|
+
}
|
525
|
+
}
|
526
|
+
if (TYPE(pat) == T_STRING)
|
527
|
+
pat = icu_from_rstr(0, NULL, pat);
|
528
|
+
if (CLASS_OF(pat) != rb_cUString)
|
529
|
+
rb_raise(rb_eArgError, "Expected String or UString");
|
530
|
+
return icu_reg_new(ICU_PTR(pat), ICU_LEN(pat), reg_opts);
|
531
|
+
}
|
532
|
+
|
533
|
+
/**
|
534
|
+
* call-seq:
|
535
|
+
* umatch[idx] => string
|
536
|
+
*
|
537
|
+
* Returns capture group. Group 0 is for full match.
|
538
|
+
* */
|
539
|
+
VALUE
|
540
|
+
icu_umatch_aref(match, index)
|
541
|
+
VALUE match,
|
542
|
+
index;
|
543
|
+
{
|
544
|
+
long idx;
|
545
|
+
VALUE cg;
|
546
|
+
Check_Type(index, T_FIXNUM);
|
547
|
+
idx = FIX2LONG(index);
|
548
|
+
cg = rb_iv_get(match, "@cg");
|
549
|
+
return rb_ary_entry(cg, idx);
|
550
|
+
}
|
551
|
+
|
552
|
+
/**
|
553
|
+
* call-seq:
|
554
|
+
* umatch.range(idx) => range
|
555
|
+
*
|
556
|
+
* Returns range (start, end) of capture group. Group 0 is for full match.
|
557
|
+
*
|
558
|
+
* NOTE: this method returns <b>code unit</b> indexes. To convert this range
|
559
|
+
* to <b>code point</b> range use UString#conv_unit_range. If your chars don't
|
560
|
+
* require surrogate UTF16 pairs, range will be the same.
|
561
|
+
* */
|
562
|
+
VALUE
|
563
|
+
icu_umatch_range(match, index)
|
564
|
+
VALUE match,
|
565
|
+
index;
|
566
|
+
{
|
567
|
+
long idx;
|
568
|
+
VALUE cg;
|
569
|
+
Check_Type(index, T_FIXNUM);
|
570
|
+
idx = FIX2LONG(index);
|
571
|
+
cg = rb_iv_get(match, "@ranges");
|
572
|
+
return rb_ary_entry(cg, idx);
|
573
|
+
}
|
574
|
+
|
575
|
+
|
576
|
+
/**
|
577
|
+
* call-seq:
|
578
|
+
* umatch.size => fixnum
|
579
|
+
*
|
580
|
+
* Returns number of capture groups.
|
581
|
+
* */
|
582
|
+
VALUE
|
583
|
+
icu_umatch_size(match)
|
584
|
+
VALUE match;
|
585
|
+
{
|
586
|
+
VALUE cg = rb_iv_get(match, "@cg");
|
587
|
+
return LONG2NUM(RARRAY(cg)->len - 1);
|
588
|
+
}
|
589
|
+
|
590
|
+
|
591
|
+
VALUE
|
592
|
+
icu_umatch_init( self, re)
|
593
|
+
VALUE self, re;
|
594
|
+
{
|
595
|
+
UErrorCode status = U_ZERO_ERROR;
|
596
|
+
long count, i, cu_start, cu_end;
|
597
|
+
URegularExpression * the_regex;
|
598
|
+
VALUE obj, groups, ranges;
|
599
|
+
|
600
|
+
Check_Class(re, rb_cURegexp);
|
601
|
+
the_regex = UREGEX(re)->pattern;
|
602
|
+
count = uregex_groupCount(the_regex, &status);
|
603
|
+
if (U_FAILURE(status)) {
|
604
|
+
rb_raise(rb_eArgError, u_errorName(status));
|
605
|
+
}
|
606
|
+
groups = rb_ary_new2(count);
|
607
|
+
rb_iv_set(self, "@cg", groups);
|
608
|
+
for (i = 0; i <= count; i++) {
|
609
|
+
obj = icu_reg_nth_match(re, i);
|
610
|
+
rb_obj_freeze(obj);
|
611
|
+
rb_ary_store(groups, i, obj);
|
612
|
+
}
|
613
|
+
|
614
|
+
ranges = rb_ary_new2(count);
|
615
|
+
for ( i = 0; i <= count; i++){
|
616
|
+
cu_start = uregex_start(the_regex, i, &status);
|
617
|
+
cu_end = uregex_end(the_regex, i, &status);
|
618
|
+
if( cu_start == -1) rb_ary_store(ranges, i, Qnil);
|
619
|
+
else rb_ary_store(ranges, i, rb_range_new(LONG2NUM(cu_start), LONG2NUM(cu_end-1), 0));
|
620
|
+
}
|
621
|
+
rb_iv_set(self, "@ranges", ranges);
|
622
|
+
return self;
|
623
|
+
}
|
624
|
+
/*
 * Create a UMatch instance and populate it from the current match state
 * of the URegexp re.
 */
VALUE
icu_umatch_new(VALUE re)
{
    VALUE match = rb_class_new_instance(0, NULL, rb_cUMatch);

    return icu_umatch_init(match, re);
}
|
629
|
+
|
630
|
+
|
631
|
+
|
632
|
+
|
633
|
+
void initialize_uregexp (void)
|
634
|
+
{
|
635
|
+
/* regular expressions */
|
636
|
+
rb_cURegexp = rb_define_class("URegexp", rb_cObject);
|
637
|
+
rb_define_alloc_func(rb_cURegexp, icu_reg_s_alloc);
|
638
|
+
rb_define_method(rb_cURegexp, "initialize", icu_reg_initialize_m, -1);
|
639
|
+
rb_define_method(rb_cURegexp, "to_u", icu_reg_to_u, 0);
|
640
|
+
rb_define_method(rb_cURegexp, "match", icu_reg_match, 1);
|
641
|
+
rb_define_method(rb_cURegexp, "split", icu_reg_split, 2);
|
642
|
+
rb_define_method(rb_cURegexp, "=~", icu_reg_match, 1);
|
643
|
+
rb_define_method(rb_cURegexp, "===", icu_reg_eqq, 1);
|
644
|
+
|
645
|
+
/* Enable case insensitive matching. */
|
646
|
+
rb_define_const(rb_cURegexp, "IGNORECASE", INT2FIX(UREGEX_CASE_INSENSITIVE));
|
647
|
+
/* Allow white space and comments within patterns */
|
648
|
+
rb_define_const(rb_cURegexp, "COMMENTS", INT2FIX(UREGEX_COMMENTS));
|
649
|
+
/* Control behavior of "$" and "^" If set, recognize line terminators within string, otherwise, match only at start and end of input string. */
|
650
|
+
rb_define_const(rb_cURegexp, "MULTILINE", INT2FIX(UREGEX_MULTILINE));
|
651
|
+
/* If set, '.' matches line terminators, otherwise '.' matching stops at line end. */
|
652
|
+
rb_define_const(rb_cURegexp, "DOTALL", INT2FIX(UREGEX_DOTALL));
|
653
|
+
|
654
|
+
|
655
|
+
rb_define_global_function("ure", icu_reg_from_rb_str, -1);
|
656
|
+
|
657
|
+
/**
|
658
|
+
* Document-class: UMatch
|
659
|
+
*
|
660
|
+
* Class to store information about capturing
|
661
|
+
* groups. Used in UString#sub, UString#gsub methods, as parameter to
|
662
|
+
* passed block.
|
663
|
+
*/
|
664
|
+
rb_cUMatch = rb_define_class("UMatch", rb_cObject);
|
665
|
+
rb_define_method(rb_cUMatch, "[]", icu_umatch_aref, 1);
|
666
|
+
rb_define_method(rb_cUMatch, "size", icu_umatch_size, 0);
|
667
|
+
rb_define_method(rb_cUMatch, "range", icu_umatch_range, 1);
|
668
|
+
|
669
|
+
rb_define_method(rb_cRegexp, "to_u", icu_reg_from_rb_reg, 0);
|
670
|
+
rb_define_alias (rb_cRegexp, "U", "to_u");
|
671
|
+
rb_define_alias (rb_cRegexp, "ur", "to_u");
|
672
|
+
|
673
|
+
}
|
data/uregex.h
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
extern void icu_regex_free (ICURegexp *ptr);
|
2
|
+
extern VALUE icu_reg_s_alloc (VALUE klass);
|
3
|
+
extern VALUE icu_reg_initialize_m (int argc, VALUE *argv, VALUE self);
|
4
|
+
extern VALUE icu_reg_new (UChar *s, long len, int options) ;
|
5
|
+
extern VALUE icu_reg_clone (VALUE obj);
|
6
|
+
extern VALUE icu_reg_comp (VALUE str);
|
7
|
+
extern VALUE icu_reg_from_rb_reg (VALUE re);
|
8
|
+
extern VALUE icu_reg_to_u (VALUE self);
|
9
|
+
extern VALUE icu_reg_split (VALUE self, VALUE str, VALUE limit);
|
10
|
+
extern VALUE icu_reg_nth_match (VALUE re, long nth);
|
11
|
+
extern VALUE icu_reg_range (VALUE re, int nth, long *start, long *end);
|
12
|
+
extern VALUE icu_reg_match (VALUE re, VALUE str);
|
13
|
+
extern VALUE icu_reg_eqq (VALUE re, VALUE str);
|
14
|
+
extern int icu_reg_find_next (VALUE pat);
|
15
|
+
extern VALUE icu_reg_get_replacement (VALUE pat, VALUE repl_text, long prev_end);
|
16
|
+
extern VALUE icu_reg_get_prematch (VALUE pat, long prev_end);
|
17
|
+
extern VALUE icu_reg_get_tail (VALUE pat, long prev_end);
|
18
|
+
extern VALUE icu_reg_from_rb_str (int argc, VALUE *argv, VALUE obj);
|
19
|
+
extern VALUE icu_umatch_range (VALUE match, VALUE index);
|
20
|
+
extern VALUE icu_umatch_size (VALUE match);
|
21
|
+
extern VALUE icu_umatch_init (VALUE self, VALUE re);
|
22
|
+
extern VALUE icu_umatch_aref (VALUE match, VALUE idx);
|
23
|
+
extern VALUE icu_umatch_new (VALUE re);
|
24
|
+
extern long icu_group_count(VALUE re);
|
25
|
+
extern long icu_reg_search(VALUE re, VALUE str, int pos, int reverse);
|
26
|
+
|
27
|
+
extern void initialize_uregexp (void);
|