icu4r 0.1.3.2006.01.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +20 -0
- data/README +153 -0
- data/calendar.c +576 -0
- data/docs/FORMATTING +131 -0
- data/docs/UNICODE_REGEXPS +204 -0
- data/extconf.rb +15 -0
- data/fmt.cpp +150 -0
- data/icu4r.c +14 -0
- data/icu_common.h +45 -0
- data/samples/demo_each.rb +23 -0
- data/samples/demo_locales.rb +16 -0
- data/samples/demo_regexp.rb +11 -0
- data/samples/resbundle/appmsg/root.res +0 -0
- data/samples/resbundle/appmsg/ru.res +0 -0
- data/samples/resbundle/demo_bundle.rb +4 -0
- data/samples/resbundle/mkres.sh +4 -0
- data/samples/resbundle/root.txt +10 -0
- data/samples/resbundle/ru.txt +4 -0
- data/test/test_calendar.rb +109 -0
- data/test/test_ustring.rb +381 -0
- data/tools/doc.sh +2 -0
- data/tools/km.rb +425 -0
- data/ubundle.c +209 -0
- data/ucore_ext.c +168 -0
- data/uregex.c +673 -0
- data/uregex.h +27 -0
- data/ustring.c +3042 -0
- metadata +81 -0
data/uregex.c
ADDED
@@ -0,0 +1,673 @@
|
|
1
|
+
|
2
|
+
/**
|
3
|
+
* Document-class: URegexp
|
4
|
+
*
|
5
|
+
* See [docs/UNICODE_REGEXPS] for details of patterns.
|
6
|
+
*
|
7
|
+
*
|
8
|
+
* Replacement Text
|
9
|
+
*
|
10
|
+
* The replacement text for find-and-replace operations may contain references to
|
11
|
+
* capture-group text from the find. References are of the form $n, where n is the
|
12
|
+
* number of the capture group.
|
13
|
+
*
|
14
|
+
* Character Descriptions
|
15
|
+
* $n The text of capture group n will be substituted for $n. n must be >= 0 and not
|
16
|
+
* greater than the number of capture groups. A $ not followed by a digit has no special meaning,
|
17
|
+
* and will appear in the substitution text as itself, a $.
|
18
|
+
* \ Treat the following character as a literal, suppressing any special meaning. Backslash escaping in
|
19
|
+
* substitution text is only required for '$' and '\', but may be used on any other character without bad effects.
|
20
|
+
*
|
21
|
+
*
|
22
|
+
* Valid URegexp options are: COMMENTS, MULTILINE, DOTALL, IGNORECASE, which can be OR'ed.
|
23
|
+
*/
|
24
|
+
|
25
|
+
#include "icu_common.h"
|
26
|
+
extern VALUE rb_cURegexp;
|
27
|
+
extern VALUE rb_cUString;
|
28
|
+
extern VALUE rb_cUMatch;
|
29
|
+
VALUE icu_umatch_aref(VALUE match, VALUE idx);
|
30
|
+
VALUE icu_umatch_new (VALUE re);
|
31
|
+
extern VALUE icu_ustr_new(const UChar * ptr, long len);
|
32
|
+
extern VALUE icu_ustr_new2(const UChar * ptr);
|
33
|
+
extern void ustr_splice_units(ICUString * str, long start, long del_len, const UChar * replacement, long repl_len);
|
34
|
+
extern VALUE icu_from_rstr(int, VALUE *, VALUE);
|
35
|
+
|
36
|
+
/* --------- regular expressions */
|
37
|
+
/*
 * Free callback for a wrapped ICURegexp.
 *
 * Closes the compiled ICU pattern, if one was ever opened, and then releases
 * the wrapper struct itself. A NULL ptr is tolerated and ignored.
 */
void icu_regex_free(ICURegexp *ptr)
{
    if (ptr == NULL)
        return;

    if (ptr->pattern) {
        uregex_close(ptr->pattern);
        ptr->pattern = 0;
    }

    /* The struct is obtained with ALLOC_N, so it has to go back through
     * Ruby's allocator (xfree) rather than the C library's free(). */
    xfree(ptr);
}
|
44
|
+
|
45
|
+
VALUE
|
46
|
+
icu_reg_s_alloc(klass)
|
47
|
+
VALUE klass;
|
48
|
+
{
|
49
|
+
ICURegexp *ptr = ALLOC_N(ICURegexp, 1);
|
50
|
+
ptr->pattern = 0;
|
51
|
+
return Data_Wrap_Struct(klass, 0, icu_regex_free, ptr);
|
52
|
+
}
|
53
|
+
|
54
|
+
void
|
55
|
+
icu_reg_initialize(obj, s, len, options)
|
56
|
+
VALUE obj;
|
57
|
+
const UChar *s;
|
58
|
+
long len;
|
59
|
+
int options;
|
60
|
+
{
|
61
|
+
UParseError pe;
|
62
|
+
UErrorCode status = 0;
|
63
|
+
ICURegexp *re = UREGEX(obj);
|
64
|
+
|
65
|
+
if (re->pattern)
|
66
|
+
uregex_close(re->pattern);
|
67
|
+
re->pattern = uregex_open(s, len, options, &pe, &status);
|
68
|
+
re->options = options;
|
69
|
+
|
70
|
+
if (U_FAILURE(status))
|
71
|
+
rb_raise(rb_eArgError,
|
72
|
+
"Wrong regexp: %s line %d column %d flags %d",
|
73
|
+
u_errorName(status), pe.line, pe.offset, options);
|
74
|
+
|
75
|
+
}
|
76
|
+
|
77
|
+
const UChar *
|
78
|
+
icu_reg_get_pattern(ptr, len)
|
79
|
+
ICURegexp *ptr;
|
80
|
+
int32_t *len;
|
81
|
+
{
|
82
|
+
UErrorCode error = 0;
|
83
|
+
*len = 0;
|
84
|
+
return uregex_pattern(ptr->pattern, len, &error);
|
85
|
+
}
|
86
|
+
|
87
|
+
/**
|
88
|
+
* call-seq:
|
89
|
+
* URegexp.new(str [,options])
|
90
|
+
* URegexp.new(regexp)
|
91
|
+
*
|
92
|
+
* Constructs a new regular expression from <i>pattern</i>, which can be either
|
93
|
+
* a <code>UString</code> or a <code>URegexp</code>.
|
94
|
+
* */
|
95
|
+
VALUE
|
96
|
+
icu_reg_initialize_m(argc, argv, self)
|
97
|
+
int argc;
|
98
|
+
VALUE *argv;
|
99
|
+
VALUE self;
|
100
|
+
{
|
101
|
+
const UChar *s;
|
102
|
+
int32_t len = 0;
|
103
|
+
int flags = 0;
|
104
|
+
|
105
|
+
if (argc == 0 || argc > 2) {
|
106
|
+
rb_raise(rb_eArgError, "wrong number of arguments");
|
107
|
+
}
|
108
|
+
if (CLASS_OF(argv[0]) == rb_cURegexp) {
|
109
|
+
if (argc > 1) {
|
110
|
+
rb_warn("flags ignored");
|
111
|
+
}
|
112
|
+
flags = UREGEX(argv[0])->options;
|
113
|
+
s = icu_reg_get_pattern(UREGEX(argv[0]), &len);
|
114
|
+
} else {
|
115
|
+
Check_Class(argv[0], rb_cUString);
|
116
|
+
if (argc == 2) {
|
117
|
+
if (FIXNUM_P(argv[1]))
|
118
|
+
flags = FIX2INT(argv[1]);
|
119
|
+
else if (RTEST(argv[1]))
|
120
|
+
flags = UREGEX_CASE_INSENSITIVE;
|
121
|
+
}
|
122
|
+
s = ICU_PTR(argv[0]);
|
123
|
+
len = ICU_LEN(argv[0]);
|
124
|
+
}
|
125
|
+
icu_reg_initialize(self, s, len, flags);
|
126
|
+
return self;
|
127
|
+
}
|
128
|
+
|
129
|
+
VALUE
|
130
|
+
icu_reg_new(s, len, options)
|
131
|
+
const UChar *s;
|
132
|
+
long len;
|
133
|
+
int options;
|
134
|
+
{
|
135
|
+
VALUE re = icu_reg_s_alloc(rb_cURegexp);
|
136
|
+
icu_reg_initialize(re, s, len, options);
|
137
|
+
return (VALUE) re;
|
138
|
+
}
|
139
|
+
|
140
|
+
VALUE
|
141
|
+
icu_reg_clone(obj)
|
142
|
+
VALUE obj;
|
143
|
+
{
|
144
|
+
ICURegexp *regex = UREGEX(obj);
|
145
|
+
URegularExpression *old_pattern = UREGEX(obj)->pattern;
|
146
|
+
VALUE ret ;
|
147
|
+
UErrorCode status = U_ZERO_ERROR;
|
148
|
+
URegularExpression * new_pattern = uregex_clone(regex->pattern, &status);
|
149
|
+
if(U_FAILURE(status) ){
|
150
|
+
rb_raise(rb_eArgError, u_errorName(status));
|
151
|
+
}
|
152
|
+
ret = icu_reg_s_alloc(rb_cURegexp);
|
153
|
+
regex = UREGEX(ret);
|
154
|
+
regex->pattern = old_pattern;
|
155
|
+
UREGEX(obj)->pattern = new_pattern;
|
156
|
+
return ret;
|
157
|
+
}
|
158
|
+
VALUE
|
159
|
+
icu_reg_comp(str)
|
160
|
+
VALUE str;
|
161
|
+
{
|
162
|
+
return icu_reg_new(USTRING(str)->ptr, USTRING(str)->len, 0);
|
163
|
+
}
|
164
|
+
|
165
|
+
/**
|
166
|
+
* call-seq:
|
167
|
+
* regexp.to_u => URegexp
|
168
|
+
*
|
169
|
+
* Converts Ruby Regexp to unicode URegexp, assuming it is in UTF8 encoding.
|
170
|
+
* $KCODE must be set to 'u' to work reliably
|
171
|
+
*/
|
172
|
+
VALUE icu_reg_from_rb_reg(re)
|
173
|
+
VALUE re;
|
174
|
+
{
|
175
|
+
return icu_reg_comp(icu_from_rstr(0, NULL, rb_funcall(re, rb_intern("to_s"), 0)));
|
176
|
+
}
|
177
|
+
|
178
|
+
/**
|
179
|
+
* call-seq:
|
180
|
+
* uregex.to_u
|
181
|
+
*
|
182
|
+
* Returns UString of this URegexp pattern.
|
183
|
+
* */
|
184
|
+
VALUE
|
185
|
+
icu_reg_to_u(self)
|
186
|
+
VALUE self;
|
187
|
+
{
|
188
|
+
int32_t len = 0;
|
189
|
+
const UChar *s = icu_reg_get_pattern(UREGEX(self), &len);
|
190
|
+
return icu_ustr_new(s, len);
|
191
|
+
}
|
192
|
+
|
193
|
+
/**
|
194
|
+
* call-seq:
|
195
|
+
* uregex.split(str, limit)
|
196
|
+
*
|
197
|
+
* Divides <i>str</i> into substrings based on a regexp pattern,
|
198
|
+
* returning an array of these substrings. <i>str</i> is divided where the
|
199
|
+
* pattern matches.
|
200
|
+
* */
|
201
|
+
VALUE
|
202
|
+
icu_reg_split(self, str, limit)
|
203
|
+
VALUE self,
|
204
|
+
str,
|
205
|
+
limit;
|
206
|
+
{
|
207
|
+
VALUE splits;
|
208
|
+
URegularExpression *theRegEx = UREGEX(self)->pattern;
|
209
|
+
UErrorCode error = 0;
|
210
|
+
UChar * dest_buf, **dest_fields;
|
211
|
+
int32_t limt, req_cap, total, i;
|
212
|
+
Check_Class(str, rb_cUString);
|
213
|
+
if (limit != Qnil)
|
214
|
+
Check_Type(limit, T_FIXNUM);
|
215
|
+
splits = rb_ary_new();
|
216
|
+
dest_buf = ALLOCA_N(UChar, USTRING(str)->len * 2);
|
217
|
+
dest_fields = ALLOCA_N(UChar *, USTRING(str)->len);
|
218
|
+
limt = limit == Qnil ? USTRING(str)->len : FIX2INT(limit);
|
219
|
+
uregex_setText(theRegEx, USTRING(str)->ptr, USTRING(str)->len, &error);
|
220
|
+
if (U_FAILURE(error))
|
221
|
+
rb_raise(rb_eArgError, u_errorName(error));
|
222
|
+
req_cap = 0;
|
223
|
+
total =
|
224
|
+
uregex_split(theRegEx, dest_buf, USTRING(str)->len * 2, &req_cap,
|
225
|
+
dest_fields, limt, &error);
|
226
|
+
if (U_FAILURE(error))
|
227
|
+
rb_raise(rb_eArgError, u_errorName(error));
|
228
|
+
|
229
|
+
for (i = 0; i < total; i++)
|
230
|
+
rb_ary_push(splits, icu_ustr_new2(dest_fields[i]));
|
231
|
+
return splits;
|
232
|
+
}
|
233
|
+
|
234
|
+
long
|
235
|
+
icu_reg_search(re, str, pos, reverse)
|
236
|
+
VALUE re,
|
237
|
+
str;
|
238
|
+
long pos,
|
239
|
+
reverse;
|
240
|
+
{
|
241
|
+
UErrorCode error = 0;
|
242
|
+
long cur_pos = 0;
|
243
|
+
long start,
|
244
|
+
last;
|
245
|
+
|
246
|
+
if (!reverse) {
|
247
|
+
start = pos;
|
248
|
+
} else {
|
249
|
+
start = 0;
|
250
|
+
}
|
251
|
+
|
252
|
+
uregex_setText(UREGEX(re)->pattern, USTRING(str)->ptr,
|
253
|
+
USTRING(str)->len, &error);
|
254
|
+
if (U_FAILURE(error))
|
255
|
+
rb_raise(rb_eArgError, u_errorName(error));
|
256
|
+
if (!uregex_find(UREGEX(re)->pattern, start, &error))
|
257
|
+
return -1;
|
258
|
+
if (U_FAILURE(error))
|
259
|
+
rb_raise(rb_eArgError, u_errorName(error));
|
260
|
+
cur_pos = uregex_start(UREGEX(re)->pattern, 0, &error);
|
261
|
+
if (reverse) {
|
262
|
+
while (uregex_findNext(UREGEX(re)->pattern, &error)) {
|
263
|
+
last = uregex_start(UREGEX(re)->pattern, 0, &error);
|
264
|
+
error = 0;
|
265
|
+
if (reverse && last > pos)
|
266
|
+
break;
|
267
|
+
cur_pos = last;
|
268
|
+
}
|
269
|
+
}
|
270
|
+
if (reverse && cur_pos > pos)
|
271
|
+
return -1;
|
272
|
+
return cur_pos;
|
273
|
+
}
|
274
|
+
|
275
|
+
VALUE
|
276
|
+
icu_reg_nth_match(re, nth)
|
277
|
+
VALUE re;
|
278
|
+
long nth;
|
279
|
+
{
|
280
|
+
URegularExpression *the_expr = UREGEX(re)->pattern;
|
281
|
+
UErrorCode error = 0;
|
282
|
+
long start = uregex_start(the_expr, nth, &error), end;
|
283
|
+
int32_t len;
|
284
|
+
if (U_FAILURE(error)) {
|
285
|
+
return Qnil;
|
286
|
+
}
|
287
|
+
end = uregex_end(the_expr, nth, &error);
|
288
|
+
len = 0;
|
289
|
+
return icu_ustr_new(uregex_getText(the_expr, &len, &error) + start,
|
290
|
+
end - start);
|
291
|
+
}
|
292
|
+
|
293
|
+
VALUE
|
294
|
+
icu_reg_range(re, nth, start, end)
|
295
|
+
VALUE re;
|
296
|
+
int nth;
|
297
|
+
long *start;
|
298
|
+
long *end;
|
299
|
+
{
|
300
|
+
URegularExpression *the_expr = UREGEX(re)->pattern;
|
301
|
+
UErrorCode error = 0;
|
302
|
+
*start = uregex_start(the_expr, nth, &error);
|
303
|
+
if (U_FAILURE(error))
|
304
|
+
return Qnil;
|
305
|
+
*end = uregex_end(the_expr, nth, &error);
|
306
|
+
return Qtrue;
|
307
|
+
}
|
308
|
+
|
309
|
+
/**
|
310
|
+
* call-seq:
|
311
|
+
* uregex.match(str) => matchdata or nil
|
312
|
+
* uregex =~ (str) => matchdata or nil
|
313
|
+
*
|
314
|
+
* Returns a <code>UMatch</code> object describing the match, or
|
315
|
+
* <code>nil</code> if there was no match.
|
316
|
+
*
|
317
|
+
* ure("(.)(.)(.)").match("abc".u)[2] #=> "b"
|
318
|
+
*/
|
319
|
+
VALUE
|
320
|
+
icu_reg_match(re, str)
|
321
|
+
VALUE re,
|
322
|
+
str;
|
323
|
+
{
|
324
|
+
UErrorCode error = 0;
|
325
|
+
Check_Class(str, rb_cUString);
|
326
|
+
uregex_setText(UREGEX(re)->pattern, USTRING(str)->ptr,
|
327
|
+
USTRING(str)->len, &error);
|
328
|
+
if (U_FAILURE(error))
|
329
|
+
rb_raise(rb_eArgError, u_errorName(error));
|
330
|
+
if (uregex_find(UREGEX(re)->pattern, 0, &error)) {
|
331
|
+
return icu_umatch_new(re);
|
332
|
+
}
|
333
|
+
return Qnil;
|
334
|
+
}
|
335
|
+
|
336
|
+
/**
|
337
|
+
* call-seq:
|
338
|
+
* rxp === str => true or false
|
339
|
+
*
|
340
|
+
* Case Equality---Synonym for <code>URegexp#=~</code> used in case statements.
|
341
|
+
*
|
342
|
+
* a = "HELLO".u
|
343
|
+
* case a
|
344
|
+
* when ure("^[a-z]*$"); print "Lower case\n"
|
345
|
+
* when ure("^[A-Z]*$"); print "Upper case\n"
|
346
|
+
* else; print "Mixed case\n"
|
347
|
+
* end
|
348
|
+
*
|
349
|
+
* <em>produces:</em>
|
350
|
+
*
|
351
|
+
* Upper case
|
352
|
+
*/
|
353
|
+
VALUE
|
354
|
+
icu_reg_eqq(re, str)
|
355
|
+
VALUE re,
|
356
|
+
str;
|
357
|
+
{
|
358
|
+
long start;
|
359
|
+
Check_Class(str, rb_cUString);
|
360
|
+
start = icu_reg_search(re, str, 0, 0);
|
361
|
+
return start < 0 ? Qfalse : Qtrue;
|
362
|
+
}
|
363
|
+
|
364
|
+
|
365
|
+
long
|
366
|
+
icu_group_count(re)
|
367
|
+
VALUE re;
|
368
|
+
{
|
369
|
+
UErrorCode error = 0;
|
370
|
+
return uregex_groupCount(UREGEX(re)->pattern, &error);
|
371
|
+
}
|
372
|
+
|
373
|
+
int
|
374
|
+
icu_reg_find_next(pat)
|
375
|
+
VALUE pat;
|
376
|
+
{
|
377
|
+
URegularExpression *the_expr = UREGEX(pat)->pattern;
|
378
|
+
UErrorCode error = 0;
|
379
|
+
return uregex_findNext(the_expr, &error);
|
380
|
+
}
|
381
|
+
|
382
|
+
static const UChar BACKSLASH = 0x5c;
|
383
|
+
static const UChar DOLLARSIGN = 0x24;
|
384
|
+
|
385
|
+
VALUE
|
386
|
+
icu_reg_get_replacement(pat, repl_text, prev_end)
|
387
|
+
VALUE pat,
|
388
|
+
repl_text;
|
389
|
+
long prev_end;
|
390
|
+
{
|
391
|
+
UErrorCode error = U_ZERO_ERROR;
|
392
|
+
URegularExpression *the_expr = UREGEX(pat)->pattern;
|
393
|
+
VALUE ret = icu_ustr_new(0, 0);
|
394
|
+
|
395
|
+
/* scan the replacement text, looking for substitutions ($n) and \escapes. */
|
396
|
+
int32_t replIdx = 0;
|
397
|
+
int32_t replacementLength = ICU_LEN(repl_text);
|
398
|
+
UChar *replacementText = ICU_PTR(repl_text);
|
399
|
+
int32_t numDigits = 0;
|
400
|
+
int32_t groupNum = 0, g_start, g_end;
|
401
|
+
UChar32 digitC;
|
402
|
+
int32_t len;
|
403
|
+
/* following code is rewritten version of code found */
|
404
|
+
/* in ICU sources : i18n/regexp.cpp */
|
405
|
+
while (replIdx < replacementLength) {
|
406
|
+
UChar c = replacementText[replIdx];
|
407
|
+
replIdx++;
|
408
|
+
if (c != DOLLARSIGN && c != BACKSLASH) {
|
409
|
+
/* Common case, no substitution, no escaping, */
|
410
|
+
/* just copy the char to the dest buf. */
|
411
|
+
ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, replacementText+replIdx-1, 1);
|
412
|
+
continue;
|
413
|
+
}
|
414
|
+
|
415
|
+
if (c == BACKSLASH) {
|
416
|
+
/* Backslash Escape. Copy the following char out without further checks. */
|
417
|
+
/* Note: Surrogate pairs don't need any special handling */
|
418
|
+
/* The second half wont be a '$' or a '\', and */
|
419
|
+
/* will move to the dest normally on the next */
|
420
|
+
/* loop iteration. */
|
421
|
+
if (replIdx >= replacementLength) {
|
422
|
+
break;
|
423
|
+
}
|
424
|
+
/* ICU4R : \uxxxx case is removed for simplicity : if (c==0x55 || c==0x75) { */
|
425
|
+
|
426
|
+
/* Plain backslash escape. Just put out the escaped character. */
|
427
|
+
ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, replacementText+replIdx, 1);
|
428
|
+
replIdx++;
|
429
|
+
continue;
|
430
|
+
}
|
431
|
+
|
432
|
+
/* We've got a $. Pick up a capture group number if one follows. */
|
433
|
+
/* Consume at most the number of digits necessary for the largest capture */
|
434
|
+
/* number that is valid for this pattern. */
|
435
|
+
numDigits = 0;
|
436
|
+
groupNum = 0;
|
437
|
+
|
438
|
+
for (;;) {
|
439
|
+
if (replIdx >= replacementLength) {
|
440
|
+
break;
|
441
|
+
}
|
442
|
+
U16_GET(replacementText, 0, replIdx, replacementLength, digitC); /* care surrogates */
|
443
|
+
if (u_isdigit(digitC) == FALSE) {
|
444
|
+
break;
|
445
|
+
}
|
446
|
+
|
447
|
+
U16_FWD_1(replacementText, replIdx, replacementLength); /* care surrogates */
|
448
|
+
groupNum=groupNum*10 + u_charDigitValue(digitC);
|
449
|
+
numDigits++;
|
450
|
+
if (numDigits >= 3) { /* limit 999 groups */
|
451
|
+
break;
|
452
|
+
}
|
453
|
+
}
|
454
|
+
|
455
|
+
if (numDigits == 0) {
|
456
|
+
/* The $ didn't introduce a group number at all. */
|
457
|
+
/* Treat it as just part of the substitution text. */
|
458
|
+
ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0, &DOLLARSIGN, 1);
|
459
|
+
continue;
|
460
|
+
}
|
461
|
+
|
462
|
+
/* Finally, append the capture group data to the destination. */
|
463
|
+
error = U_ZERO_ERROR;
|
464
|
+
g_start = uregex_start(the_expr, groupNum, &error);
|
465
|
+
g_end = uregex_end (the_expr, groupNum, &error);
|
466
|
+
if(U_SUCCESS(error) && g_start != -1 ) {
|
467
|
+
ustr_splice_units(USTRING(ret), ICU_LEN(ret), 0,
|
468
|
+
uregex_getText(the_expr, &len, &error) + g_start, g_end - g_start);
|
469
|
+
}
|
470
|
+
|
471
|
+
}
|
472
|
+
return ret;
|
473
|
+
}
|
474
|
+
|
475
|
+
VALUE
|
476
|
+
icu_reg_get_prematch(pat, prev_end)
|
477
|
+
VALUE pat;
|
478
|
+
long prev_end;
|
479
|
+
{
|
480
|
+
URegularExpression *the_expr = UREGEX(pat)->pattern;
|
481
|
+
UErrorCode error = 0;
|
482
|
+
int32_t len = 0;
|
483
|
+
int32_t cur_start = uregex_start(the_expr, 0, &error);
|
484
|
+
const UChar *temp = uregex_getText(the_expr, &len, &error);
|
485
|
+
VALUE pm =
|
486
|
+
icu_ustr_new(temp + prev_end, cur_start - prev_end);
|
487
|
+
return pm;
|
488
|
+
}
|
489
|
+
|
490
|
+
VALUE
|
491
|
+
icu_reg_get_tail(pat, prev_end)
|
492
|
+
VALUE pat;
|
493
|
+
long prev_end;
|
494
|
+
{
|
495
|
+
UErrorCode error = U_ZERO_ERROR;
|
496
|
+
URegularExpression *the_expr = UREGEX(pat)->pattern;
|
497
|
+
int32_t len = 0;
|
498
|
+
const UChar *temp = uregex_getText(the_expr, &len, &error);
|
499
|
+
VALUE pm = icu_ustr_new(temp + prev_end, len - prev_end);
|
500
|
+
return pm;
|
501
|
+
}
|
502
|
+
|
503
|
+
/**
|
504
|
+
* call-seq:
|
505
|
+
* ure(str[, options]) => URegexp
|
506
|
+
*
|
507
|
+
* Creates URegexp object from UString.
|
508
|
+
* */
|
509
|
+
VALUE
|
510
|
+
icu_reg_from_rb_str(argc, argv, obj)
|
511
|
+
int argc;
|
512
|
+
VALUE *argv;
|
513
|
+
VALUE obj;
|
514
|
+
{
|
515
|
+
VALUE pat,
|
516
|
+
options = Qnil;
|
517
|
+
int reg_opts = 0;
|
518
|
+
if (rb_scan_args(argc, argv, "11", &pat, &options) == 1) {
|
519
|
+
reg_opts = 0;
|
520
|
+
} else {
|
521
|
+
if (options != Qnil) {
|
522
|
+
Check_Type(options, T_FIXNUM);
|
523
|
+
reg_opts = FIX2INT(options);
|
524
|
+
}
|
525
|
+
}
|
526
|
+
if (TYPE(pat) == T_STRING)
|
527
|
+
pat = icu_from_rstr(0, NULL, pat);
|
528
|
+
if (CLASS_OF(pat) != rb_cUString)
|
529
|
+
rb_raise(rb_eArgError, "Expected String or UString");
|
530
|
+
return icu_reg_new(ICU_PTR(pat), ICU_LEN(pat), reg_opts);
|
531
|
+
}
|
532
|
+
|
533
|
+
/**
|
534
|
+
* call-seq:
|
535
|
+
* umatch[idx] => string
|
536
|
+
*
|
537
|
+
* Returns capture group. Group 0 is for full match.
|
538
|
+
* */
|
539
|
+
VALUE
|
540
|
+
icu_umatch_aref(match, index)
|
541
|
+
VALUE match,
|
542
|
+
index;
|
543
|
+
{
|
544
|
+
long idx;
|
545
|
+
VALUE cg;
|
546
|
+
Check_Type(index, T_FIXNUM);
|
547
|
+
idx = FIX2LONG(index);
|
548
|
+
cg = rb_iv_get(match, "@cg");
|
549
|
+
return rb_ary_entry(cg, idx);
|
550
|
+
}
|
551
|
+
|
552
|
+
/**
|
553
|
+
* call-seq:
|
554
|
+
* umatch.range(idx) => range
|
555
|
+
*
|
556
|
+
* Returns range (start, end) of capture group. Group 0 is for full match.
|
557
|
+
*
|
558
|
+
* NOTE: this method returns <b>code unit</b> indexes. To convert this range
|
559
|
+
* to <b>code point</b> range use UString#conv_unit_range. If your chars don't
|
560
|
+
* require surrogate UTF16 pairs, range will be the same.
|
561
|
+
* */
|
562
|
+
VALUE
|
563
|
+
icu_umatch_range(match, index)
|
564
|
+
VALUE match,
|
565
|
+
index;
|
566
|
+
{
|
567
|
+
long idx;
|
568
|
+
VALUE cg;
|
569
|
+
Check_Type(index, T_FIXNUM);
|
570
|
+
idx = FIX2LONG(index);
|
571
|
+
cg = rb_iv_get(match, "@ranges");
|
572
|
+
return rb_ary_entry(cg, idx);
|
573
|
+
}
|
574
|
+
|
575
|
+
|
576
|
+
/**
|
577
|
+
* call-seq:
|
578
|
+
* umatch.size => fixnum
|
579
|
+
*
|
580
|
+
* Returns number of capture groups.
|
581
|
+
* */
|
582
|
+
VALUE
|
583
|
+
icu_umatch_size(match)
|
584
|
+
VALUE match;
|
585
|
+
{
|
586
|
+
VALUE cg = rb_iv_get(match, "@cg");
|
587
|
+
return LONG2NUM(RARRAY(cg)->len - 1);
|
588
|
+
}
|
589
|
+
|
590
|
+
|
591
|
+
VALUE
|
592
|
+
icu_umatch_init( self, re)
|
593
|
+
VALUE self, re;
|
594
|
+
{
|
595
|
+
UErrorCode status = U_ZERO_ERROR;
|
596
|
+
long count, i, cu_start, cu_end;
|
597
|
+
URegularExpression * the_regex;
|
598
|
+
VALUE obj, groups, ranges;
|
599
|
+
|
600
|
+
Check_Class(re, rb_cURegexp);
|
601
|
+
the_regex = UREGEX(re)->pattern;
|
602
|
+
count = uregex_groupCount(the_regex, &status);
|
603
|
+
if (U_FAILURE(status)) {
|
604
|
+
rb_raise(rb_eArgError, u_errorName(status));
|
605
|
+
}
|
606
|
+
groups = rb_ary_new2(count);
|
607
|
+
rb_iv_set(self, "@cg", groups);
|
608
|
+
for (i = 0; i <= count; i++) {
|
609
|
+
obj = icu_reg_nth_match(re, i);
|
610
|
+
rb_obj_freeze(obj);
|
611
|
+
rb_ary_store(groups, i, obj);
|
612
|
+
}
|
613
|
+
|
614
|
+
ranges = rb_ary_new2(count);
|
615
|
+
for ( i = 0; i <= count; i++){
|
616
|
+
cu_start = uregex_start(the_regex, i, &status);
|
617
|
+
cu_end = uregex_end(the_regex, i, &status);
|
618
|
+
if( cu_start == -1) rb_ary_store(ranges, i, Qnil);
|
619
|
+
else rb_ary_store(ranges, i, rb_range_new(LONG2NUM(cu_start), LONG2NUM(cu_end-1), 0));
|
620
|
+
}
|
621
|
+
rb_iv_set(self, "@ranges", ranges);
|
622
|
+
return self;
|
623
|
+
}
|
624
|
+
/*
 * Create a UMatch instance and populate it from the current match of re.
 */
VALUE icu_umatch_new(VALUE re)
{
    VALUE match = rb_class_new_instance(0, NULL, rb_cUMatch);

    return icu_umatch_init(match, re);
}
|
629
|
+
|
630
|
+
|
631
|
+
|
632
|
+
|
633
|
+
void initialize_uregexp (void)
|
634
|
+
{
|
635
|
+
/* regular expressions */
|
636
|
+
rb_cURegexp = rb_define_class("URegexp", rb_cObject);
|
637
|
+
rb_define_alloc_func(rb_cURegexp, icu_reg_s_alloc);
|
638
|
+
rb_define_method(rb_cURegexp, "initialize", icu_reg_initialize_m, -1);
|
639
|
+
rb_define_method(rb_cURegexp, "to_u", icu_reg_to_u, 0);
|
640
|
+
rb_define_method(rb_cURegexp, "match", icu_reg_match, 1);
|
641
|
+
rb_define_method(rb_cURegexp, "split", icu_reg_split, 2);
|
642
|
+
rb_define_method(rb_cURegexp, "=~", icu_reg_match, 1);
|
643
|
+
rb_define_method(rb_cURegexp, "===", icu_reg_eqq, 1);
|
644
|
+
|
645
|
+
/* Enable case insensitive matching. */
|
646
|
+
rb_define_const(rb_cURegexp, "IGNORECASE", INT2FIX(UREGEX_CASE_INSENSITIVE));
|
647
|
+
/* Allow white space and comments within patterns */
|
648
|
+
rb_define_const(rb_cURegexp, "COMMENTS", INT2FIX(UREGEX_COMMENTS));
|
649
|
+
/* Control behavior of "$" and "^" If set, recognize line terminators within string, otherwise, match only at start and end of input string. */
|
650
|
+
rb_define_const(rb_cURegexp, "MULTILINE", INT2FIX(UREGEX_MULTILINE));
|
651
|
+
/* If set, '.' matches line terminators, otherwise '.' matching stops at line end. */
|
652
|
+
rb_define_const(rb_cURegexp, "DOTALL", INT2FIX(UREGEX_DOTALL));
|
653
|
+
|
654
|
+
|
655
|
+
rb_define_global_function("ure", icu_reg_from_rb_str, -1);
|
656
|
+
|
657
|
+
/**
|
658
|
+
* Document-class: UMatch
|
659
|
+
*
|
660
|
+
* Class to store information about capturing
|
661
|
+
* groups. Used in UString#sub, UString#gsub methods, as parameter to
|
662
|
+
* passed block.
|
663
|
+
*/
|
664
|
+
rb_cUMatch = rb_define_class("UMatch", rb_cObject);
|
665
|
+
rb_define_method(rb_cUMatch, "[]", icu_umatch_aref, 1);
|
666
|
+
rb_define_method(rb_cUMatch, "size", icu_umatch_size, 0);
|
667
|
+
rb_define_method(rb_cUMatch, "range", icu_umatch_range, 1);
|
668
|
+
|
669
|
+
rb_define_method(rb_cRegexp, "to_u", icu_reg_from_rb_reg, 0);
|
670
|
+
rb_define_alias (rb_cRegexp, "U", "to_u");
|
671
|
+
rb_define_alias (rb_cRegexp, "ur", "to_u");
|
672
|
+
|
673
|
+
}
|
data/uregex.h
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
extern void icu_regex_free (ICURegexp *ptr);
|
2
|
+
extern VALUE icu_reg_s_alloc (VALUE klass);
|
3
|
+
extern VALUE icu_reg_initialize_m (int argc, VALUE *argv, VALUE self);
|
4
|
+
extern VALUE icu_reg_new (UChar *s, long len, int options) ;
|
5
|
+
extern VALUE icu_reg_clone (VALUE obj);
|
6
|
+
extern VALUE icu_reg_comp (VALUE str);
|
7
|
+
extern VALUE icu_reg_from_rb_reg (VALUE re);
|
8
|
+
extern VALUE icu_reg_to_u (VALUE self);
|
9
|
+
extern VALUE icu_reg_split (VALUE self, VALUE str, VALUE limit);
|
10
|
+
extern VALUE icu_reg_nth_match (VALUE re, long nth);
|
11
|
+
extern VALUE icu_reg_range (VALUE re, int nth, long *start, long *end);
|
12
|
+
extern VALUE icu_reg_match (VALUE re, VALUE str);
|
13
|
+
extern VALUE icu_reg_eqq (VALUE re, VALUE str);
|
14
|
+
extern int icu_reg_find_next (VALUE pat);
|
15
|
+
extern VALUE icu_reg_get_replacement (VALUE pat, VALUE repl_text, long prev_end);
|
16
|
+
extern VALUE icu_reg_get_prematch (VALUE pat, long prev_end);
|
17
|
+
extern VALUE icu_reg_get_tail (VALUE pat, long prev_end);
|
18
|
+
extern VALUE icu_reg_from_rb_str (int argc, VALUE *argv, VALUE obj);
|
19
|
+
extern VALUE icu_umatch_range (VALUE match, VALUE index);
|
20
|
+
extern VALUE icu_umatch_size (VALUE match);
|
21
|
+
extern VALUE icu_umatch_init (VALUE self, VALUE re);
|
22
|
+
extern VALUE icu_umatch_aref (VALUE match, VALUE idx);
|
23
|
+
extern VALUE icu_umatch_new (VALUE re);
|
24
|
+
extern long icu_group_count(VALUE re);
|
25
|
+
extern long icu_reg_search(VALUE re, VALUE str, int pos, int reverse);
|
26
|
+
|
27
|
+
extern void initialize_uregexp (void);
|