jasherai-oniguruma 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +56 -0
- data/Manifest.txt +10 -0
- data/README.txt +71 -0
- data/Rakefile +67 -0
- data/Syntax.txt +396 -0
- data/ext/extconf.rb +9 -0
- data/ext/oregexp.c +739 -0
- data/lib/oniguruma.rb +479 -0
- data/test/test_oniguruma.rb +371 -0
- metadata +75 -0
data/ext/extconf.rb
ADDED
data/ext/oregexp.c
ADDED
@@ -0,0 +1,739 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include <oniguruma.h>
|
3
|
+
/*
|
4
|
+
TODO:
|
5
|
+
- Complete oregexp_match with range parameter.
|
6
|
+
*/
|
7
|
+
|
8
|
+
typedef struct _oregexp {
|
9
|
+
regex_t * reg;
|
10
|
+
} ORegexp;
|
11
|
+
|
12
|
+
/* The Oniguruma module; assigned once in Init_oregexp. */
VALUE mOniguruma;

/* Not read or written by any function in this file (name_callback
 * declares a local of the same name that shadows it). */
VALUE nameHash;
|
14
|
+
|
15
|
+
/*
 * Free callback registered with Data_Wrap_Struct in oregexp_allocate.
 * Releases the compiled regex first, then the wrapper struct itself.
 */
static void oregexp_free( ORegexp * oregexp) {
    regex_t *compiled = oregexp->reg;

    onig_free( compiled );
    free( oregexp );
}
|
19
|
+
|
20
|
+
/*
 * Allocation function for Oniguruma::ORegexp.
 *
 * Creates the native ORegexp struct with no compiled pattern yet (reg is
 * NULL) and wraps it in a Ruby object of class `klass`. The struct is
 * released by oregexp_free.
 *
 * Raises NoMemoryError (via rb_memerror) when the struct cannot be
 * allocated instead of dereferencing a NULL pointer.
 */
static VALUE oregexp_allocate( VALUE klass ) {
    ORegexp *oregexp = malloc( sizeof *oregexp );

    if ( oregexp == NULL ) {
        rb_memerror();
    }
    oregexp->reg = NULL;
    return Data_Wrap_Struct( klass, 0, oregexp_free, oregexp );
}
|
25
|
+
|
26
|
+
|
27
|
+
/*
 * Translate the integer encoding index handed over from Ruby into the
 * matching Oniguruma encoding object.
 *
 * v_index: nil or a Fixnum.
 * Returns ONIG_ENCODING_UNDEF for nil and for every index that has no
 * case below (18-21, 27 and 31 among them).
 *
 * NOTE(review): index 12 returns ISO-8859-11, the same as index 11.
 * Confirm against the constant table on the Ruby side whether that is
 * intended.
 */
static OnigEncodingType * int2encoding( VALUE v_index ) {
    if ( NIL_P(v_index) ) {
        return ONIG_ENCODING_UNDEF;
    }

    switch ( FIX2INT(v_index) ) {
        case 0:  return ONIG_ENCODING_ASCII;
        case 1:  return ONIG_ENCODING_ISO_8859_1;
        case 2:  return ONIG_ENCODING_ISO_8859_2;
        case 3:  return ONIG_ENCODING_ISO_8859_3;
        case 4:  return ONIG_ENCODING_ISO_8859_4;
        case 5:  return ONIG_ENCODING_ISO_8859_5;
        case 6:  return ONIG_ENCODING_ISO_8859_6;
        case 7:  return ONIG_ENCODING_ISO_8859_7;
        case 8:  return ONIG_ENCODING_ISO_8859_8;
        case 9:  return ONIG_ENCODING_ISO_8859_9;
        case 10: return ONIG_ENCODING_ISO_8859_10;
        case 11: return ONIG_ENCODING_ISO_8859_11;
        case 12: return ONIG_ENCODING_ISO_8859_11;
        case 13: return ONIG_ENCODING_ISO_8859_13;
        case 14: return ONIG_ENCODING_ISO_8859_14;
        case 15: return ONIG_ENCODING_ISO_8859_15;
        case 16: return ONIG_ENCODING_ISO_8859_16;
        case 17: return ONIG_ENCODING_UTF8;
        case 22: return ONIG_ENCODING_EUC_JP;
        case 23: return ONIG_ENCODING_EUC_TW;
        case 24: return ONIG_ENCODING_EUC_KR;
        case 25: return ONIG_ENCODING_EUC_CN;
        case 26: return ONIG_ENCODING_SJIS;
        /*case 27: return ONIG_ENCODING_KOI8;*/
        case 28: return ONIG_ENCODING_KOI8_R;
#if ONIGURUMA_VERSION_MAJOR == 5
        case 29: return ONIG_ENCODING_CP1251;
#endif
        case 30: return ONIG_ENCODING_BIG5;
        case 32: return ONIG_ENCODING_UNDEF;
        default: break;
    }
    return ONIG_ENCODING_UNDEF;
}
|
66
|
+
|
67
|
+
/*
 * Translate the integer syntax index handed over from Ruby into the
 * matching Oniguruma syntax object.
 *
 * v_index: nil or a Fixnum.
 * Returns ONIG_SYNTAX_DEFAULT for nil and for every index that has no
 * case below (0 and 8 among them).
 */
static OnigSyntaxType * int2syntax( VALUE v_index ) {
    if ( NIL_P(v_index) ) {
        return ONIG_SYNTAX_DEFAULT;
    }

    switch ( FIX2INT(v_index) ) {
        case 1:  return ONIG_SYNTAX_POSIX_BASIC;
        case 2:  return ONIG_SYNTAX_POSIX_EXTENDED;
        case 3:  return ONIG_SYNTAX_EMACS;
        case 4:  return ONIG_SYNTAX_GREP;
        case 5:  return ONIG_SYNTAX_GNU_REGEX;
        case 6:  return ONIG_SYNTAX_JAVA;
        case 7:  return ONIG_SYNTAX_PERL;
        case 9:  return ONIG_SYNTAX_RUBY;
        case 10: return ONIG_SYNTAX_DEFAULT;
        default: break;
    }
    return ONIG_SYNTAX_DEFAULT;
}
|
85
|
+
|
86
|
+
struct callback_packet {
|
87
|
+
VALUE hash;
|
88
|
+
OnigRegion * region;
|
89
|
+
};
|
90
|
+
|
91
|
+
static int name_callback(
|
92
|
+
const UChar* name,
|
93
|
+
const UChar* name_end,
|
94
|
+
int ngroup_num,
|
95
|
+
int* group_nums,
|
96
|
+
regex_t* reg,
|
97
|
+
struct callback_packet* arg
|
98
|
+
) {
|
99
|
+
int i, gn;
|
100
|
+
VALUE nameHash = arg->hash;
|
101
|
+
|
102
|
+
for (i = 0; i < ngroup_num; i++) {
|
103
|
+
gn = group_nums[i];
|
104
|
+
rb_hash_aset( nameHash, ID2SYM(rb_intern(name)), INT2FIX( gn ) );
|
105
|
+
}
|
106
|
+
return 0;
|
107
|
+
}
|
108
|
+
|
109
|
+
/*
 * ORegexp#initialize(pattern, options)
 *
 * pattern  converted with StringValue and stored in @pattern
 * options  a Hash, stored in @options; the keys :options, :encoding and
 *          :syntax are read from it. :options must convert with NUM2INT;
 *          :encoding and :syntax go through int2encoding / int2syntax.
 *
 * Compiles the pattern with onig_new and keeps the result in the wrapped
 * ORegexp struct. Raises TypeError when options is not a Hash and
 * ArgumentError ("Oniguruma Error: ...") when compilation fails.
 *
 * Returns self.
 */
static VALUE oregexp_initialize( VALUE self, VALUE pattern, VALUE options ) {
    ORegexp *oregexp;
    Data_Get_Struct( self, ORegexp, oregexp );

    VALUE pattern_str = StringValue( pattern );
    /* rb_hash_aref below requires a real Hash. */
    Check_Type( options, T_HASH );

    rb_iv_set( self, "@pattern", pattern_str );
    rb_iv_set( self, "@options", options );

    UChar* pat_ptr = (UChar*)RSTRING(pattern_str)->ptr;
    int pat_len = RSTRING(pattern_str)->len;

    VALUE rOptions  = rb_hash_aref( options, ID2SYM( rb_intern( "options" ) ) );
    VALUE rEncoding = rb_hash_aref( options, ID2SYM( rb_intern( "encoding" ) ) );
    VALUE rSyntax   = rb_hash_aref( options, ID2SYM( rb_intern( "syntax" ) ) );

    int iOptions = NUM2INT( rOptions );
    OnigEncodingType * iEncoding = int2encoding( rEncoding );
    OnigSyntaxType * iSyntax = int2syntax( rSyntax );

    /* initialize may be called again on an existing object; release the
       previously compiled regex so it is not leaked. */
    if ( oregexp->reg != NULL ) {
        onig_free( oregexp->reg );
        oregexp->reg = NULL;
    }

    int r;
    OnigErrorInfo einfo;
    r = onig_new(&(oregexp->reg), pat_ptr, pat_ptr + pat_len, iOptions, iEncoding, iSyntax, &einfo);
    if (r != ONIG_NORMAL) {
        UChar s[ONIG_MAX_ERROR_MESSAGE_LEN];
        onig_error_code_to_str(s, r, &einfo);
        rb_raise(rb_eArgError, "Oniguruma Error: %s", (char*)s);
    }
    return self;
}
|
136
|
+
|
137
|
+
/* can't include re.h, since it conflicts with oniguruma typedefs */
|
138
|
+
struct RMatch {
|
139
|
+
struct RBasic basic;
|
140
|
+
VALUE str;
|
141
|
+
struct re_registers *regs;
|
142
|
+
};
|
143
|
+
#define RMATCH(obj) (R_CAST(RMatch)(obj))
|
144
|
+
void rb_match_busy _((VALUE));
|
145
|
+
|
146
|
+
/*
 * Build a MatchData object from an Oniguruma match result.
 *
 * oregexp     the pattern that produced the match
 * region      group offsets filled in by onig_search
 * string_str  the subject String; a copy obtained with rb_str_new4 is
 *             stored in the MatchData
 *
 * Side effects: stores the new object in ORegexp's @@last_match class
 * variable and, when the pattern has named groups, sets
 * @named_captures on it to a Hash built by name_callback.
 *
 * Returns the new MatchData.
 */
static VALUE oregexp_make_match_data(ORegexp * oregexp, OnigRegion * region, VALUE string_str) {
    VALUE match_class = rb_const_get(rb_cObject, rb_intern("MatchData"));
    NEWOBJ(md, struct RMatch);
    OBJSETUP(md, match_class, T_MATCH);
    VALUE oregexp_class = rb_const_get( mOniguruma, rb_intern( "ORegexp" ) );
    int nregs = region->num_regs;
    int idx;
    struct re_registers *regs;
    struct callback_packet packet;

    md->str = rb_str_new4(string_str);

    /* Copy the offsets out of the OnigRegion: the caller frees the region. */
    md->regs = ALLOC(struct re_registers);
    regs = md->regs;
    regs->allocated = nregs;
    regs->num_regs = nregs;
    regs->beg = ALLOC_N(int, nregs);
    regs->end = ALLOC_N(int, nregs);
    for ( idx = 0; idx < nregs; idx++ ) {
        regs->beg[idx] = region->beg[idx];
        regs->end[idx] = region->end[idx];
    }

    rb_cv_set( oregexp_class, "@@last_match", (VALUE)md );

    packet.region = region;
    if ( onig_number_of_names( oregexp->reg ) > 0 ) {
        packet.hash = rb_hash_new();
        onig_foreach_name( oregexp->reg, name_callback, &packet );
        rb_iv_set( (VALUE)md, "@named_captures", packet.hash );
    }
    return (VALUE)md;
}
|
174
|
+
|
175
|
+
/**
|
176
|
+
* call-seq:
|
177
|
+
* rxp.match(str) => matchdata or nil
|
178
|
+
* rxp.match(str, begin, end) => matchdata or nil
|
179
|
+
*
|
180
|
+
* Returns a <code>MatchData</code> object describing the match, or
|
181
|
+
* <code>nil</code> if there was no match. This is equivalent to retrieving the
|
182
|
+
* value of the special variable <code>$~</code> following a normal match.
|
183
|
+
*
|
184
|
+
* ORegexp.new('(.)(.)(.)').match("abc")[2] #=> "b"
|
185
|
+
*
|
186
|
+
* The second form allows to perform the match in a region
|
187
|
+
* defined by <code>begin</code> and <code>end</code> while
|
188
|
+
* still taking into account look-behinds and look-forwards.
|
189
|
+
*
|
190
|
+
* ORegexp.new('1*2*').match('11221122').offset => [4,8]
|
191
|
+
* ORegexp.new('(?<=2)1*2*').match('11221122').offset => [4,8]
|
192
|
+
*
|
193
|
+
* Compare with:
|
194
|
+
*
|
195
|
+
* ORegexp.new('(?<=2)1*2*').match('11221122'[4..-1]) => nil
|
196
|
+
*/
|
197
|
+
static VALUE oregexp_match( int argc, VALUE * argv, VALUE self ) {
|
198
|
+
ORegexp *oregexp;
|
199
|
+
Data_Get_Struct( self, ORegexp, oregexp );
|
200
|
+
|
201
|
+
|
202
|
+
if ( argc == 0 || argc > 2) {
|
203
|
+
rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
|
204
|
+
exit;
|
205
|
+
}
|
206
|
+
|
207
|
+
VALUE string_str = StringValue( argv[0] );
|
208
|
+
UChar* str_ptr = RSTRING(string_str)->ptr;
|
209
|
+
int str_len = RSTRING(string_str)->len;
|
210
|
+
|
211
|
+
int begin = 0;
|
212
|
+
int end = str_len;
|
213
|
+
|
214
|
+
if (argc > 1 ) {
|
215
|
+
begin = NUM2INT( argv[1] );
|
216
|
+
}
|
217
|
+
// if (argc > 2) {
|
218
|
+
// end = NUM2INT( argv[2] );
|
219
|
+
// }
|
220
|
+
|
221
|
+
|
222
|
+
OnigRegion *region = onig_region_new();
|
223
|
+
int r = onig_search(oregexp->reg, str_ptr, str_ptr + str_len, str_ptr + begin, str_ptr + end, region, ONIG_OPTION_NONE);
|
224
|
+
rb_backref_set(Qnil);
|
225
|
+
if (r >= 0) {
|
226
|
+
VALUE matchData = oregexp_make_match_data( oregexp, region, string_str);
|
227
|
+
onig_region_free(region, 1 );
|
228
|
+
rb_backref_set(matchData);
|
229
|
+
rb_match_busy(matchData);
|
230
|
+
return matchData;
|
231
|
+
} else if (r == ONIG_MISMATCH) {
|
232
|
+
onig_region_free(region, 1 );
|
233
|
+
return Qnil;
|
234
|
+
} else {
|
235
|
+
onig_region_free(region, 1 );
|
236
|
+
char s[ONIG_MAX_ERROR_MESSAGE_LEN];
|
237
|
+
onig_error_code_to_str(s, r);
|
238
|
+
rb_raise(rb_eArgError, "Oniguruma Error: %s", s);
|
239
|
+
}
|
240
|
+
|
241
|
+
}
|
242
|
+
|
243
|
+
static const UChar BACKSLASH = 0x5c;
|
244
|
+
|
245
|
+
/* Additional backslash sequences work in substitution strings: \& (last match), \+ (last
|
246
|
+
matched group), \` (string prior to match), \' (string after match), and \\ (a literal
|
247
|
+
backslash). */
|
248
|
+
|
249
|
+
/* scan the replacement text, looking for substitutions (\n) and \escapes. */
|
250
|
+
static VALUE
|
251
|
+
oregexp_append_replacement(pat, src_text, repl_text, region, ret)
|
252
|
+
VALUE pat,
|
253
|
+
src_text,
|
254
|
+
repl_text;
|
255
|
+
OnigRegion * region;
|
256
|
+
VALUE ret;
|
257
|
+
{
|
258
|
+
ORegexp *oregexp;
|
259
|
+
int32_t replIdx = 0, name_pos, name_start, name_end ;
|
260
|
+
int32_t replacementLength = RSTRING(repl_text)->len;
|
261
|
+
UChar *replacementText = RSTRING(repl_text)->ptr;
|
262
|
+
UChar *replacementEnd = replacementText + (replacementLength-1);
|
263
|
+
long numDigits = 0;
|
264
|
+
long groupNum = 0, g_start, g_end;
|
265
|
+
OnigCodePoint digitC;
|
266
|
+
OnigEncoding enc;
|
267
|
+
const UChar * matchText;
|
268
|
+
long matchLen;
|
269
|
+
|
270
|
+
matchText = RSTRING(src_text)->ptr;
|
271
|
+
matchLen = RSTRING(src_text)->len;
|
272
|
+
Data_Get_Struct( pat, ORegexp, oregexp );
|
273
|
+
enc = onig_get_encoding( oregexp->reg );
|
274
|
+
|
275
|
+
while (replIdx < replacementLength) {
|
276
|
+
OnigCodePoint c = ONIGENC_MBC_TO_CODE(enc, replacementText+replIdx, replacementEnd);
|
277
|
+
int c_len =ONIGENC_MBC_ENC_LEN(enc, replacementText+replIdx) ;
|
278
|
+
if( c_len == 0 ) {
|
279
|
+
rb_warn("Strange, for %d enc_len is 0", c);
|
280
|
+
c_len = 1;
|
281
|
+
}
|
282
|
+
replIdx += c_len;
|
283
|
+
if ( c != BACKSLASH) {
|
284
|
+
/* Common case, no substitution, no escaping, */
|
285
|
+
/* just copy the char to the dest buf. */
|
286
|
+
rb_str_buf_cat( ret, replacementText+replIdx-c_len, c_len);
|
287
|
+
continue;
|
288
|
+
}
|
289
|
+
if (replIdx >= replacementLength) {
|
290
|
+
rb_str_buf_cat(ret, replacementText+(replIdx-c_len), c_len);
|
291
|
+
break;
|
292
|
+
}
|
293
|
+
/* Pick up a capture group number if one follows. */
|
294
|
+
numDigits = 0;
|
295
|
+
groupNum = 0;
|
296
|
+
for (;;) {
|
297
|
+
if (replIdx >= replacementLength) {
|
298
|
+
break;
|
299
|
+
}
|
300
|
+
digitC = ONIGENC_MBC_TO_CODE(enc, replacementText+replIdx, replacementEnd);
|
301
|
+
c_len = ONIGENC_MBC_ENC_LEN(enc, replacementText+replIdx) ;
|
302
|
+
if ( ! ONIGENC_IS_CODE_DIGIT(enc, digitC) ) {
|
303
|
+
break;
|
304
|
+
}
|
305
|
+
replIdx += c_len;
|
306
|
+
groupNum=groupNum*10 + (digitC - '0');
|
307
|
+
numDigits++;
|
308
|
+
if (numDigits >= 2) { /* limit 99 groups */
|
309
|
+
break;
|
310
|
+
}
|
311
|
+
}
|
312
|
+
if (numDigits == 0) {
|
313
|
+
/* Additional backslash sequences work in substitution strings: \& (last match), \+ (last
|
314
|
+
matched group), \` (string prior to match), \' (string after match), and \\ (a literal
|
315
|
+
backslash). */
|
316
|
+
int p_len = c_len;
|
317
|
+
c = ONIGENC_MBC_TO_CODE(enc, replacementText+replIdx, replacementEnd);
|
318
|
+
c_len = ONIGENC_MBC_ENC_LEN(enc, replacementText+replIdx) ;
|
319
|
+
switch(c) {
|
320
|
+
case '&' : // matched substring
|
321
|
+
rb_str_buf_cat(ret, matchText+region->beg[0], region->end[0] - region->beg[0]);
|
322
|
+
replIdx += c_len;
|
323
|
+
break;
|
324
|
+
case '`' : // prematch
|
325
|
+
rb_str_buf_cat(ret, matchText, region->beg[0]);
|
326
|
+
replIdx += c_len;
|
327
|
+
break;
|
328
|
+
case '\'': // postmatch
|
329
|
+
rb_str_buf_cat(ret, matchText+region->end[0], matchLen - region->end[0]);
|
330
|
+
replIdx += c_len;
|
331
|
+
break;
|
332
|
+
case '\\': // literal backslash
|
333
|
+
// place single backslash
|
334
|
+
rb_str_buf_cat(ret, replacementText+replIdx, c_len);
|
335
|
+
replIdx += c_len;
|
336
|
+
break;
|
337
|
+
case '+': // last matched group
|
338
|
+
replIdx += c_len;
|
339
|
+
for(groupNum = region->num_regs-1; groupNum > 0; groupNum --) {
|
340
|
+
g_start = region->beg[ groupNum ];
|
341
|
+
g_end = region->end[ groupNum ];
|
342
|
+
if( g_start != -1 ) {
|
343
|
+
rb_str_buf_cat(ret, matchText+g_start, g_end-g_start);
|
344
|
+
break;
|
345
|
+
}
|
346
|
+
}
|
347
|
+
break;
|
348
|
+
case '<': // named group references \<name>
|
349
|
+
name_pos = replIdx+c_len;
|
350
|
+
name_end = name_start = replIdx+c_len;
|
351
|
+
while(name_pos < replacementLength) {
|
352
|
+
c = ONIGENC_MBC_TO_CODE(enc, replacementText+name_pos, replacementEnd);
|
353
|
+
c_len = ONIGENC_MBC_ENC_LEN(enc, replacementText+name_pos) ;
|
354
|
+
name_pos += c_len;
|
355
|
+
if( c == '>') break;
|
356
|
+
if( ONIGENC_IS_CODE_WORD(enc, c) ) {
|
357
|
+
name_end += c_len;
|
358
|
+
} else {
|
359
|
+
break;
|
360
|
+
}
|
361
|
+
}
|
362
|
+
if( c != '>' || name_end == name_start ) {
|
363
|
+
// place backslash and '<'
|
364
|
+
rb_str_buf_cat(ret, replacementText+(replIdx-p_len), p_len+c_len);
|
365
|
+
replIdx += c_len;
|
366
|
+
} else {
|
367
|
+
// lookup for group and subst for that value
|
368
|
+
groupNum = onig_name_to_backref_number( oregexp->reg,
|
369
|
+
replacementText+name_start, replacementText+name_end, region);
|
370
|
+
if( groupNum >= 0 ) {
|
371
|
+
rb_str_buf_cat(ret, matchText+region->beg[groupNum],
|
372
|
+
region->end[groupNum]-region->beg[groupNum]);
|
373
|
+
}
|
374
|
+
replIdx = name_pos;
|
375
|
+
}
|
376
|
+
break;
|
377
|
+
default:
|
378
|
+
rb_str_buf_cat(ret, replacementText+(replIdx-p_len), p_len+c_len);
|
379
|
+
replIdx += c_len;
|
380
|
+
|
381
|
+
}
|
382
|
+
} else {
|
383
|
+
/* Finally, append the capture group data to the destination. */
|
384
|
+
if( groupNum < region->num_regs && region->beg[groupNum] >= 0 ) {
|
385
|
+
rb_str_buf_cat(ret, matchText+region->beg[groupNum], region->end[groupNum]-region->beg[groupNum]);
|
386
|
+
}
|
387
|
+
}
|
388
|
+
}
|
389
|
+
return ret;
|
390
|
+
}
|
391
|
+
|
392
|
+
static inline void
|
393
|
+
str_mod_check(s, p, len)
|
394
|
+
VALUE s;
|
395
|
+
char *p;
|
396
|
+
long len;
|
397
|
+
{
|
398
|
+
if (RSTRING(s)->ptr != p || RSTRING(s)->len != len) {
|
399
|
+
rb_raise(rb_eRuntimeError, "string modified");
|
400
|
+
}
|
401
|
+
}
|
402
|
+
|
403
|
+
static VALUE
|
404
|
+
oregexp_gsub(self, argc, argv, bang, once, region)
|
405
|
+
VALUE self; // pattern
|
406
|
+
int argc; // should be 1 if block given
|
407
|
+
VALUE *argv; // either replacement string
|
408
|
+
int bang;
|
409
|
+
int once;
|
410
|
+
OnigRegion *region;
|
411
|
+
{
|
412
|
+
VALUE repl;
|
413
|
+
long beg,
|
414
|
+
end,
|
415
|
+
len,
|
416
|
+
prev_end;
|
417
|
+
int tainted = 0,
|
418
|
+
iter = 0;
|
419
|
+
|
420
|
+
VALUE buf, curr_repl, block_res;
|
421
|
+
ORegexp *oregexp;
|
422
|
+
OnigEncoding enc;
|
423
|
+
|
424
|
+
if (argc == 1 && rb_block_given_p()) {
|
425
|
+
iter = 1;
|
426
|
+
} else if (argc == 2) {
|
427
|
+
repl = argv[1];
|
428
|
+
Check_Type(repl, T_STRING);
|
429
|
+
if (OBJ_TAINTED(argv[1]))
|
430
|
+
tainted = 1;
|
431
|
+
} else {
|
432
|
+
rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
|
433
|
+
}
|
434
|
+
Data_Get_Struct( self, ORegexp, oregexp );
|
435
|
+
|
436
|
+
VALUE string_str = StringValue( argv[0] );
|
437
|
+
UChar* str_ptr = RSTRING(string_str)->ptr;
|
438
|
+
int str_len = RSTRING(string_str)->len;
|
439
|
+
|
440
|
+
beg = onig_search(oregexp->reg, str_ptr, str_ptr + str_len, str_ptr, str_ptr + str_len, region, ONIG_OPTION_NONE);
|
441
|
+
|
442
|
+
if (beg < 0) {
|
443
|
+
/* no match */
|
444
|
+
if (bang)
|
445
|
+
return Qnil;
|
446
|
+
return rb_str_dup(string_str);
|
447
|
+
}
|
448
|
+
end = 0;
|
449
|
+
buf = rb_str_buf_new(str_len);
|
450
|
+
enc = onig_get_encoding( oregexp->reg );
|
451
|
+
do {
|
452
|
+
prev_end = end;
|
453
|
+
beg = region->beg[0];
|
454
|
+
end = region->end[0];
|
455
|
+
rb_str_buf_cat(buf, str_ptr+prev_end, beg-prev_end);
|
456
|
+
if ( iter ) {
|
457
|
+
VALUE match_data = oregexp_make_match_data( oregexp, region, string_str );
|
458
|
+
rb_backref_set(match_data);
|
459
|
+
rb_match_busy(match_data);
|
460
|
+
block_res = rb_yield( match_data );
|
461
|
+
str_mod_check( string_str, str_ptr, str_len);
|
462
|
+
curr_repl = rb_obj_as_string(block_res);
|
463
|
+
rb_str_append(buf, curr_repl);
|
464
|
+
} else {
|
465
|
+
oregexp_append_replacement(self, string_str, repl, region, buf);
|
466
|
+
}
|
467
|
+
if( once ) break;
|
468
|
+
// find next match
|
469
|
+
if( end == beg) {
|
470
|
+
/*
|
471
|
+
* Always consume at least one character of the input string
|
472
|
+
* in order to prevent infinite loops.
|
473
|
+
*/
|
474
|
+
if( str_len <= end )
|
475
|
+
break;
|
476
|
+
len = ONIGENC_MBC_ENC_LEN(enc, str_ptr + end);
|
477
|
+
rb_str_buf_cat( buf, str_ptr+end, len);
|
478
|
+
end += len;
|
479
|
+
}
|
480
|
+
beg=onig_search(oregexp->reg, str_ptr, str_ptr + str_len,
|
481
|
+
str_ptr+end, str_ptr + str_len,
|
482
|
+
region, ONIG_OPTION_NONE);
|
483
|
+
} while ( beg >= 0);
|
484
|
+
rb_str_buf_cat( buf, str_ptr+end, str_len - end);
|
485
|
+
|
486
|
+
if(tainted)
|
487
|
+
OBJ_INFECT(buf, repl);
|
488
|
+
OBJ_INFECT(buf, string_str);
|
489
|
+
if (bang) {
|
490
|
+
rb_funcall(string_str, rb_intern("replace"), 1, buf);
|
491
|
+
return string_str;
|
492
|
+
} else {
|
493
|
+
return buf;
|
494
|
+
}
|
495
|
+
}
|
496
|
+
|
497
|
+
typedef struct gsub_packet_t {
|
498
|
+
VALUE self; // pattern
|
499
|
+
int argc; // should be 1 if block given
|
500
|
+
VALUE *argv; // either replacement string
|
501
|
+
int bang;
|
502
|
+
int once;
|
503
|
+
OnigRegion *region;
|
504
|
+
} gsub_packet;
|
505
|
+
/* rb_ensure body: unpack a gsub_packet and forward it to oregexp_gsub. */
static VALUE oregexp_packed_gsub( gsub_packet* args ) {
    VALUE result;

    result = oregexp_gsub(
        args->self,
        args->argc,
        args->argv,
        args->bang,
        args->once,
        args->region
    );
    return result;
}
|
508
|
+
/* Release an OnigRegion obtained from onig_region_new; used as the
 * rb_ensure cleanup step by the gsub and scan entry points. */
void oregexp_cleanup_region(OnigRegion * region){
    const int free_self = 1;

    onig_region_free(region, free_self);
}
|
511
|
+
/*
 * rb_ensure cleanup callback for oregexp_safe_gsub. Has the
 * VALUE (*)(VALUE) shape rb_ensure calls through; `region` is the
 * OnigRegion pointer that was cast to VALUE by the caller.
 */
static VALUE oregexp_gsub_release_region( VALUE region ) {
    onig_region_free( (OnigRegion *)region, 1 );
    return Qnil;
}

/*
 * Run oregexp_gsub with a freshly allocated OnigRegion and guarantee that
 * the region is freed again, even when oregexp_gsub raises (for example
 * from inside the user's block).
 *
 * self, argc, argv, bang, once are passed to oregexp_gsub unchanged.
 * Raises NoMemoryError when the region cannot be allocated.
 */
static VALUE oregexp_safe_gsub(VALUE self, int argc, VALUE *argv, int bang, int once)
{
    OnigRegion * region = onig_region_new();

    if ( region == NULL ) {
        rb_memerror();
    }

    gsub_packet call_args = {self, argc, argv, bang, once, region};
    return rb_ensure( oregexp_packed_gsub, (VALUE)&call_args,
                      oregexp_gsub_release_region, (VALUE)region );
}
|
522
|
+
|
523
|
+
/**
|
524
|
+
* call-seq:
|
525
|
+
* rxp.gsub(str, replacement)
|
526
|
+
* rxp.gsub(str) {|match_data| ... }
|
527
|
+
*
|
528
|
+
* Returns a copy of _str_ with _all_ occurrences of _rxp_ pattern
|
529
|
+
* replaced with either _replacement_ or the value of the block.
|
530
|
+
*
|
531
|
+
* If a string is used as the replacement, the sequences \1, \2,
|
532
|
+
* and so on may be used to interpolate successive groups in the match.
|
533
|
+
*
|
534
|
+
* In the block form, the current MatchData object is passed in as a
|
535
|
+
* parameter. The value returned by the block will be substituted for
|
536
|
+
* the match on each call.
|
537
|
+
*
|
538
|
+
**/
|
539
|
+
static VALUE oregexp_m_gsub(int argc, VALUE *argv, VALUE self) {
    const int in_place   = 0;   /* return a new string */
    const int first_only = 0;   /* replace every match */

    return oregexp_safe_gsub(self, argc, argv, in_place, first_only);
}
|
542
|
+
|
543
|
+
/**
|
544
|
+
* call-seq:
|
545
|
+
* rxp.sub(str, replacement)
|
546
|
+
* rxp.sub(str) {|match_data| ... }
|
547
|
+
*
|
548
|
+
* Returns a copy of _str_ with the _first_ occurrence of _rxp_ pattern
|
549
|
+
* replaced with either _replacement_ or the value of the block.
|
550
|
+
*
|
551
|
+
* If a string is used as the replacement, the sequences \1, \2,
|
552
|
+
* and so on may be used to interpolate successive groups in the match.
|
553
|
+
*
|
554
|
+
* In the block form, the current MatchData object is passed in as a
|
555
|
+
* parameter. The value returned by the block will be substituted for
|
556
|
+
* the match on each call.
|
557
|
+
*
|
558
|
+
**/
|
559
|
+
static VALUE oregexp_m_sub(int argc, VALUE *argv, VALUE self) {
    const int in_place   = 0;   /* return a new string */
    const int first_only = 1;   /* replace the first match only */

    return oregexp_safe_gsub(self, argc, argv, in_place, first_only);
}
|
562
|
+
|
563
|
+
/**
|
564
|
+
* call-seq:
|
565
|
+
* rxp.gsub!(str, replacement)
|
566
|
+
* rxp.gsub!(str) {|match_data| ... }
|
567
|
+
*
|
568
|
+
* Performs the substitutions of ORegexp#gsub in place, returning
|
569
|
+
* _str_, or _nil_ if no substitutions were performed.
|
570
|
+
*
|
571
|
+
**/
|
572
|
+
static VALUE oregexp_m_gsub_bang(int argc, VALUE *argv, VALUE self) {
    const int in_place   = 1;   /* modify the subject string */
    const int first_only = 0;   /* replace every match */

    return oregexp_safe_gsub(self, argc, argv, in_place, first_only);
}
|
575
|
+
|
576
|
+
/**
|
577
|
+
* call-seq:
|
578
|
+
* oregexp.sub!(str, replacement)
|
579
|
+
* oregexp.sub!(str) {|match_data| ... }
|
580
|
+
*
|
581
|
+
* Performs the substitutions of ORegexp#sub in place, returning
|
582
|
+
* _str_, or _nil_ if no substitutions were performed.
|
583
|
+
*
|
584
|
+
*/
|
585
|
+
static VALUE oregexp_m_sub_bang(int argc, VALUE *argv, VALUE self) {
    const int in_place   = 1;   /* modify the subject string */
    const int first_only = 1;   /* replace the first match only */

    return oregexp_safe_gsub(self, argc, argv, in_place, first_only);
}
|
588
|
+
|
589
|
+
static VALUE
|
590
|
+
oregexp_scan(VALUE self, VALUE str, OnigRegion * region)
|
591
|
+
{
|
592
|
+
long beg,
|
593
|
+
len,
|
594
|
+
end;
|
595
|
+
int iter = 0;
|
596
|
+
|
597
|
+
VALUE matches;
|
598
|
+
ORegexp *oregexp;
|
599
|
+
OnigEncoding enc;
|
600
|
+
|
601
|
+
if ( rb_block_given_p()) {
|
602
|
+
iter = 1;
|
603
|
+
}
|
604
|
+
Data_Get_Struct( self, ORegexp, oregexp );
|
605
|
+
|
606
|
+
VALUE string_str = StringValue( str );
|
607
|
+
UChar* str_ptr = RSTRING(string_str)->ptr;
|
608
|
+
int str_len = RSTRING(string_str)->len;
|
609
|
+
beg = onig_search(oregexp->reg, str_ptr, str_ptr + str_len, str_ptr, str_ptr + str_len, region, ONIG_OPTION_NONE);
|
610
|
+
if (beg < 0) {
|
611
|
+
/* no match */
|
612
|
+
return Qnil;
|
613
|
+
}
|
614
|
+
matches = rb_ary_new();
|
615
|
+
enc = onig_get_encoding( oregexp -> reg );
|
616
|
+
do {
|
617
|
+
VALUE match_data = oregexp_make_match_data( oregexp, region, string_str );
|
618
|
+
end = region->end[0];
|
619
|
+
rb_ary_push( matches, match_data );
|
620
|
+
if ( iter )
|
621
|
+
rb_yield( match_data );
|
622
|
+
// find next match
|
623
|
+
if( end == beg) {
|
624
|
+
/*
|
625
|
+
* Always consume at least one character of the input string
|
626
|
+
* in order to prevent infinite loops.
|
627
|
+
*/
|
628
|
+
if( str_len <= end )
|
629
|
+
break;
|
630
|
+
len = ONIGENC_MBC_ENC_LEN(enc, str_ptr + end);
|
631
|
+
end += len;
|
632
|
+
}
|
633
|
+
|
634
|
+
beg=onig_search(oregexp->reg, str_ptr, str_ptr + str_len,
|
635
|
+
str_ptr+end, str_ptr + str_len,
|
636
|
+
region, ONIG_OPTION_NONE);
|
637
|
+
} while ( beg >= 0);
|
638
|
+
|
639
|
+
return matches;
|
640
|
+
}
|
641
|
+
|
642
|
+
struct scan_packet {
|
643
|
+
VALUE self, str;
|
644
|
+
OnigRegion * region;
|
645
|
+
};
|
646
|
+
static VALUE oregexp_packed_scan( struct scan_packet * args) {
|
647
|
+
return oregexp_scan(args->self, args->str, args->region);
|
648
|
+
}
|
649
|
+
|
650
|
+
/**
|
651
|
+
* call-seq:
|
652
|
+
* rxp.scan(str) # => [matchdata1, matchdata2,...] or nil
|
653
|
+
* rxp.scan(str) {|match_data| ... } # => [matchdata1, matchdata2,...] or nil
|
654
|
+
*
|
655
|
+
* Both forms iterate through _str_, matching the pattern. For each match,
|
656
|
+
* a MatchData object is generated and passed to the block, and
|
657
|
+
* added to the resulting array of MatchData objects.
|
658
|
+
*
|
659
|
+
* If _str_ does not match pattern, _nil_ is returned.
|
660
|
+
*
|
661
|
+
**/
|
662
|
+
/*
 * rb_ensure cleanup callback for oregexp_m_scan. Has the
 * VALUE (*)(VALUE) shape rb_ensure calls through; `region` is the
 * OnigRegion pointer that was cast to VALUE by the caller.
 */
static VALUE oregexp_scan_release_region( VALUE region ) {
    onig_region_free( (OnigRegion *)region, 1 );
    return Qnil;
}

static VALUE oregexp_m_scan(VALUE self, VALUE str) {
    OnigRegion * region = onig_region_new();

    if ( region == NULL ) {
        rb_memerror();
    }

    /* rb_ensure frees the region even when the block passed to scan raises. */
    struct scan_packet call_args = {self, str, region};
    return rb_ensure( oregexp_packed_scan, (VALUE)&call_args,
                      oregexp_scan_release_region, (VALUE)region );
}
|
667
|
+
|
668
|
+
/**
|
669
|
+
* call-seq:
|
670
|
+
* rxp === str => true or false
|
671
|
+
*
|
672
|
+
* Case Equality---Synonym for <code>ORegexp#=~</code> used in case statements.
|
673
|
+
*
|
674
|
+
* a = "HELLO"
|
675
|
+
* case a
|
676
|
+
* when ORegexp.new('^[a-z]*$'); print "Lower case\n"
|
677
|
+
* when ORegexp.new('^[A-Z]*$'); print "Upper case\n"
|
678
|
+
* else; print "Mixed case\n"
|
679
|
+
* end
|
680
|
+
*
|
681
|
+
* <em>produces:</em>
|
682
|
+
*
|
683
|
+
* Upper case
|
684
|
+
*
|
685
|
+
**/
|
686
|
+
|
687
|
+
static VALUE oregexp_m_eqq(VALUE self, VALUE str) {
    VALUE match_args[1];

    /* Anything that cannot be converted to a String simply does not match. */
    if (TYPE(str) != T_STRING) {
        str = rb_check_string_type(str);
        if (NIL_P(str)) {
            return Qfalse;
        }
    }
    StringValue(str);

    match_args[0] = str;
    return (oregexp_match(1, match_args, self) == Qnil) ? Qfalse : Qtrue;
}
|
704
|
+
|
705
|
+
/**
|
706
|
+
* call-seq:
|
707
|
+
* rxp =~ string => int or nil
|
708
|
+
*
|
709
|
+
* Matches <code>rxp</code> against <code>string</code>, returning the offset of the
|
710
|
+
* start of the match or <code>nil</code> if the match failed. Sets $~ to the corresponding
|
711
|
+
* <code>MatchData</code> or <code>nil</code>.
|
712
|
+
*
|
713
|
+
* ORegexp.new( 'SIT' ) =~ "insensitive" #=> nil
|
714
|
+
* ORegexp.new( 'SIT', :options => OPTION_IGNORECASE ) =~ "insensitive" #=> 5
|
715
|
+
*/
|
716
|
+
|
717
|
+
static VALUE oregexp_match_op(VALUE self, VALUE str) {
    VALUE match_args[1];
    VALUE match;

    match_args[0] = str;
    match = oregexp_match(1, match_args, self);
    if (NIL_P(match)) {
        return Qnil;
    }
    /* start offset of the whole match (group 0) */
    return INT2FIX(RMATCH(match)->regs->beg[0]);
}
|
724
|
+
|
725
|
+
void Init_oregexp() {
|
726
|
+
mOniguruma = rb_define_module("Oniguruma");
|
727
|
+
VALUE cORegexp = rb_define_class_under(mOniguruma, "ORegexp", rb_cObject);
|
728
|
+
rb_define_alloc_func(cORegexp, oregexp_allocate);
|
729
|
+
rb_define_method( cORegexp, "initialize", oregexp_initialize, 2 );
|
730
|
+
rb_define_method( cORegexp, "match", oregexp_match, -1 );
|
731
|
+
rb_define_method( cORegexp, "=~", oregexp_match_op, 1 );
|
732
|
+
rb_define_method( cORegexp, "gsub", oregexp_m_gsub, -1 );
|
733
|
+
rb_define_method( cORegexp, "sub", oregexp_m_sub, -1 );
|
734
|
+
rb_define_method( cORegexp, "gsub!", oregexp_m_gsub_bang, -1 );
|
735
|
+
rb_define_method( cORegexp, "sub!", oregexp_m_sub_bang, -1 );
|
736
|
+
rb_define_method( cORegexp, "scan", oregexp_m_scan, 1 );
|
737
|
+
rb_define_method( cORegexp, "===", oregexp_m_eqq, 1 );
|
738
|
+
rb_define_const( mOniguruma, "VERSION", rb_str_new2(onig_version()) );
|
739
|
+
}
|