oniguruma 1.0.1-mswin32
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +49 -0
- data/Manifest.txt +10 -0
- data/README.txt +71 -0
- data/Rakefile +41 -0
- data/Syntax.txt +396 -0
- data/ext/oregexp.c +712 -0
- data/lib/oniguruma.rb +479 -0
- data/test/test_oniguruma.rb +361 -0
- data/win/oregexp.so +0 -0
- metadata +57 -0
data/ext/oregexp.c
ADDED
@@ -0,0 +1,712 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include <oniguruma.h>
|
3
|
+
/*
|
4
|
+
TODO:
|
5
|
+
- Add named backreferences.
|
6
|
+
*/
|
7
|
+
|
8
|
+
typedef struct _oregexp {
|
9
|
+
regex_t * reg;
|
10
|
+
} ORegexp;
|
11
|
+
|
12
|
+
VALUE mOniguruma;
|
13
|
+
VALUE nameHash;
|
14
|
+
|
15
|
+
/*
 * Free callback registered through Data_Wrap_Struct: releases the compiled
 * regex first and then the ORegexp wrapper that held it.
 */
static void oregexp_free(ORegexp *oregexp)
{
    regex_t *compiled = oregexp->reg;

    onig_free(compiled);
    free(oregexp);
}
|
19
|
+
|
20
|
+
/*
 * Allocation function for Oniguruma::ORegexp.
 *
 * Creates the native ORegexp payload with no compiled pattern yet (reg is
 * NULL until initialize runs) and wraps it in a Ruby object of class `klass`.
 * oregexp_free is registered as the free callback.
 *
 * Raises NoMemoryError (via rb_memerror) when the payload cannot be
 * allocated, instead of dereferencing a NULL pointer.
 */
static VALUE oregexp_allocate(VALUE klass)
{
    ORegexp *oregexp = malloc(sizeof *oregexp);

    if (oregexp == NULL) {
        rb_memerror();
    }
    oregexp->reg = NULL;
    return Data_Wrap_Struct(klass, 0, oregexp_free, oregexp);
}
|
25
|
+
|
26
|
+
|
27
|
+
/*
 * Translate the Fixnum encoding index passed from Ruby into the matching
 * Oniguruma encoding. nil and any index without a case below yield
 * ONIG_ENCODING_UNDEF.
 */
static OnigEncodingType *int2encoding(VALUE v_index)
{
    if (NIL_P(v_index)) {
        return ONIG_ENCODING_UNDEF;
    }

    switch (FIX2INT(v_index)) {
    case 0:  return ONIG_ENCODING_ASCII;
    case 1:  return ONIG_ENCODING_ISO_8859_1;
    case 2:  return ONIG_ENCODING_ISO_8859_2;
    case 3:  return ONIG_ENCODING_ISO_8859_3;
    case 4:  return ONIG_ENCODING_ISO_8859_4;
    case 5:  return ONIG_ENCODING_ISO_8859_5;
    case 6:  return ONIG_ENCODING_ISO_8859_6;
    case 7:  return ONIG_ENCODING_ISO_8859_7;
    case 8:  return ONIG_ENCODING_ISO_8859_8;
    case 9:  return ONIG_ENCODING_ISO_8859_9;
    case 10: return ONIG_ENCODING_ISO_8859_10;
    case 11: return ONIG_ENCODING_ISO_8859_11;
    /* NOTE(review): index 12 also maps to ISO-8859-11 -- confirm intended. */
    case 12: return ONIG_ENCODING_ISO_8859_11;
    case 13: return ONIG_ENCODING_ISO_8859_13;
    case 14: return ONIG_ENCODING_ISO_8859_14;
    case 15: return ONIG_ENCODING_ISO_8859_15;
    case 16: return ONIG_ENCODING_ISO_8859_16;
    case 17: return ONIG_ENCODING_UTF8;
    case 18: return ONIG_ENCODING_UTF16_BE;
    case 19: return ONIG_ENCODING_UTF16_LE;
    case 20: return ONIG_ENCODING_UTF32_BE;
    case 21: return ONIG_ENCODING_UTF32_LE;
    case 22: return ONIG_ENCODING_EUC_JP;
    case 23: return ONIG_ENCODING_EUC_TW;
    case 24: return ONIG_ENCODING_EUC_KR;
    case 25: return ONIG_ENCODING_EUC_CN;
    case 26: return ONIG_ENCODING_SJIS;
    /* index 27 (ONIG_ENCODING_KOI8) is deliberately left unmapped */
    case 28: return ONIG_ENCODING_KOI8_R;
#if ONIGURUMA_VERSION_MAJOR == 5
    case 29: return ONIG_ENCODING_CP1251;
#endif
    case 30: return ONIG_ENCODING_BIG5;
    case 31: return ONIG_ENCODING_GB18030;
    case 32: return ONIG_ENCODING_UNDEF;
    default: break;
    }
    return ONIG_ENCODING_UNDEF;
}
|
71
|
+
|
72
|
+
/*
 * Translate the Fixnum syntax index passed from Ruby into the matching
 * Oniguruma syntax table. nil and any index without a case below yield
 * ONIG_SYNTAX_DEFAULT.
 */
static OnigSyntaxType *int2syntax(VALUE v_index)
{
    if (NIL_P(v_index)) {
        return ONIG_SYNTAX_DEFAULT;
    }

    switch (FIX2INT(v_index)) {
    case 0:  return ONIG_SYNTAX_ASIS;
    case 1:  return ONIG_SYNTAX_POSIX_BASIC;
    case 2:  return ONIG_SYNTAX_POSIX_EXTENDED;
    case 3:  return ONIG_SYNTAX_EMACS;
    case 4:  return ONIG_SYNTAX_GREP;
    case 5:  return ONIG_SYNTAX_GNU_REGEX;
    case 6:  return ONIG_SYNTAX_JAVA;
    case 7:  return ONIG_SYNTAX_PERL;
    case 8:  return ONIG_SYNTAX_PERL_NG;
    case 9:  return ONIG_SYNTAX_RUBY;
    case 10: return ONIG_SYNTAX_DEFAULT;
    default: break;
    }
    return ONIG_SYNTAX_DEFAULT;
}
|
92
|
+
|
93
|
+
struct callback_packet {
|
94
|
+
VALUE hash;
|
95
|
+
OnigRegion * region;
|
96
|
+
};
|
97
|
+
|
98
|
+
static int name_callback(
|
99
|
+
const UChar* name,
|
100
|
+
const UChar* name_end,
|
101
|
+
int ngroup_num,
|
102
|
+
int* group_nums,
|
103
|
+
regex_t* reg,
|
104
|
+
struct callback_packet* arg
|
105
|
+
) {
|
106
|
+
int i, gn;
|
107
|
+
VALUE nameHash = arg->hash;
|
108
|
+
|
109
|
+
for (i = 0; i < ngroup_num; i++) {
|
110
|
+
gn = group_nums[i];
|
111
|
+
rb_hash_aset( nameHash, ID2SYM(rb_intern(name)), INT2FIX( gn ) );
|
112
|
+
}
|
113
|
+
return 0;
|
114
|
+
}
|
115
|
+
|
116
|
+
/*
 * ORegexp#initialize(pattern, options)
 *
 * Compiles `pattern` with Oniguruma and stores the result in the wrapped
 * ORegexp struct. `options` must be a Hash; the keys read here are
 * :options (integer option bits), :encoding and :syntax (indexes understood
 * by int2encoding / int2syntax, nil selecting their defaults).
 *
 * The pattern string and the options hash are kept in @pattern / @options.
 *
 * Raises TypeError when `options` is not a Hash (rb_hash_aref must not be
 * called on other objects) and ArgumentError when Oniguruma rejects the
 * pattern. A regex left over from an earlier initialize call on the same
 * object is released first so it is not leaked.
 */
static VALUE oregexp_initialize(VALUE self, VALUE pattern, VALUE options)
{
    ORegexp *oregexp;
    VALUE pattern_str, rOptions, rEncoding, rSyntax;
    UChar *pat_ptr;
    int pat_len, iOptions, r;
    OnigEncodingType *iEncoding;
    OnigSyntaxType *iSyntax;
    OnigErrorInfo einfo;

    Data_Get_Struct(self, ORegexp, oregexp);
    Check_Type(options, T_HASH);

    pattern_str = StringValue(pattern);
    rb_iv_set(self, "@pattern", pattern_str);
    rb_iv_set(self, "@options", options);
    pat_ptr = (UChar *)RSTRING(pattern_str)->ptr;
    pat_len = RSTRING(pattern_str)->len;

    rOptions  = rb_hash_aref(options, ID2SYM(rb_intern("options")));
    rEncoding = rb_hash_aref(options, ID2SYM(rb_intern("encoding")));
    rSyntax   = rb_hash_aref(options, ID2SYM(rb_intern("syntax")));
    iOptions  = NUM2INT(rOptions);
    iEncoding = int2encoding(rEncoding);
    iSyntax   = int2syntax(rSyntax);

    /* Re-initialisation: drop the previously compiled pattern. */
    if (oregexp->reg != NULL) {
        onig_free(oregexp->reg);
        oregexp->reg = NULL;
    }

    r = onig_new(&(oregexp->reg), pat_ptr, pat_ptr + pat_len, iOptions, iEncoding, iSyntax, &einfo);
    if (r != ONIG_NORMAL) {
        char s[ONIG_MAX_ERROR_MESSAGE_LEN];
        onig_error_code_to_str((UChar *)s, r, &einfo);
        rb_raise(rb_eArgError, "Oniguruma Error: %s", s);
    }
    return self;
}
|
143
|
+
|
144
|
+
/* can't include re.h, since it conflicts with oniguruma typedefs */
|
145
|
+
struct RMatch {
|
146
|
+
struct RBasic basic;
|
147
|
+
VALUE str;
|
148
|
+
struct re_registers *regs;
|
149
|
+
};
|
150
|
+
#define RMATCH(obj) (R_CAST(RMatch)(obj))
|
151
|
+
void rb_match_busy _((VALUE));
|
152
|
+
|
153
|
+
/*
 * Build a MatchData object from an Oniguruma search result.
 *
 * The group offsets in `region` are copied into freshly allocated
 * re_registers, and the subject string is stored via rb_str_new4. The new
 * object is also recorded in ORegexp's @@last_match class variable. When the
 * pattern has named groups, a Hash of name => group number is attached as
 * @named_captures.
 */
static VALUE oregexp_make_match_data(ORegexp *oregexp, OnigRegion *region, VALUE string_str)
{
    VALUE match_class = rb_const_get(rb_cObject, rb_intern("MatchData"));
    NEWOBJ(match, struct RMatch);
    OBJSETUP(match, match_class, T_MATCH);
    VALUE oregexp_class = rb_const_get(mOniguruma, rb_intern("ORegexp"));
    int nregs = region->num_regs;
    int idx;
    struct callback_packet packet;

    match->str = rb_str_new4(string_str);
    match->regs = ALLOC(struct re_registers);
    match->regs->allocated = nregs;
    match->regs->num_regs = nregs;
    match->regs->beg = ALLOC_N(int, nregs);
    match->regs->end = ALLOC_N(int, nregs);

    idx = 0;
    while (idx < nregs) {
        match->regs->beg[idx] = region->beg[idx];
        match->regs->end[idx] = region->end[idx];
        idx++;
    }

    rb_cv_set(oregexp_class, "@@last_match", (VALUE)match);

    packet.region = region;
    if (onig_number_of_names(oregexp->reg) > 0) {
        packet.hash = rb_hash_new();
        onig_foreach_name(oregexp->reg, name_callback, &packet);
        rb_iv_set((VALUE)match, "@named_captures", packet.hash);
    }
    return (VALUE)match;
}
|
181
|
+
|
182
|
+
/*
|
183
|
+
* call-seq:
|
184
|
+
* rxp.match(str) => matchdata or nil
|
185
|
+
*
|
186
|
+
* Returns a <code>MatchData</code> object describing the match, or
|
187
|
+
* <code>nil</code> if there was no match. This is equivalent to retrieving the
|
188
|
+
* value of the special variable <code>$~</code> following a normal match.
|
189
|
+
*
|
190
|
+
* /(.)(.)(.)/.match("abc")[2] #=> "b"
|
191
|
+
*/
|
192
|
+
static VALUE oregexp_match( VALUE self, VALUE string ) {
|
193
|
+
ORegexp *oregexp;
|
194
|
+
Data_Get_Struct( self, ORegexp, oregexp );
|
195
|
+
|
196
|
+
VALUE string_str = StringValue( string );
|
197
|
+
UChar* str_ptr = RSTRING(string_str)->ptr;
|
198
|
+
int str_len = RSTRING(string_str)->len;
|
199
|
+
|
200
|
+
OnigRegion *region = onig_region_new();
|
201
|
+
int r = onig_search(oregexp->reg, str_ptr, str_ptr + str_len, str_ptr, str_ptr + str_len, region, ONIG_OPTION_NONE);
|
202
|
+
rb_backref_set(Qnil);
|
203
|
+
if (r >= 0) {
|
204
|
+
VALUE matchData = oregexp_make_match_data( oregexp, region, string_str);
|
205
|
+
onig_region_free(region, 1 );
|
206
|
+
rb_backref_set(matchData);
|
207
|
+
rb_match_busy(matchData);
|
208
|
+
return matchData;
|
209
|
+
} else if (r == ONIG_MISMATCH) {
|
210
|
+
onig_region_free(region, 1 );
|
211
|
+
return Qnil;
|
212
|
+
} else {
|
213
|
+
onig_region_free(region, 1 );
|
214
|
+
char s[ONIG_MAX_ERROR_MESSAGE_LEN];
|
215
|
+
onig_error_code_to_str(s, r);
|
216
|
+
rb_raise(rb_eArgError, "Oniguruma Error: %s", s);
|
217
|
+
}
|
218
|
+
|
219
|
+
}
|
220
|
+
|
221
|
+
static const UChar BACKSLASH = 0x5c;
|
222
|
+
|
223
|
+
/* Additional backslash sequences work in substitution strings: \& (last match), \+ (last
|
224
|
+
matched group), \` (string prior to match), \' (string after match), and \\ (a literal
|
225
|
+
backslash). */
|
226
|
+
|
227
|
+
/* scan the replacement text, looking for substitutions (\n) and \escapes. */
|
228
|
+
static VALUE
|
229
|
+
oregexp_append_replacement(pat, src_text, repl_text, region, ret)
|
230
|
+
VALUE pat,
|
231
|
+
src_text,
|
232
|
+
repl_text;
|
233
|
+
OnigRegion * region;
|
234
|
+
VALUE ret;
|
235
|
+
{
|
236
|
+
ORegexp *oregexp;
|
237
|
+
int32_t replIdx = 0, name_pos, name_start, name_end ;
|
238
|
+
int32_t replacementLength = RSTRING(repl_text)->len;
|
239
|
+
UChar *replacementText = RSTRING(repl_text)->ptr;
|
240
|
+
UChar *replacementEnd = replacementText + (replacementLength-1);
|
241
|
+
long numDigits = 0;
|
242
|
+
long groupNum = 0, g_start, g_end;
|
243
|
+
OnigCodePoint digitC;
|
244
|
+
OnigEncoding enc;
|
245
|
+
const UChar * matchText;
|
246
|
+
long matchLen;
|
247
|
+
|
248
|
+
matchText = RSTRING(src_text)->ptr;
|
249
|
+
matchLen = RSTRING(src_text)->len;
|
250
|
+
Data_Get_Struct( pat, ORegexp, oregexp );
|
251
|
+
enc = onig_get_encoding( oregexp->reg );
|
252
|
+
|
253
|
+
while (replIdx < replacementLength) {
|
254
|
+
OnigCodePoint c = ONIGENC_MBC_TO_CODE(enc, replacementText+replIdx, replacementEnd);
|
255
|
+
int c_len =ONIGENC_MBC_ENC_LEN(enc, replacementText+replIdx) ;
|
256
|
+
if( c_len == 0 ) {
|
257
|
+
rb_warn("Strange, for %d enc_len is 0", c);
|
258
|
+
c_len = 1;
|
259
|
+
}
|
260
|
+
replIdx += c_len;
|
261
|
+
if ( c != BACKSLASH) {
|
262
|
+
/* Common case, no substitution, no escaping, */
|
263
|
+
/* just copy the char to the dest buf. */
|
264
|
+
rb_str_buf_cat( ret, replacementText+replIdx-c_len, c_len);
|
265
|
+
continue;
|
266
|
+
}
|
267
|
+
if (replIdx >= replacementLength) {
|
268
|
+
rb_str_buf_cat(ret, replacementText+(replIdx-c_len), c_len);
|
269
|
+
break;
|
270
|
+
}
|
271
|
+
/* Pick up a capture group number if one follows. */
|
272
|
+
numDigits = 0;
|
273
|
+
groupNum = 0;
|
274
|
+
for (;;) {
|
275
|
+
if (replIdx >= replacementLength) {
|
276
|
+
break;
|
277
|
+
}
|
278
|
+
digitC = ONIGENC_MBC_TO_CODE(enc, replacementText+replIdx, replacementEnd);
|
279
|
+
c_len = ONIGENC_MBC_ENC_LEN(enc, replacementText+replIdx) ;
|
280
|
+
if ( ! ONIGENC_IS_CODE_DIGIT(enc, digitC) ) {
|
281
|
+
break;
|
282
|
+
}
|
283
|
+
replIdx += c_len;
|
284
|
+
groupNum=groupNum*10 + (digitC - '0');
|
285
|
+
numDigits++;
|
286
|
+
if (numDigits >= 2) { /* limit 99 groups */
|
287
|
+
break;
|
288
|
+
}
|
289
|
+
}
|
290
|
+
if (numDigits == 0) {
|
291
|
+
/* Additional backslash sequences work in substitution strings: \& (last match), \+ (last
|
292
|
+
matched group), \` (string prior to match), \' (string after match), and \\ (a literal
|
293
|
+
backslash). */
|
294
|
+
int p_len = c_len;
|
295
|
+
c = ONIGENC_MBC_TO_CODE(enc, replacementText+replIdx, replacementEnd);
|
296
|
+
c_len = ONIGENC_MBC_ENC_LEN(enc, replacementText+replIdx) ;
|
297
|
+
switch(c) {
|
298
|
+
case '&' : // matched substring
|
299
|
+
rb_str_buf_cat(ret, matchText+region->beg[0], region->end[0] - region->beg[0]);
|
300
|
+
replIdx += c_len;
|
301
|
+
break;
|
302
|
+
case '`' : // prematch
|
303
|
+
rb_str_buf_cat(ret, matchText, region->beg[0]);
|
304
|
+
replIdx += c_len;
|
305
|
+
break;
|
306
|
+
case '\'': // postmatch
|
307
|
+
rb_str_buf_cat(ret, matchText+region->end[0], matchLen - region->end[0]);
|
308
|
+
replIdx += c_len;
|
309
|
+
break;
|
310
|
+
case '\\': // literal backslash
|
311
|
+
// place single backslash
|
312
|
+
rb_str_buf_cat(ret, replacementText+replIdx, c_len);
|
313
|
+
replIdx += c_len;
|
314
|
+
break;
|
315
|
+
case '+': // last matched group
|
316
|
+
replIdx += c_len;
|
317
|
+
for(groupNum = region->num_regs-1; groupNum > 0; groupNum --) {
|
318
|
+
g_start = region->beg[ groupNum ];
|
319
|
+
g_end = region->end[ groupNum ];
|
320
|
+
if( g_start != -1 ) {
|
321
|
+
rb_str_buf_cat(ret, matchText+g_start, g_end-g_start);
|
322
|
+
break;
|
323
|
+
}
|
324
|
+
}
|
325
|
+
break;
|
326
|
+
case '<': // named group references \<name>
|
327
|
+
name_pos = replIdx+c_len;
|
328
|
+
name_end = name_start = replIdx+c_len;
|
329
|
+
while(name_pos < replacementLength) {
|
330
|
+
c = ONIGENC_MBC_TO_CODE(enc, replacementText+name_pos, replacementEnd);
|
331
|
+
c_len = ONIGENC_MBC_ENC_LEN(enc, replacementText+name_pos) ;
|
332
|
+
name_pos += c_len;
|
333
|
+
if( c == '>') break;
|
334
|
+
if( ONIGENC_IS_CODE_WORD(enc, c) ) {
|
335
|
+
name_end += c_len;
|
336
|
+
} else {
|
337
|
+
break;
|
338
|
+
}
|
339
|
+
}
|
340
|
+
if( c != '>' || name_end == name_start ) {
|
341
|
+
// place backslash and '<'
|
342
|
+
rb_str_buf_cat(ret, replacementText+(replIdx-p_len), p_len+c_len);
|
343
|
+
replIdx += c_len;
|
344
|
+
} else {
|
345
|
+
// lookup for group and subst for that value
|
346
|
+
groupNum = onig_name_to_backref_number( oregexp->reg,
|
347
|
+
replacementText+name_start, replacementText+name_end, region);
|
348
|
+
if( groupNum >= 0 ) {
|
349
|
+
rb_str_buf_cat(ret, matchText+region->beg[groupNum],
|
350
|
+
region->end[groupNum]-region->beg[groupNum]);
|
351
|
+
}
|
352
|
+
replIdx = name_pos;
|
353
|
+
}
|
354
|
+
break;
|
355
|
+
default:
|
356
|
+
rb_str_buf_cat(ret, replacementText+(replIdx-p_len), p_len+c_len);
|
357
|
+
replIdx += c_len;
|
358
|
+
|
359
|
+
}
|
360
|
+
} else {
|
361
|
+
/* Finally, append the capture group data to the destination. */
|
362
|
+
if( groupNum < region->num_regs && region->beg[groupNum] >= 0 ) {
|
363
|
+
rb_str_buf_cat(ret, matchText+region->beg[groupNum], region->end[groupNum]-region->beg[groupNum]);
|
364
|
+
}
|
365
|
+
}
|
366
|
+
}
|
367
|
+
return ret;
|
368
|
+
}
|
369
|
+
|
370
|
+
static inline void
|
371
|
+
str_mod_check(s, p, len)
|
372
|
+
VALUE s;
|
373
|
+
char *p;
|
374
|
+
long len;
|
375
|
+
{
|
376
|
+
if (RSTRING(s)->ptr != p || RSTRING(s)->len != len) {
|
377
|
+
rb_raise(rb_eRuntimeError, "string modified");
|
378
|
+
}
|
379
|
+
}
|
380
|
+
|
381
|
+
static VALUE
|
382
|
+
oregexp_gsub(self, argc, argv, bang, once, region)
|
383
|
+
VALUE self; // pattern
|
384
|
+
int argc; // should be 1 if block given
|
385
|
+
VALUE *argv; // either replacement string
|
386
|
+
int bang;
|
387
|
+
int once;
|
388
|
+
OnigRegion *region;
|
389
|
+
{
|
390
|
+
VALUE repl;
|
391
|
+
long beg,
|
392
|
+
end,
|
393
|
+
len,
|
394
|
+
prev_end;
|
395
|
+
int tainted = 0,
|
396
|
+
iter = 0;
|
397
|
+
|
398
|
+
VALUE buf, curr_repl, block_res;
|
399
|
+
ORegexp *oregexp;
|
400
|
+
OnigEncoding enc;
|
401
|
+
|
402
|
+
if (argc == 1 && rb_block_given_p()) {
|
403
|
+
iter = 1;
|
404
|
+
} else if (argc == 2) {
|
405
|
+
repl = argv[1];
|
406
|
+
Check_Type(repl, T_STRING);
|
407
|
+
if (OBJ_TAINTED(argv[1]))
|
408
|
+
tainted = 1;
|
409
|
+
} else {
|
410
|
+
rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
|
411
|
+
}
|
412
|
+
Data_Get_Struct( self, ORegexp, oregexp );
|
413
|
+
|
414
|
+
VALUE string_str = StringValue( argv[0] );
|
415
|
+
UChar* str_ptr = RSTRING(string_str)->ptr;
|
416
|
+
int str_len = RSTRING(string_str)->len;
|
417
|
+
|
418
|
+
beg = onig_search(oregexp->reg, str_ptr, str_ptr + str_len, str_ptr, str_ptr + str_len, region, ONIG_OPTION_NONE);
|
419
|
+
|
420
|
+
if (beg < 0) {
|
421
|
+
/* no match */
|
422
|
+
if (bang)
|
423
|
+
return Qnil;
|
424
|
+
return rb_str_dup(string_str);
|
425
|
+
}
|
426
|
+
end = 0;
|
427
|
+
buf = rb_str_buf_new(str_len);
|
428
|
+
enc = onig_get_encoding( oregexp->reg );
|
429
|
+
do {
|
430
|
+
prev_end = end;
|
431
|
+
beg = region->beg[0];
|
432
|
+
end = region->end[0];
|
433
|
+
rb_str_buf_cat(buf, str_ptr+prev_end, beg-prev_end);
|
434
|
+
if ( iter ) {
|
435
|
+
VALUE match_data = oregexp_make_match_data( oregexp, region, string_str );
|
436
|
+
rb_backref_set(match_data);
|
437
|
+
rb_match_busy(match_data);
|
438
|
+
block_res = rb_yield( match_data );
|
439
|
+
str_mod_check( string_str, str_ptr, str_len);
|
440
|
+
curr_repl = rb_obj_as_string(block_res);
|
441
|
+
rb_str_append(buf, curr_repl);
|
442
|
+
} else {
|
443
|
+
oregexp_append_replacement(self, string_str, repl, region, buf);
|
444
|
+
}
|
445
|
+
if( once ) break;
|
446
|
+
// find next match
|
447
|
+
if( end == beg) {
|
448
|
+
/*
|
449
|
+
* Always consume at least one character of the input string
|
450
|
+
* in order to prevent infinite loops.
|
451
|
+
*/
|
452
|
+
if( str_len <= end )
|
453
|
+
break;
|
454
|
+
len = ONIGENC_MBC_ENC_LEN(enc, str_ptr + end);
|
455
|
+
rb_str_buf_cat( buf, str_ptr+end, len);
|
456
|
+
end += len;
|
457
|
+
}
|
458
|
+
beg=onig_search(oregexp->reg, str_ptr, str_ptr + str_len,
|
459
|
+
str_ptr+end, str_ptr + str_len,
|
460
|
+
region, ONIG_OPTION_NONE);
|
461
|
+
} while ( beg >= 0);
|
462
|
+
rb_str_buf_cat( buf, str_ptr+end, str_len - end);
|
463
|
+
|
464
|
+
if(tainted)
|
465
|
+
OBJ_INFECT(buf, repl);
|
466
|
+
OBJ_INFECT(buf, string_str);
|
467
|
+
if (bang) {
|
468
|
+
rb_funcall(string_str, rb_intern("replace"), 1, buf);
|
469
|
+
return string_str;
|
470
|
+
} else {
|
471
|
+
return buf;
|
472
|
+
}
|
473
|
+
}
|
474
|
+
|
475
|
+
typedef struct gsub_packet_t {
|
476
|
+
VALUE self; // pattern
|
477
|
+
int argc; // should be 1 if block given
|
478
|
+
VALUE *argv; // either replacement string
|
479
|
+
int bang;
|
480
|
+
int once;
|
481
|
+
OnigRegion *region;
|
482
|
+
} gsub_packet;
|
483
|
+
static VALUE oregexp_packed_gsub( gsub_packet* args ) {
|
484
|
+
return oregexp_gsub(args->self, args->argc, args->argv, args->bang, args->once, args->region);
|
485
|
+
}
|
486
|
+
/*
 * Ensure-clause used with rb_ensure: releases the scratch region whether the
 * protected body returned or raised.
 */
void oregexp_cleanup_region(OnigRegion *region)
{
    const int free_self = 1;

    onig_region_free(region, free_self);
}
|
489
|
+
static VALUE oregexp_safe_gsub(self, argc, argv, bang, once)
|
490
|
+
VALUE self; // pattern
|
491
|
+
int argc; // should be 1 if block given
|
492
|
+
VALUE *argv; // either replacement string
|
493
|
+
int bang;
|
494
|
+
int once;
|
495
|
+
{
|
496
|
+
OnigRegion * region = onig_region_new();
|
497
|
+
gsub_packet call_args = {self, argc, argv, bang, once, region};
|
498
|
+
return rb_ensure( oregexp_packed_gsub, (VALUE)&call_args, oregexp_cleanup_region, (VALUE)region);
|
499
|
+
}
|
500
|
+
|
501
|
+
/**
|
502
|
+
* call-seq:
|
503
|
+
* rxp.gsub(str, replacement)
|
504
|
+
* rxp.gsub(str) {|match_data| ... }
|
505
|
+
*
|
506
|
+
* Returns a copy of _str_ with _all_ occurrences of _rxp_ pattern
|
507
|
+
* replaced with either _replacement_ or the value of the block.
|
508
|
+
*
|
509
|
+
* If a string is used as the replacement, the sequences \1, \2,
|
510
|
+
* and so on may be used to interpolate successive groups in the match.
|
511
|
+
*
|
512
|
+
* In the block form, the current MatchData object is passed in as a
|
513
|
+
* parameter. The value returned by the block will be substituted for
|
514
|
+
* the match on each call.
|
515
|
+
*
|
516
|
+
**/
|
517
|
+
static VALUE oregexp_m_gsub(int argc, VALUE *argv, VALUE self) {
|
518
|
+
return oregexp_safe_gsub(self, argc, argv, 0, 0);
|
519
|
+
}
|
520
|
+
|
521
|
+
/**
|
522
|
+
* call-seq:
|
523
|
+
* rxp.sub(str, replacement)
|
524
|
+
* rxp.sub(str) {|match_data| ... }
|
525
|
+
*
|
526
|
+
* Returns a copy of _str_ with the _first_ occurrence of _rxp_ pattern
|
527
|
+
* replaced with either _replacement_ or the value of the block.
|
528
|
+
*
|
529
|
+
* If a string is used as the replacement, the sequences \1, \2,
|
530
|
+
* and so on may be used to interpolate successive groups in the match.
|
531
|
+
*
|
532
|
+
* In the block form, the current MatchData object is passed in as a
|
533
|
+
* parameter. The value returned by the block will be substituted for
|
534
|
+
* the match on each call.
|
535
|
+
*
|
536
|
+
**/
|
537
|
+
static VALUE oregexp_m_sub(int argc, VALUE *argv, VALUE self) {
|
538
|
+
return oregexp_safe_gsub(self, argc, argv, 0, 1);
|
539
|
+
}
|
540
|
+
|
541
|
+
/**
|
542
|
+
* call-seq:
|
543
|
+
* rxp.gsub!(str, replacement)
|
544
|
+
* rxp.gsub!(str) {|match_data| ... }
|
545
|
+
*
|
546
|
+
* Performs the substitutions of ORegexp#gsub in place, returning
|
547
|
+
* _str_, or _nil_ if no substitutions were performed.
|
548
|
+
*
|
549
|
+
**/
|
550
|
+
static VALUE oregexp_m_gsub_bang(int argc, VALUE *argv, VALUE self) {
|
551
|
+
return oregexp_safe_gsub(self, argc, argv, 1, 0);
|
552
|
+
}
|
553
|
+
|
554
|
+
/**
|
555
|
+
* call-seq:
|
556
|
+
* oregexp.sub!(str, replacement)
|
557
|
+
* oregexp.sub!(str) {|match_data| ... }
|
558
|
+
*
|
559
|
+
* Performs the substitutions of ORegexp#sub in place, returning
|
560
|
+
* _str_, or _nil_ if no substitutions were performed.
|
561
|
+
*
|
562
|
+
**/
|
563
|
+
static VALUE oregexp_m_sub_bang(int argc, VALUE *argv, VALUE self) {
|
564
|
+
return oregexp_safe_gsub(self, argc, argv, 1, 1);
|
565
|
+
}
|
566
|
+
|
567
|
+
static VALUE
|
568
|
+
oregexp_scan(VALUE self, VALUE str, OnigRegion * region)
|
569
|
+
{
|
570
|
+
long beg,
|
571
|
+
len,
|
572
|
+
end;
|
573
|
+
int iter = 0;
|
574
|
+
|
575
|
+
VALUE matches;
|
576
|
+
ORegexp *oregexp;
|
577
|
+
OnigEncoding enc;
|
578
|
+
|
579
|
+
if ( rb_block_given_p()) {
|
580
|
+
iter = 1;
|
581
|
+
}
|
582
|
+
Data_Get_Struct( self, ORegexp, oregexp );
|
583
|
+
|
584
|
+
VALUE string_str = StringValue( str );
|
585
|
+
UChar* str_ptr = RSTRING(string_str)->ptr;
|
586
|
+
int str_len = RSTRING(string_str)->len;
|
587
|
+
beg = onig_search(oregexp->reg, str_ptr, str_ptr + str_len, str_ptr, str_ptr + str_len, region, ONIG_OPTION_NONE);
|
588
|
+
if (beg < 0) {
|
589
|
+
/* no match */
|
590
|
+
return Qnil;
|
591
|
+
}
|
592
|
+
matches = rb_ary_new();
|
593
|
+
enc = onig_get_encoding( oregexp -> reg );
|
594
|
+
do {
|
595
|
+
VALUE match_data = oregexp_make_match_data( oregexp, region, string_str );
|
596
|
+
end = region->end[0];
|
597
|
+
rb_ary_push( matches, match_data );
|
598
|
+
if ( iter )
|
599
|
+
rb_yield( match_data );
|
600
|
+
// find next match
|
601
|
+
if( end == beg) {
|
602
|
+
/*
|
603
|
+
* Always consume at least one character of the input string
|
604
|
+
* in order to prevent infinite loops.
|
605
|
+
*/
|
606
|
+
if( str_len <= end )
|
607
|
+
break;
|
608
|
+
len = ONIGENC_MBC_ENC_LEN(enc, str_ptr + end);
|
609
|
+
end += len;
|
610
|
+
}
|
611
|
+
|
612
|
+
beg=onig_search(oregexp->reg, str_ptr, str_ptr + str_len,
|
613
|
+
str_ptr+end, str_ptr + str_len,
|
614
|
+
region, ONIG_OPTION_NONE);
|
615
|
+
} while ( beg >= 0);
|
616
|
+
|
617
|
+
return matches;
|
618
|
+
}
|
619
|
+
|
620
|
+
struct scan_packet {
|
621
|
+
VALUE self, str;
|
622
|
+
OnigRegion * region;
|
623
|
+
};
|
624
|
+
static VALUE oregexp_packed_scan( struct scan_packet * args) {
|
625
|
+
return oregexp_scan(args->self, args->str, args->region);
|
626
|
+
}
|
627
|
+
/**
|
628
|
+
* call-seq:
|
629
|
+
* rxp.scan(str) # => [matchdata1, matchdata2,...] or nil
|
630
|
+
* rxp.scan(str) {|match_data| ... } # => [matchdata1, matchdata2,...] or nil
|
631
|
+
*
|
632
|
+
* Both forms iterate through _str_, matching the pattern. For each match,
|
633
|
+
* a MatchData object is generated and passed to the block, and
|
634
|
+
* added to the resulting array of MatchData objects.
|
635
|
+
*
|
636
|
+
* If _str_ does not match pattern, _nil_ is returned.
|
637
|
+
*
|
638
|
+
**/
|
639
|
+
static VALUE oregexp_m_scan(VALUE self, VALUE str) {
|
640
|
+
OnigRegion * region = onig_region_new();
|
641
|
+
struct scan_packet call_args = {self, str, region};
|
642
|
+
return rb_ensure( oregexp_packed_scan, (VALUE)&call_args, oregexp_cleanup_region, (VALUE)region);
|
643
|
+
}
|
644
|
+
|
645
|
+
/**
|
646
|
+
* call-seq:
|
647
|
+
* rxp === str => true or false
|
648
|
+
*
|
649
|
+
* Case Equality---Synonym for <code>ORegexp#=~</code> used in case statements.
|
650
|
+
*
|
651
|
+
* a = "HELLO"
|
652
|
+
* case a
|
653
|
+
* when ORegexp.new('^[a-z]*$'); print "Lower case\n"
|
654
|
+
* when ORegexp.new('^[A-Z]*$'); print "Upper case\n"
|
655
|
+
* else; print "Mixed case\n"
|
656
|
+
* end
|
657
|
+
*
|
658
|
+
* <em>produces:</em>
|
659
|
+
*
|
660
|
+
* Upper case
|
661
|
+
*
|
662
|
+
**/
|
663
|
+
|
664
|
+
static VALUE oregexp_m_eqq(VALUE self, VALUE str) {
|
665
|
+
VALUE match;
|
666
|
+
|
667
|
+
if (TYPE(str) != T_STRING) {
|
668
|
+
str = rb_check_string_type(str);
|
669
|
+
if (NIL_P(str)) {
|
670
|
+
return Qfalse;
|
671
|
+
}
|
672
|
+
}
|
673
|
+
StringValue(str);
|
674
|
+
match = oregexp_match(self, str);
|
675
|
+
if (Qnil == match) {
|
676
|
+
return Qfalse;
|
677
|
+
}
|
678
|
+
return Qtrue;
|
679
|
+
}
|
680
|
+
/*
|
681
|
+
* call-seq:
|
682
|
+
* rxp =~ string => int or nil
|
683
|
+
*
|
684
|
+
* Matches <code>rxp</code> against <code>string</code>, returning the offset of the
|
685
|
+
* start of the match or <code>nil</code> if the match failed. Sets $~ to the corresponding
|
686
|
+
* <code>MatchData</code> or <code>nil</code>.
|
687
|
+
*
|
688
|
+
* ORegexp.new( 'SIT' ) =~ "insensitive" #=> nil
|
689
|
+
* ORegexp.new( 'SIT', :options => OPTION_IGNORECASE ) =~ "insensitive" #=> 5
|
690
|
+
**/
|
691
|
+
static VALUE oregexp_match_op(VALUE self, VALUE str) {
|
692
|
+
VALUE ret = oregexp_match(self, str);
|
693
|
+
if(ret == Qnil)
|
694
|
+
return Qnil;
|
695
|
+
return INT2FIX(RMATCH(ret)->regs->beg[0]);
|
696
|
+
}
|
697
|
+
|
698
|
+
void Init_oregexp() {
|
699
|
+
mOniguruma = rb_define_module("Oniguruma");
|
700
|
+
VALUE cORegexp = rb_define_class_under(mOniguruma, "ORegexp", rb_cObject);
|
701
|
+
rb_define_alloc_func(cORegexp, oregexp_allocate);
|
702
|
+
rb_define_method( cORegexp, "initialize", oregexp_initialize, 2 );
|
703
|
+
rb_define_method( cORegexp, "match", oregexp_match, 1 );
|
704
|
+
rb_define_method( cORegexp, "=~", oregexp_match_op, 1 );
|
705
|
+
rb_define_method( cORegexp, "gsub", oregexp_m_gsub, -1 );
|
706
|
+
rb_define_method( cORegexp, "sub", oregexp_m_sub, -1 );
|
707
|
+
rb_define_method( cORegexp, "gsub!", oregexp_m_gsub_bang, -1 );
|
708
|
+
rb_define_method( cORegexp, "sub!", oregexp_m_sub_bang, -1 );
|
709
|
+
rb_define_method( cORegexp, "scan", oregexp_m_scan, 1 );
|
710
|
+
rb_define_method( cORegexp, "===", oregexp_m_eqq, 1 );
|
711
|
+
rb_define_const( mOniguruma, "VERSION", rb_str_new2(onig_version()) );
|
712
|
+
}
|