oniguruma 1.0.1-mswin32

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,712 @@
1
+ #include <ruby.h>
2
+ #include <oniguruma.h>
3
+ /*
4
+ TODO:
5
+ - Add named backreferences.
6
+ */
7
+
8
+ typedef struct _oregexp {
9
+ regex_t * reg;
10
+ } ORegexp;
11
+
12
+ VALUE mOniguruma;
13
+ VALUE nameHash;
14
+
15
+ static void oregexp_free( ORegexp * oregexp) {
16
+ onig_free( oregexp->reg );
17
+ free( oregexp );
18
+ }
19
+
20
+ static VALUE oregexp_allocate( VALUE klass ) {
21
+ ORegexp * oregexp = malloc( sizeof( ORegexp ) );
22
+ oregexp->reg = NULL;
23
+ return Data_Wrap_Struct( klass, 0, oregexp_free, oregexp );
24
+ }
25
+
26
+
27
+ static OnigEncodingType * int2encoding( VALUE v_index ) {
28
+ int index;
29
+ if( ! NIL_P(v_index) ) {
30
+ index = FIX2INT(v_index);
31
+ switch( index ) {
32
+ case 0: return ONIG_ENCODING_ASCII;
33
+ case 1: return ONIG_ENCODING_ISO_8859_1;
34
+ case 2: return ONIG_ENCODING_ISO_8859_2;
35
+ case 3: return ONIG_ENCODING_ISO_8859_3;
36
+ case 4: return ONIG_ENCODING_ISO_8859_4;
37
+ case 5: return ONIG_ENCODING_ISO_8859_5;
38
+ case 6: return ONIG_ENCODING_ISO_8859_6;
39
+ case 7: return ONIG_ENCODING_ISO_8859_7;
40
+ case 8: return ONIG_ENCODING_ISO_8859_8;
41
+ case 9: return ONIG_ENCODING_ISO_8859_9;
42
+ case 10: return ONIG_ENCODING_ISO_8859_10;
43
+ case 11: return ONIG_ENCODING_ISO_8859_11;
44
+ case 12: return ONIG_ENCODING_ISO_8859_11;
45
+ case 13: return ONIG_ENCODING_ISO_8859_13;
46
+ case 14: return ONIG_ENCODING_ISO_8859_14;
47
+ case 15: return ONIG_ENCODING_ISO_8859_15;
48
+ case 16: return ONIG_ENCODING_ISO_8859_16;
49
+ case 17: return ONIG_ENCODING_UTF8;
50
+ case 18: return ONIG_ENCODING_UTF16_BE;
51
+ case 19: return ONIG_ENCODING_UTF16_LE;
52
+ case 20: return ONIG_ENCODING_UTF32_BE;
53
+ case 21: return ONIG_ENCODING_UTF32_LE;
54
+ case 22: return ONIG_ENCODING_EUC_JP;
55
+ case 23: return ONIG_ENCODING_EUC_TW;
56
+ case 24: return ONIG_ENCODING_EUC_KR;
57
+ case 25: return ONIG_ENCODING_EUC_CN;
58
+ case 26: return ONIG_ENCODING_SJIS;
59
+ /*case 27: return ONIG_ENCODING_KOI8;*/
60
+ case 28: return ONIG_ENCODING_KOI8_R;
61
+ #if ONIGURUMA_VERSION_MAJOR == 5
62
+ case 29: return ONIG_ENCODING_CP1251;
63
+ #endif
64
+ case 30: return ONIG_ENCODING_BIG5;
65
+ case 31: return ONIG_ENCODING_GB18030;
66
+ case 32: return ONIG_ENCODING_UNDEF;
67
+ }
68
+ }
69
+ return ONIG_ENCODING_UNDEF;
70
+ }
71
+
72
+ static OnigSyntaxType * int2syntax( VALUE v_index ) {
73
+ int index;
74
+ if( ! NIL_P(v_index) ) {
75
+ index = FIX2INT(v_index);
76
+ switch( index ) {
77
+ case 0: return ONIG_SYNTAX_ASIS;
78
+ case 1: return ONIG_SYNTAX_POSIX_BASIC;
79
+ case 2: return ONIG_SYNTAX_POSIX_EXTENDED;
80
+ case 3: return ONIG_SYNTAX_EMACS;
81
+ case 4: return ONIG_SYNTAX_GREP;
82
+ case 5: return ONIG_SYNTAX_GNU_REGEX;
83
+ case 6: return ONIG_SYNTAX_JAVA;
84
+ case 7: return ONIG_SYNTAX_PERL;
85
+ case 8: return ONIG_SYNTAX_PERL_NG;
86
+ case 9: return ONIG_SYNTAX_RUBY;
87
+ case 10: return ONIG_SYNTAX_DEFAULT;
88
+ }
89
+ }
90
+ return ONIG_SYNTAX_DEFAULT;
91
+ }
92
+
93
+ struct callback_packet {
94
+ VALUE hash;
95
+ OnigRegion * region;
96
+ };
97
+
98
+ static int name_callback(
99
+ const UChar* name,
100
+ const UChar* name_end,
101
+ int ngroup_num,
102
+ int* group_nums,
103
+ regex_t* reg,
104
+ struct callback_packet* arg
105
+ ) {
106
+ int i, gn;
107
+ VALUE nameHash = arg->hash;
108
+
109
+ for (i = 0; i < ngroup_num; i++) {
110
+ gn = group_nums[i];
111
+ rb_hash_aset( nameHash, ID2SYM(rb_intern(name)), INT2FIX( gn ) );
112
+ }
113
+ return 0;
114
+ }
115
+
116
+ static VALUE oregexp_initialize( VALUE self, VALUE pattern, VALUE options ) {
117
+ ORegexp *oregexp;
118
+ Data_Get_Struct( self, ORegexp, oregexp );
119
+
120
+ VALUE pattern_str = StringValue( pattern );
121
+ rb_iv_set( self, "@pattern", pattern_str );
122
+ rb_iv_set( self, "@options", options );
123
+ UChar* pat_ptr = RSTRING(pattern_str)->ptr;
124
+ int pat_len = RSTRING(pattern_str)->len;
125
+ VALUE rOptions = rb_hash_aref( options, ID2SYM( rb_intern( "options" ) ) );
126
+ VALUE rEncoding = rb_hash_aref( options, ID2SYM( rb_intern( "encoding" ) ) );
127
+ VALUE rSyntax = rb_hash_aref( options, ID2SYM( rb_intern( "syntax" ) ) );
128
+ int iOptions = NUM2INT( rOptions );
129
+ OnigEncodingType * iEncoding = int2encoding( rEncoding );
130
+ OnigSyntaxType * iSyntax = int2syntax( rSyntax );
131
+
132
+
133
+ int r;
134
+ OnigErrorInfo einfo;
135
+ r = onig_new(&(oregexp->reg), pat_ptr, pat_ptr + pat_len, iOptions, iEncoding, iSyntax, &einfo);
136
+ if (r != ONIG_NORMAL) {
137
+ char s[ONIG_MAX_ERROR_MESSAGE_LEN];
138
+ onig_error_code_to_str(s, r, &einfo);
139
+ rb_raise(rb_eArgError, "Oniguruma Error: %s", s);
140
+ }
141
+ return self;
142
+ }
143
+
144
+ /* can't include re.h, since it conflicts with oniguruma typedefs */
145
+ struct RMatch {
146
+ struct RBasic basic;
147
+ VALUE str;
148
+ struct re_registers *regs;
149
+ };
150
+ #define RMATCH(obj) (R_CAST(RMatch)(obj))
151
+ void rb_match_busy _((VALUE));
152
+
153
+ static VALUE oregexp_make_match_data(ORegexp * oregexp, OnigRegion * region, VALUE string_str) {
154
+ VALUE rb_cMatch = rb_const_get(rb_cObject, rb_intern("MatchData")) ;
155
+ NEWOBJ(match, struct RMatch);
156
+ OBJSETUP(match, rb_cMatch, T_MATCH);
157
+ VALUE kORegexp = rb_const_get( mOniguruma, rb_intern( "ORegexp" ) ) ;
158
+ int i , count = region->num_regs;
159
+ struct callback_packet packet;
160
+
161
+ match->str = rb_str_new4(string_str);
162
+ match->regs = ALLOC(struct re_registers);
163
+ match->regs->allocated = count;
164
+ match->regs->num_regs = count;
165
+ match->regs->beg = ALLOC_N(int, count);
166
+ match->regs->end = ALLOC_N(int, count);
167
+
168
+ for ( i = 0; i < count; i++){
169
+ match->regs->beg[i] = region->beg[i];
170
+ match->regs->end[i] = region->end[i];
171
+ }
172
+ rb_cv_set( kORegexp, "@@last_match", (VALUE)match );
173
+ packet.region = region;
174
+ if( onig_number_of_names( oregexp->reg ) > 0 ) {
175
+ packet.hash = rb_hash_new();
176
+ onig_foreach_name(oregexp->reg, name_callback, &packet);
177
+ rb_iv_set((VALUE)match, "@named_captures", packet.hash);
178
+ }
179
+ return (VALUE)match;
180
+ }
181
+
182
+ /*
183
+ * call-seq:
184
+ * rxp.match(str) => matchdata or nil
185
+ *
186
+ * Returns a <code>MatchData</code> object describing the match, or
187
+ * <code>nil</code> if there was no match. This is equivalent to retrieving the
188
+ * value of the special variable <code>$~</code> following a normal match.
189
+ *
190
+ * /(.)(.)(.)/.match("abc")[2] #=> "b"
191
+ */
192
+ static VALUE oregexp_match( VALUE self, VALUE string ) {
193
+ ORegexp *oregexp;
194
+ Data_Get_Struct( self, ORegexp, oregexp );
195
+
196
+ VALUE string_str = StringValue( string );
197
+ UChar* str_ptr = RSTRING(string_str)->ptr;
198
+ int str_len = RSTRING(string_str)->len;
199
+
200
+ OnigRegion *region = onig_region_new();
201
+ int r = onig_search(oregexp->reg, str_ptr, str_ptr + str_len, str_ptr, str_ptr + str_len, region, ONIG_OPTION_NONE);
202
+ rb_backref_set(Qnil);
203
+ if (r >= 0) {
204
+ VALUE matchData = oregexp_make_match_data( oregexp, region, string_str);
205
+ onig_region_free(region, 1 );
206
+ rb_backref_set(matchData);
207
+ rb_match_busy(matchData);
208
+ return matchData;
209
+ } else if (r == ONIG_MISMATCH) {
210
+ onig_region_free(region, 1 );
211
+ return Qnil;
212
+ } else {
213
+ onig_region_free(region, 1 );
214
+ char s[ONIG_MAX_ERROR_MESSAGE_LEN];
215
+ onig_error_code_to_str(s, r);
216
+ rb_raise(rb_eArgError, "Oniguruma Error: %s", s);
217
+ }
218
+
219
+ }
220
+
221
+ static const UChar BACKSLASH = 0x5c;
222
+
223
+ /* Additional backslash sequences work in substitution strings: \& (last match), \+ (last
224
+ matched group), \` (string prior to match), \' (string after match), and \\ (a literal
225
+ backslash). */
226
+
227
+ /* scan the replacement text, looking for substitutions (\n) and \escapes. */
228
+ static VALUE
229
+ oregexp_append_replacement(pat, src_text, repl_text, region, ret)
230
+ VALUE pat,
231
+ src_text,
232
+ repl_text;
233
+ OnigRegion * region;
234
+ VALUE ret;
235
+ {
236
+ ORegexp *oregexp;
237
+ int32_t replIdx = 0, name_pos, name_start, name_end ;
238
+ int32_t replacementLength = RSTRING(repl_text)->len;
239
+ UChar *replacementText = RSTRING(repl_text)->ptr;
240
+ UChar *replacementEnd = replacementText + (replacementLength-1);
241
+ long numDigits = 0;
242
+ long groupNum = 0, g_start, g_end;
243
+ OnigCodePoint digitC;
244
+ OnigEncoding enc;
245
+ const UChar * matchText;
246
+ long matchLen;
247
+
248
+ matchText = RSTRING(src_text)->ptr;
249
+ matchLen = RSTRING(src_text)->len;
250
+ Data_Get_Struct( pat, ORegexp, oregexp );
251
+ enc = onig_get_encoding( oregexp->reg );
252
+
253
+ while (replIdx < replacementLength) {
254
+ OnigCodePoint c = ONIGENC_MBC_TO_CODE(enc, replacementText+replIdx, replacementEnd);
255
+ int c_len =ONIGENC_MBC_ENC_LEN(enc, replacementText+replIdx) ;
256
+ if( c_len == 0 ) {
257
+ rb_warn("Strange, for %d enc_len is 0", c);
258
+ c_len = 1;
259
+ }
260
+ replIdx += c_len;
261
+ if ( c != BACKSLASH) {
262
+ /* Common case, no substitution, no escaping, */
263
+ /* just copy the char to the dest buf. */
264
+ rb_str_buf_cat( ret, replacementText+replIdx-c_len, c_len);
265
+ continue;
266
+ }
267
+ if (replIdx >= replacementLength) {
268
+ rb_str_buf_cat(ret, replacementText+(replIdx-c_len), c_len);
269
+ break;
270
+ }
271
+ /* Pick up a capture group number if one follows. */
272
+ numDigits = 0;
273
+ groupNum = 0;
274
+ for (;;) {
275
+ if (replIdx >= replacementLength) {
276
+ break;
277
+ }
278
+ digitC = ONIGENC_MBC_TO_CODE(enc, replacementText+replIdx, replacementEnd);
279
+ c_len = ONIGENC_MBC_ENC_LEN(enc, replacementText+replIdx) ;
280
+ if ( ! ONIGENC_IS_CODE_DIGIT(enc, digitC) ) {
281
+ break;
282
+ }
283
+ replIdx += c_len;
284
+ groupNum=groupNum*10 + (digitC - '0');
285
+ numDigits++;
286
+ if (numDigits >= 2) { /* limit 99 groups */
287
+ break;
288
+ }
289
+ }
290
+ if (numDigits == 0) {
291
+ /* Additional backslash sequences work in substitution strings: \& (last match), \+ (last
292
+ matched group), \` (string prior to match), \' (string after match), and \\ (a literal
293
+ backslash). */
294
+ int p_len = c_len;
295
+ c = ONIGENC_MBC_TO_CODE(enc, replacementText+replIdx, replacementEnd);
296
+ c_len = ONIGENC_MBC_ENC_LEN(enc, replacementText+replIdx) ;
297
+ switch(c) {
298
+ case '&' : // matched substring
299
+ rb_str_buf_cat(ret, matchText+region->beg[0], region->end[0] - region->beg[0]);
300
+ replIdx += c_len;
301
+ break;
302
+ case '`' : // prematch
303
+ rb_str_buf_cat(ret, matchText, region->beg[0]);
304
+ replIdx += c_len;
305
+ break;
306
+ case '\'': // postmatch
307
+ rb_str_buf_cat(ret, matchText+region->end[0], matchLen - region->end[0]);
308
+ replIdx += c_len;
309
+ break;
310
+ case '\\': // literal backslash
311
+ // place single backslash
312
+ rb_str_buf_cat(ret, replacementText+replIdx, c_len);
313
+ replIdx += c_len;
314
+ break;
315
+ case '+': // last matched group
316
+ replIdx += c_len;
317
+ for(groupNum = region->num_regs-1; groupNum > 0; groupNum --) {
318
+ g_start = region->beg[ groupNum ];
319
+ g_end = region->end[ groupNum ];
320
+ if( g_start != -1 ) {
321
+ rb_str_buf_cat(ret, matchText+g_start, g_end-g_start);
322
+ break;
323
+ }
324
+ }
325
+ break;
326
+ case '<': // named group references \<name>
327
+ name_pos = replIdx+c_len;
328
+ name_end = name_start = replIdx+c_len;
329
+ while(name_pos < replacementLength) {
330
+ c = ONIGENC_MBC_TO_CODE(enc, replacementText+name_pos, replacementEnd);
331
+ c_len = ONIGENC_MBC_ENC_LEN(enc, replacementText+name_pos) ;
332
+ name_pos += c_len;
333
+ if( c == '>') break;
334
+ if( ONIGENC_IS_CODE_WORD(enc, c) ) {
335
+ name_end += c_len;
336
+ } else {
337
+ break;
338
+ }
339
+ }
340
+ if( c != '>' || name_end == name_start ) {
341
+ // place backslash and '<'
342
+ rb_str_buf_cat(ret, replacementText+(replIdx-p_len), p_len+c_len);
343
+ replIdx += c_len;
344
+ } else {
345
+ // lookup for group and subst for that value
346
+ groupNum = onig_name_to_backref_number( oregexp->reg,
347
+ replacementText+name_start, replacementText+name_end, region);
348
+ if( groupNum >= 0 ) {
349
+ rb_str_buf_cat(ret, matchText+region->beg[groupNum],
350
+ region->end[groupNum]-region->beg[groupNum]);
351
+ }
352
+ replIdx = name_pos;
353
+ }
354
+ break;
355
+ default:
356
+ rb_str_buf_cat(ret, replacementText+(replIdx-p_len), p_len+c_len);
357
+ replIdx += c_len;
358
+
359
+ }
360
+ } else {
361
+ /* Finally, append the capture group data to the destination. */
362
+ if( groupNum < region->num_regs && region->beg[groupNum] >= 0 ) {
363
+ rb_str_buf_cat(ret, matchText+region->beg[groupNum], region->end[groupNum]-region->beg[groupNum]);
364
+ }
365
+ }
366
+ }
367
+ return ret;
368
+ }
369
+
370
+ static inline void
371
+ str_mod_check(s, p, len)
372
+ VALUE s;
373
+ char *p;
374
+ long len;
375
+ {
376
+ if (RSTRING(s)->ptr != p || RSTRING(s)->len != len) {
377
+ rb_raise(rb_eRuntimeError, "string modified");
378
+ }
379
+ }
380
+
381
+ static VALUE
382
+ oregexp_gsub(self, argc, argv, bang, once, region)
383
+ VALUE self; // pattern
384
+ int argc; // should be 1 if block given
385
+ VALUE *argv; // either replacement string
386
+ int bang;
387
+ int once;
388
+ OnigRegion *region;
389
+ {
390
+ VALUE repl;
391
+ long beg,
392
+ end,
393
+ len,
394
+ prev_end;
395
+ int tainted = 0,
396
+ iter = 0;
397
+
398
+ VALUE buf, curr_repl, block_res;
399
+ ORegexp *oregexp;
400
+ OnigEncoding enc;
401
+
402
+ if (argc == 1 && rb_block_given_p()) {
403
+ iter = 1;
404
+ } else if (argc == 2) {
405
+ repl = argv[1];
406
+ Check_Type(repl, T_STRING);
407
+ if (OBJ_TAINTED(argv[1]))
408
+ tainted = 1;
409
+ } else {
410
+ rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
411
+ }
412
+ Data_Get_Struct( self, ORegexp, oregexp );
413
+
414
+ VALUE string_str = StringValue( argv[0] );
415
+ UChar* str_ptr = RSTRING(string_str)->ptr;
416
+ int str_len = RSTRING(string_str)->len;
417
+
418
+ beg = onig_search(oregexp->reg, str_ptr, str_ptr + str_len, str_ptr, str_ptr + str_len, region, ONIG_OPTION_NONE);
419
+
420
+ if (beg < 0) {
421
+ /* no match */
422
+ if (bang)
423
+ return Qnil;
424
+ return rb_str_dup(string_str);
425
+ }
426
+ end = 0;
427
+ buf = rb_str_buf_new(str_len);
428
+ enc = onig_get_encoding( oregexp->reg );
429
+ do {
430
+ prev_end = end;
431
+ beg = region->beg[0];
432
+ end = region->end[0];
433
+ rb_str_buf_cat(buf, str_ptr+prev_end, beg-prev_end);
434
+ if ( iter ) {
435
+ VALUE match_data = oregexp_make_match_data( oregexp, region, string_str );
436
+ rb_backref_set(match_data);
437
+ rb_match_busy(match_data);
438
+ block_res = rb_yield( match_data );
439
+ str_mod_check( string_str, str_ptr, str_len);
440
+ curr_repl = rb_obj_as_string(block_res);
441
+ rb_str_append(buf, curr_repl);
442
+ } else {
443
+ oregexp_append_replacement(self, string_str, repl, region, buf);
444
+ }
445
+ if( once ) break;
446
+ // find next match
447
+ if( end == beg) {
448
+ /*
449
+ * Always consume at least one character of the input string
450
+ * in order to prevent infinite loops.
451
+ */
452
+ if( str_len <= end )
453
+ break;
454
+ len = ONIGENC_MBC_ENC_LEN(enc, str_ptr + end);
455
+ rb_str_buf_cat( buf, str_ptr+end, len);
456
+ end += len;
457
+ }
458
+ beg=onig_search(oregexp->reg, str_ptr, str_ptr + str_len,
459
+ str_ptr+end, str_ptr + str_len,
460
+ region, ONIG_OPTION_NONE);
461
+ } while ( beg >= 0);
462
+ rb_str_buf_cat( buf, str_ptr+end, str_len - end);
463
+
464
+ if(tainted)
465
+ OBJ_INFECT(buf, repl);
466
+ OBJ_INFECT(buf, string_str);
467
+ if (bang) {
468
+ rb_funcall(string_str, rb_intern("replace"), 1, buf);
469
+ return string_str;
470
+ } else {
471
+ return buf;
472
+ }
473
+ }
474
+
475
+ typedef struct gsub_packet_t {
476
+ VALUE self; // pattern
477
+ int argc; // should be 1 if block given
478
+ VALUE *argv; // either replacement string
479
+ int bang;
480
+ int once;
481
+ OnigRegion *region;
482
+ } gsub_packet;
483
+ static VALUE oregexp_packed_gsub( gsub_packet* args ) {
484
+ return oregexp_gsub(args->self, args->argc, args->argv, args->bang, args->once, args->region);
485
+ }
486
+ void oregexp_cleanup_region(OnigRegion * region){
487
+ onig_region_free(region, 1);
488
+ }
489
+ static VALUE oregexp_safe_gsub(self, argc, argv, bang, once)
490
+ VALUE self; // pattern
491
+ int argc; // should be 1 if block given
492
+ VALUE *argv; // either replacement string
493
+ int bang;
494
+ int once;
495
+ {
496
+ OnigRegion * region = onig_region_new();
497
+ gsub_packet call_args = {self, argc, argv, bang, once, region};
498
+ return rb_ensure( oregexp_packed_gsub, (VALUE)&call_args, oregexp_cleanup_region, (VALUE)region);
499
+ }
500
+
501
+ /**
502
+ * call-seq:
503
+ * rxp.gsub(str, replacement)
504
+ * rxp.gsub(str) {|match_data| ... }
505
+ *
506
+ * Returns a copy of _str_ with _all_ occurrences of _rxp_ pattern
507
+ * replaced with either _replacement_ or the value of the block.
508
+ *
509
+ * If a string is used as the replacement, the sequences \1, \2,
510
+ * and so on may be used to interpolate successive groups in the match.
511
+ *
512
+ * In the block form, the current MatchData object is passed in as a
513
+ * parameter. The value returned by the block will be substituted for
514
+ * the match on each call.
515
+ *
516
+ **/
517
+ static VALUE oregexp_m_gsub(int argc, VALUE *argv, VALUE self) {
518
+ return oregexp_safe_gsub(self, argc, argv, 0, 0);
519
+ }
520
+
521
+ /**
522
+ * call-seq:
523
+ * rxp.sub(str, replacement)
524
+ * rxp.sub(str) {|match_data| ... }
525
+ *
526
+ * Returns a copy of _str_ with the _first_ occurrence of _rxp_ pattern
527
+ * replaced with either _replacement_ or the value of the block.
528
+ *
529
+ * If a string is used as the replacement, the sequences \1, \2,
530
+ * and so on may be used to interpolate successive groups in the match.
531
+ *
532
+ * In the block form, the current MatchData object is passed in as a
533
+ * parameter. The value returned by the block will be substituted for
534
+ * the match on each call.
535
+ *
536
+ **/
537
+ static VALUE oregexp_m_sub(int argc, VALUE *argv, VALUE self) {
538
+ return oregexp_safe_gsub(self, argc, argv, 0, 1);
539
+ }
540
+
541
+ /**
542
+ * call-seq:
543
+ * rxp.gsub!(str, replacement)
544
+ * rxp.gsub!(str) {|match_data| ... }
545
+ *
546
+ * Performs the substitutions of ORegexp#gsub in place, returning
547
+ * _str_, or _nil_ if no substitutions were performed.
548
+ *
549
+ **/
550
+ static VALUE oregexp_m_gsub_bang(int argc, VALUE *argv, VALUE self) {
551
+ return oregexp_safe_gsub(self, argc, argv, 1, 0);
552
+ }
553
+
554
+ /**
555
+ * call-seq:
556
+ * oregexp.sub!(str, replacement)
557
+ * oregexp.sub!(str) {|match_data| ... }
558
+ *
559
+ * Performs the substitutions of ORegexp#sub in place, returning
560
+ * _str_, or _nil_ if no substitutions were performed.
561
+ *
562
+ **/
563
+ static VALUE oregexp_m_sub_bang(int argc, VALUE *argv, VALUE self) {
564
+ return oregexp_safe_gsub(self, argc, argv, 1, 1);
565
+ }
566
+
567
+ static VALUE
568
+ oregexp_scan(VALUE self, VALUE str, OnigRegion * region)
569
+ {
570
+ long beg,
571
+ len,
572
+ end;
573
+ int iter = 0;
574
+
575
+ VALUE matches;
576
+ ORegexp *oregexp;
577
+ OnigEncoding enc;
578
+
579
+ if ( rb_block_given_p()) {
580
+ iter = 1;
581
+ }
582
+ Data_Get_Struct( self, ORegexp, oregexp );
583
+
584
+ VALUE string_str = StringValue( str );
585
+ UChar* str_ptr = RSTRING(string_str)->ptr;
586
+ int str_len = RSTRING(string_str)->len;
587
+ beg = onig_search(oregexp->reg, str_ptr, str_ptr + str_len, str_ptr, str_ptr + str_len, region, ONIG_OPTION_NONE);
588
+ if (beg < 0) {
589
+ /* no match */
590
+ return Qnil;
591
+ }
592
+ matches = rb_ary_new();
593
+ enc = onig_get_encoding( oregexp -> reg );
594
+ do {
595
+ VALUE match_data = oregexp_make_match_data( oregexp, region, string_str );
596
+ end = region->end[0];
597
+ rb_ary_push( matches, match_data );
598
+ if ( iter )
599
+ rb_yield( match_data );
600
+ // find next match
601
+ if( end == beg) {
602
+ /*
603
+ * Always consume at least one character of the input string
604
+ * in order to prevent infinite loops.
605
+ */
606
+ if( str_len <= end )
607
+ break;
608
+ len = ONIGENC_MBC_ENC_LEN(enc, str_ptr + end);
609
+ end += len;
610
+ }
611
+
612
+ beg=onig_search(oregexp->reg, str_ptr, str_ptr + str_len,
613
+ str_ptr+end, str_ptr + str_len,
614
+ region, ONIG_OPTION_NONE);
615
+ } while ( beg >= 0);
616
+
617
+ return matches;
618
+ }
619
+
620
+ struct scan_packet {
621
+ VALUE self, str;
622
+ OnigRegion * region;
623
+ };
624
+ static VALUE oregexp_packed_scan( struct scan_packet * args) {
625
+ return oregexp_scan(args->self, args->str, args->region);
626
+ }
627
+ /**
628
+ * call-seq:
629
+ * rxp.scan(str) # => [matchdata1, matchdata2,...] or nil
630
+ * rxp.scan(str) {|match_data| ... } # => [matchdata1, matchdata2,...] or nil
631
+ *
632
+ * Both forms iterate through _str_, matching the pattern. For each match,
633
+ * a MatchData object is generated and passed to the block, and
634
+ * added to the resulting array of MatchData objects.
635
+ *
636
+ * If _str_ does not match pattern, _nil_ is returned.
637
+ *
638
+ **/
639
+ static VALUE oregexp_m_scan(VALUE self, VALUE str) {
640
+ OnigRegion * region = onig_region_new();
641
+ struct scan_packet call_args = {self, str, region};
642
+ return rb_ensure( oregexp_packed_scan, (VALUE)&call_args, oregexp_cleanup_region, (VALUE)region);
643
+ }
644
+
645
+ /**
646
+ * call-seq:
647
+ * rxp === str => true or false
648
+ *
649
+ * Case Equality---Synonym for <code>ORegexp#=~</code> used in case statements.
650
+ *
651
+ * a = "HELLO"
652
+ * case a
653
+ * when ORegexp.new('^[a-z]*$'); print "Lower case\n"
654
+ * when ORegexp.new('^[A-Z]*$'); print "Upper case\n"
655
+ * else; print "Mixed case\n"
656
+ * end
657
+ *
658
+ * <em>produces:</em>
659
+ *
660
+ * Upper case
661
+ *
662
+ **/
663
+
664
+ static VALUE oregexp_m_eqq(VALUE self, VALUE str) {
665
+ VALUE match;
666
+
667
+ if (TYPE(str) != T_STRING) {
668
+ str = rb_check_string_type(str);
669
+ if (NIL_P(str)) {
670
+ return Qfalse;
671
+ }
672
+ }
673
+ StringValue(str);
674
+ match = oregexp_match(self, str);
675
+ if (Qnil == match) {
676
+ return Qfalse;
677
+ }
678
+ return Qtrue;
679
+ }
680
+ /*
681
+ * call-seq:
682
+ * rxp =~ string => int or nil
683
+ *
684
+ * Matches <code>rxp</code> against <code>string</code>, returning the offset of the
685
+ * start of the match or <code>nil</code> if the match failed. Sets $~ to the corresponding
686
+ * <code>MatchData</code> or <code>nil</code>.
687
+ *
688
+ * ORegexp.new( 'SIT' ) =~ "insensitive" #=> nil
689
+ * ORegexp.new( 'SIT', :options => OPTION_IGNORECASE ) =~ "insensitive" #=> 5
690
+ **/
691
+ static VALUE oregexp_match_op(VALUE self, VALUE str) {
692
+ VALUE ret = oregexp_match(self, str);
693
+ if(ret == Qnil)
694
+ return Qnil;
695
+ return INT2FIX(RMATCH(ret)->regs->beg[0]);
696
+ }
697
+
698
+ void Init_oregexp() {
699
+ mOniguruma = rb_define_module("Oniguruma");
700
+ VALUE cORegexp = rb_define_class_under(mOniguruma, "ORegexp", rb_cObject);
701
+ rb_define_alloc_func(cORegexp, oregexp_allocate);
702
+ rb_define_method( cORegexp, "initialize", oregexp_initialize, 2 );
703
+ rb_define_method( cORegexp, "match", oregexp_match, 1 );
704
+ rb_define_method( cORegexp, "=~", oregexp_match_op, 1 );
705
+ rb_define_method( cORegexp, "gsub", oregexp_m_gsub, -1 );
706
+ rb_define_method( cORegexp, "sub", oregexp_m_sub, -1 );
707
+ rb_define_method( cORegexp, "gsub!", oregexp_m_gsub_bang, -1 );
708
+ rb_define_method( cORegexp, "sub!", oregexp_m_sub_bang, -1 );
709
+ rb_define_method( cORegexp, "scan", oregexp_m_scan, 1 );
710
+ rb_define_method( cORegexp, "===", oregexp_m_eqq, 1 );
711
+ rb_define_const( mOniguruma, "VERSION", rb_str_new2(onig_version()) );
712
+ }