jasherai-oniguruma 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/extconf.rb ADDED
@@ -0,0 +1,9 @@
1
# Build configuration for the "oregexp" C extension.
#
# Looks for the Oniguruma library (its location can be supplied with
# --with-onig-dir=...) and, when found, writes the extension Makefile.
require 'mkmf'

dir_config("onig")

unless have_library("onig")
  # The extension cannot be linked without libonig: report on stderr and
  # exit with a failure status.
  abort "Cannot find lib-oniguruma. Set location with e.g. --with-onig-dir=/opt/local"
end

# Add -Wall to the flags mkmf already computed instead of replacing them,
# so the options inherited from the Ruby build are kept.
$CFLAGS = "#{$CFLAGS} -Wall"
create_makefile("oregexp")
data/ext/oregexp.c ADDED
@@ -0,0 +1,739 @@
1
+ #include <ruby.h>
2
+ #include <oniguruma.h>
3
+ /*
4
+ TODO:
5
+ - Complete oregexp_match with range parameter.
6
+ */
7
+
8
+ typedef struct _oregexp {
9
+ regex_t * reg;
10
+ } ORegexp;
11
+
12
+ VALUE mOniguruma;
13
+ VALUE nameHash;
14
+
15
+ static void oregexp_free( ORegexp * oregexp) {
16
+ onig_free( oregexp->reg );
17
+ free( oregexp );
18
+ }
19
+
20
+ static VALUE oregexp_allocate( VALUE klass ) {
21
+ ORegexp * oregexp = malloc( sizeof( ORegexp ) );
22
+ oregexp->reg = NULL;
23
+ return Data_Wrap_Struct( klass, 0, oregexp_free, oregexp );
24
+ }
25
+
26
+
27
+ static OnigEncodingType * int2encoding( VALUE v_index ) {
28
+ int index;
29
+ if( ! NIL_P(v_index) ) {
30
+ index = FIX2INT(v_index);
31
+ switch( index ) {
32
+ case 0: return ONIG_ENCODING_ASCII;
33
+ case 1: return ONIG_ENCODING_ISO_8859_1;
34
+ case 2: return ONIG_ENCODING_ISO_8859_2;
35
+ case 3: return ONIG_ENCODING_ISO_8859_3;
36
+ case 4: return ONIG_ENCODING_ISO_8859_4;
37
+ case 5: return ONIG_ENCODING_ISO_8859_5;
38
+ case 6: return ONIG_ENCODING_ISO_8859_6;
39
+ case 7: return ONIG_ENCODING_ISO_8859_7;
40
+ case 8: return ONIG_ENCODING_ISO_8859_8;
41
+ case 9: return ONIG_ENCODING_ISO_8859_9;
42
+ case 10: return ONIG_ENCODING_ISO_8859_10;
43
+ case 11: return ONIG_ENCODING_ISO_8859_11;
44
+ case 12: return ONIG_ENCODING_ISO_8859_11;
45
+ case 13: return ONIG_ENCODING_ISO_8859_13;
46
+ case 14: return ONIG_ENCODING_ISO_8859_14;
47
+ case 15: return ONIG_ENCODING_ISO_8859_15;
48
+ case 16: return ONIG_ENCODING_ISO_8859_16;
49
+ case 17: return ONIG_ENCODING_UTF8;
50
+ case 22: return ONIG_ENCODING_EUC_JP;
51
+ case 23: return ONIG_ENCODING_EUC_TW;
52
+ case 24: return ONIG_ENCODING_EUC_KR;
53
+ case 25: return ONIG_ENCODING_EUC_CN;
54
+ case 26: return ONIG_ENCODING_SJIS;
55
+ /*case 27: return ONIG_ENCODING_KOI8;*/
56
+ case 28: return ONIG_ENCODING_KOI8_R;
57
+ #if ONIGURUMA_VERSION_MAJOR == 5
58
+ case 29: return ONIG_ENCODING_CP1251;
59
+ #endif
60
+ case 30: return ONIG_ENCODING_BIG5;
61
+ case 32: return ONIG_ENCODING_UNDEF;
62
+ }
63
+ }
64
+ return ONIG_ENCODING_UNDEF;
65
+ }
66
+
67
+ static OnigSyntaxType * int2syntax( VALUE v_index ) {
68
+ int index;
69
+ if( ! NIL_P(v_index) ) {
70
+ index = FIX2INT(v_index);
71
+ switch( index ) {
72
+ case 1: return ONIG_SYNTAX_POSIX_BASIC;
73
+ case 2: return ONIG_SYNTAX_POSIX_EXTENDED;
74
+ case 3: return ONIG_SYNTAX_EMACS;
75
+ case 4: return ONIG_SYNTAX_GREP;
76
+ case 5: return ONIG_SYNTAX_GNU_REGEX;
77
+ case 6: return ONIG_SYNTAX_JAVA;
78
+ case 7: return ONIG_SYNTAX_PERL;
79
+ case 9: return ONIG_SYNTAX_RUBY;
80
+ case 10: return ONIG_SYNTAX_DEFAULT;
81
+ }
82
+ }
83
+ return ONIG_SYNTAX_DEFAULT;
84
+ }
85
+
86
+ struct callback_packet {
87
+ VALUE hash;
88
+ OnigRegion * region;
89
+ };
90
+
91
+ static int name_callback(
92
+ const UChar* name,
93
+ const UChar* name_end,
94
+ int ngroup_num,
95
+ int* group_nums,
96
+ regex_t* reg,
97
+ struct callback_packet* arg
98
+ ) {
99
+ int i, gn;
100
+ VALUE nameHash = arg->hash;
101
+
102
+ for (i = 0; i < ngroup_num; i++) {
103
+ gn = group_nums[i];
104
+ rb_hash_aset( nameHash, ID2SYM(rb_intern(name)), INT2FIX( gn ) );
105
+ }
106
+ return 0;
107
+ }
108
+
109
+ static VALUE oregexp_initialize( VALUE self, VALUE pattern, VALUE options ) {
110
+ ORegexp *oregexp;
111
+ Data_Get_Struct( self, ORegexp, oregexp );
112
+
113
+ VALUE pattern_str = StringValue( pattern );
114
+ rb_iv_set( self, "@pattern", pattern_str );
115
+ rb_iv_set( self, "@options", options );
116
+ UChar* pat_ptr = RSTRING(pattern_str)->ptr;
117
+ int pat_len = RSTRING(pattern_str)->len;
118
+ VALUE rOptions = rb_hash_aref( options, ID2SYM( rb_intern( "options" ) ) );
119
+ VALUE rEncoding = rb_hash_aref( options, ID2SYM( rb_intern( "encoding" ) ) );
120
+ VALUE rSyntax = rb_hash_aref( options, ID2SYM( rb_intern( "syntax" ) ) );
121
+ int iOptions = NUM2INT( rOptions );
122
+ OnigEncodingType * iEncoding = int2encoding( rEncoding );
123
+ OnigSyntaxType * iSyntax = int2syntax( rSyntax );
124
+
125
+
126
+ int r;
127
+ OnigErrorInfo einfo;
128
+ r = onig_new(&(oregexp->reg), pat_ptr, pat_ptr + pat_len, iOptions, iEncoding, iSyntax, &einfo);
129
+ if (r != ONIG_NORMAL) {
130
+ char s[ONIG_MAX_ERROR_MESSAGE_LEN];
131
+ onig_error_code_to_str(s, r, &einfo);
132
+ rb_raise(rb_eArgError, "Oniguruma Error: %s", s);
133
+ }
134
+ return self;
135
+ }
136
+
137
+ /* can't include re.h, since it conflicts with oniguruma typedefs */
138
+ struct RMatch {
139
+ struct RBasic basic;
140
+ VALUE str;
141
+ struct re_registers *regs;
142
+ };
143
+ #define RMATCH(obj) (R_CAST(RMatch)(obj))
144
+ void rb_match_busy _((VALUE));
145
+
146
+ static VALUE oregexp_make_match_data(ORegexp * oregexp, OnigRegion * region, VALUE string_str) {
147
+ VALUE rb_cMatch = rb_const_get(rb_cObject, rb_intern("MatchData")) ;
148
+ NEWOBJ(match, struct RMatch);
149
+ OBJSETUP(match, rb_cMatch, T_MATCH);
150
+ VALUE kORegexp = rb_const_get( mOniguruma, rb_intern( "ORegexp" ) ) ;
151
+ int i , count = region->num_regs;
152
+ struct callback_packet packet;
153
+
154
+ match->str = rb_str_new4(string_str);
155
+ match->regs = ALLOC(struct re_registers);
156
+ match->regs->allocated = count;
157
+ match->regs->num_regs = count;
158
+ match->regs->beg = ALLOC_N(int, count);
159
+ match->regs->end = ALLOC_N(int, count);
160
+
161
+ for ( i = 0; i < count; i++){
162
+ match->regs->beg[i] = region->beg[i];
163
+ match->regs->end[i] = region->end[i];
164
+ }
165
+ rb_cv_set( kORegexp, "@@last_match", (VALUE)match );
166
+ packet.region = region;
167
+ if( onig_number_of_names( oregexp->reg ) > 0 ) {
168
+ packet.hash = rb_hash_new();
169
+ onig_foreach_name(oregexp->reg, name_callback, &packet);
170
+ rb_iv_set((VALUE)match, "@named_captures", packet.hash);
171
+ }
172
+ return (VALUE)match;
173
+ }
174
+
175
+ /**
176
+ * call-seq:
177
+ * rxp.match(str) => matchdata or nil
178
+ * rxp.match(str, begin, end) => matchdata or nil
179
+ *
180
+ * Returns a <code>MatchData</code> object describing the match, or
181
+ * <code>nil</code> if there was no match. This is equivalent to retrieving the
182
+ * value of the special variable <code>$~</code> following a normal match.
183
+ *
184
+ * ORegexp.new('(.)(.)(.)').match("abc")[2] #=> "b"
185
+ *
186
+ * The second form allows to perform the match in a region
187
+ * defined by <code>begin</code> and <code>end</code> while
188
+ * still taking into account look-behinds and look-forwards.
189
+ *
190
+ * ORegexp.new('1*2*').match('11221122').offset => [4,8]
191
+ * ORegexp.new('(?<=2)1*2*').match('11221122').offset => [4,8]
192
+ *
193
+ * Compare with:
194
+ *
195
+ * ORegexp.new('(?<=2)1*2*').match('11221122'[4..-1]) => nil
196
+ */
197
+ static VALUE oregexp_match( int argc, VALUE * argv, VALUE self ) {
198
+ ORegexp *oregexp;
199
+ Data_Get_Struct( self, ORegexp, oregexp );
200
+
201
+
202
+ if ( argc == 0 || argc > 2) {
203
+ rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
204
+ exit;
205
+ }
206
+
207
+ VALUE string_str = StringValue( argv[0] );
208
+ UChar* str_ptr = RSTRING(string_str)->ptr;
209
+ int str_len = RSTRING(string_str)->len;
210
+
211
+ int begin = 0;
212
+ int end = str_len;
213
+
214
+ if (argc > 1 ) {
215
+ begin = NUM2INT( argv[1] );
216
+ }
217
+ // if (argc > 2) {
218
+ // end = NUM2INT( argv[2] );
219
+ // }
220
+
221
+
222
+ OnigRegion *region = onig_region_new();
223
+ int r = onig_search(oregexp->reg, str_ptr, str_ptr + str_len, str_ptr + begin, str_ptr + end, region, ONIG_OPTION_NONE);
224
+ rb_backref_set(Qnil);
225
+ if (r >= 0) {
226
+ VALUE matchData = oregexp_make_match_data( oregexp, region, string_str);
227
+ onig_region_free(region, 1 );
228
+ rb_backref_set(matchData);
229
+ rb_match_busy(matchData);
230
+ return matchData;
231
+ } else if (r == ONIG_MISMATCH) {
232
+ onig_region_free(region, 1 );
233
+ return Qnil;
234
+ } else {
235
+ onig_region_free(region, 1 );
236
+ char s[ONIG_MAX_ERROR_MESSAGE_LEN];
237
+ onig_error_code_to_str(s, r);
238
+ rb_raise(rb_eArgError, "Oniguruma Error: %s", s);
239
+ }
240
+
241
+ }
242
+
243
+ static const UChar BACKSLASH = 0x5c;
244
+
245
+ /* Additional backslash sequences work in substitution strings: \& (last match), \+ (last
246
+ matched group), \` (string prior to match), \' (string after match), and \\ (a literal
247
+ backslash). */
248
+
249
+ /* scan the replacement text, looking for substitutions (\n) and \escapes. */
250
+ static VALUE
251
+ oregexp_append_replacement(pat, src_text, repl_text, region, ret)
252
+ VALUE pat,
253
+ src_text,
254
+ repl_text;
255
+ OnigRegion * region;
256
+ VALUE ret;
257
+ {
258
+ ORegexp *oregexp;
259
+ int32_t replIdx = 0, name_pos, name_start, name_end ;
260
+ int32_t replacementLength = RSTRING(repl_text)->len;
261
+ UChar *replacementText = RSTRING(repl_text)->ptr;
262
+ UChar *replacementEnd = replacementText + (replacementLength-1);
263
+ long numDigits = 0;
264
+ long groupNum = 0, g_start, g_end;
265
+ OnigCodePoint digitC;
266
+ OnigEncoding enc;
267
+ const UChar * matchText;
268
+ long matchLen;
269
+
270
+ matchText = RSTRING(src_text)->ptr;
271
+ matchLen = RSTRING(src_text)->len;
272
+ Data_Get_Struct( pat, ORegexp, oregexp );
273
+ enc = onig_get_encoding( oregexp->reg );
274
+
275
+ while (replIdx < replacementLength) {
276
+ OnigCodePoint c = ONIGENC_MBC_TO_CODE(enc, replacementText+replIdx, replacementEnd);
277
+ int c_len =ONIGENC_MBC_ENC_LEN(enc, replacementText+replIdx) ;
278
+ if( c_len == 0 ) {
279
+ rb_warn("Strange, for %d enc_len is 0", c);
280
+ c_len = 1;
281
+ }
282
+ replIdx += c_len;
283
+ if ( c != BACKSLASH) {
284
+ /* Common case, no substitution, no escaping, */
285
+ /* just copy the char to the dest buf. */
286
+ rb_str_buf_cat( ret, replacementText+replIdx-c_len, c_len);
287
+ continue;
288
+ }
289
+ if (replIdx >= replacementLength) {
290
+ rb_str_buf_cat(ret, replacementText+(replIdx-c_len), c_len);
291
+ break;
292
+ }
293
+ /* Pick up a capture group number if one follows. */
294
+ numDigits = 0;
295
+ groupNum = 0;
296
+ for (;;) {
297
+ if (replIdx >= replacementLength) {
298
+ break;
299
+ }
300
+ digitC = ONIGENC_MBC_TO_CODE(enc, replacementText+replIdx, replacementEnd);
301
+ c_len = ONIGENC_MBC_ENC_LEN(enc, replacementText+replIdx) ;
302
+ if ( ! ONIGENC_IS_CODE_DIGIT(enc, digitC) ) {
303
+ break;
304
+ }
305
+ replIdx += c_len;
306
+ groupNum=groupNum*10 + (digitC - '0');
307
+ numDigits++;
308
+ if (numDigits >= 2) { /* limit 99 groups */
309
+ break;
310
+ }
311
+ }
312
+ if (numDigits == 0) {
313
+ /* Additional backslash sequences work in substitution strings: \& (last match), \+ (last
314
+ matched group), \` (string prior to match), \' (string after match), and \\ (a literal
315
+ backslash). */
316
+ int p_len = c_len;
317
+ c = ONIGENC_MBC_TO_CODE(enc, replacementText+replIdx, replacementEnd);
318
+ c_len = ONIGENC_MBC_ENC_LEN(enc, replacementText+replIdx) ;
319
+ switch(c) {
320
+ case '&' : // matched substring
321
+ rb_str_buf_cat(ret, matchText+region->beg[0], region->end[0] - region->beg[0]);
322
+ replIdx += c_len;
323
+ break;
324
+ case '`' : // prematch
325
+ rb_str_buf_cat(ret, matchText, region->beg[0]);
326
+ replIdx += c_len;
327
+ break;
328
+ case '\'': // postmatch
329
+ rb_str_buf_cat(ret, matchText+region->end[0], matchLen - region->end[0]);
330
+ replIdx += c_len;
331
+ break;
332
+ case '\\': // literal backslash
333
+ // place single backslash
334
+ rb_str_buf_cat(ret, replacementText+replIdx, c_len);
335
+ replIdx += c_len;
336
+ break;
337
+ case '+': // last matched group
338
+ replIdx += c_len;
339
+ for(groupNum = region->num_regs-1; groupNum > 0; groupNum --) {
340
+ g_start = region->beg[ groupNum ];
341
+ g_end = region->end[ groupNum ];
342
+ if( g_start != -1 ) {
343
+ rb_str_buf_cat(ret, matchText+g_start, g_end-g_start);
344
+ break;
345
+ }
346
+ }
347
+ break;
348
+ case '<': // named group references \<name>
349
+ name_pos = replIdx+c_len;
350
+ name_end = name_start = replIdx+c_len;
351
+ while(name_pos < replacementLength) {
352
+ c = ONIGENC_MBC_TO_CODE(enc, replacementText+name_pos, replacementEnd);
353
+ c_len = ONIGENC_MBC_ENC_LEN(enc, replacementText+name_pos) ;
354
+ name_pos += c_len;
355
+ if( c == '>') break;
356
+ if( ONIGENC_IS_CODE_WORD(enc, c) ) {
357
+ name_end += c_len;
358
+ } else {
359
+ break;
360
+ }
361
+ }
362
+ if( c != '>' || name_end == name_start ) {
363
+ // place backslash and '<'
364
+ rb_str_buf_cat(ret, replacementText+(replIdx-p_len), p_len+c_len);
365
+ replIdx += c_len;
366
+ } else {
367
+ // lookup for group and subst for that value
368
+ groupNum = onig_name_to_backref_number( oregexp->reg,
369
+ replacementText+name_start, replacementText+name_end, region);
370
+ if( groupNum >= 0 ) {
371
+ rb_str_buf_cat(ret, matchText+region->beg[groupNum],
372
+ region->end[groupNum]-region->beg[groupNum]);
373
+ }
374
+ replIdx = name_pos;
375
+ }
376
+ break;
377
+ default:
378
+ rb_str_buf_cat(ret, replacementText+(replIdx-p_len), p_len+c_len);
379
+ replIdx += c_len;
380
+
381
+ }
382
+ } else {
383
+ /* Finally, append the capture group data to the destination. */
384
+ if( groupNum < region->num_regs && region->beg[groupNum] >= 0 ) {
385
+ rb_str_buf_cat(ret, matchText+region->beg[groupNum], region->end[groupNum]-region->beg[groupNum]);
386
+ }
387
+ }
388
+ }
389
+ return ret;
390
+ }
391
+
392
+ static inline void
393
+ str_mod_check(s, p, len)
394
+ VALUE s;
395
+ char *p;
396
+ long len;
397
+ {
398
+ if (RSTRING(s)->ptr != p || RSTRING(s)->len != len) {
399
+ rb_raise(rb_eRuntimeError, "string modified");
400
+ }
401
+ }
402
+
403
+ static VALUE
404
+ oregexp_gsub(self, argc, argv, bang, once, region)
405
+ VALUE self; // pattern
406
+ int argc; // should be 1 if block given
407
+ VALUE *argv; // either replacement string
408
+ int bang;
409
+ int once;
410
+ OnigRegion *region;
411
+ {
412
+ VALUE repl;
413
+ long beg,
414
+ end,
415
+ len,
416
+ prev_end;
417
+ int tainted = 0,
418
+ iter = 0;
419
+
420
+ VALUE buf, curr_repl, block_res;
421
+ ORegexp *oregexp;
422
+ OnigEncoding enc;
423
+
424
+ if (argc == 1 && rb_block_given_p()) {
425
+ iter = 1;
426
+ } else if (argc == 2) {
427
+ repl = argv[1];
428
+ Check_Type(repl, T_STRING);
429
+ if (OBJ_TAINTED(argv[1]))
430
+ tainted = 1;
431
+ } else {
432
+ rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
433
+ }
434
+ Data_Get_Struct( self, ORegexp, oregexp );
435
+
436
+ VALUE string_str = StringValue( argv[0] );
437
+ UChar* str_ptr = RSTRING(string_str)->ptr;
438
+ int str_len = RSTRING(string_str)->len;
439
+
440
+ beg = onig_search(oregexp->reg, str_ptr, str_ptr + str_len, str_ptr, str_ptr + str_len, region, ONIG_OPTION_NONE);
441
+
442
+ if (beg < 0) {
443
+ /* no match */
444
+ if (bang)
445
+ return Qnil;
446
+ return rb_str_dup(string_str);
447
+ }
448
+ end = 0;
449
+ buf = rb_str_buf_new(str_len);
450
+ enc = onig_get_encoding( oregexp->reg );
451
+ do {
452
+ prev_end = end;
453
+ beg = region->beg[0];
454
+ end = region->end[0];
455
+ rb_str_buf_cat(buf, str_ptr+prev_end, beg-prev_end);
456
+ if ( iter ) {
457
+ VALUE match_data = oregexp_make_match_data( oregexp, region, string_str );
458
+ rb_backref_set(match_data);
459
+ rb_match_busy(match_data);
460
+ block_res = rb_yield( match_data );
461
+ str_mod_check( string_str, str_ptr, str_len);
462
+ curr_repl = rb_obj_as_string(block_res);
463
+ rb_str_append(buf, curr_repl);
464
+ } else {
465
+ oregexp_append_replacement(self, string_str, repl, region, buf);
466
+ }
467
+ if( once ) break;
468
+ // find next match
469
+ if( end == beg) {
470
+ /*
471
+ * Always consume at least one character of the input string
472
+ * in order to prevent infinite loops.
473
+ */
474
+ if( str_len <= end )
475
+ break;
476
+ len = ONIGENC_MBC_ENC_LEN(enc, str_ptr + end);
477
+ rb_str_buf_cat( buf, str_ptr+end, len);
478
+ end += len;
479
+ }
480
+ beg=onig_search(oregexp->reg, str_ptr, str_ptr + str_len,
481
+ str_ptr+end, str_ptr + str_len,
482
+ region, ONIG_OPTION_NONE);
483
+ } while ( beg >= 0);
484
+ rb_str_buf_cat( buf, str_ptr+end, str_len - end);
485
+
486
+ if(tainted)
487
+ OBJ_INFECT(buf, repl);
488
+ OBJ_INFECT(buf, string_str);
489
+ if (bang) {
490
+ rb_funcall(string_str, rb_intern("replace"), 1, buf);
491
+ return string_str;
492
+ } else {
493
+ return buf;
494
+ }
495
+ }
496
+
497
+ typedef struct gsub_packet_t {
498
+ VALUE self; // pattern
499
+ int argc; // should be 1 if block given
500
+ VALUE *argv; // either replacement string
501
+ int bang;
502
+ int once;
503
+ OnigRegion *region;
504
+ } gsub_packet;
505
+ static VALUE oregexp_packed_gsub( gsub_packet* args ) {
506
+ return oregexp_gsub(args->self, args->argc, args->argv, args->bang, args->once, args->region);
507
+ }
508
+ void oregexp_cleanup_region(OnigRegion * region){
509
+ onig_region_free(region, 1);
510
+ }
511
+ static VALUE oregexp_safe_gsub(self, argc, argv, bang, once)
512
+ VALUE self; // pattern
513
+ int argc; // should be 1 if block given
514
+ VALUE *argv; // either replacement string
515
+ int bang;
516
+ int once;
517
+ {
518
+ OnigRegion * region = onig_region_new();
519
+ gsub_packet call_args = {self, argc, argv, bang, once, region};
520
+ return rb_ensure( oregexp_packed_gsub, (VALUE)&call_args, oregexp_cleanup_region, (VALUE)region);
521
+ }
522
+
523
+ /**
524
+ * call-seq:
525
+ * rxp.gsub(str, replacement)
526
+ * rxp.gsub(str) {|match_data| ... }
527
+ *
528
+ * Returns a copy of _str_ with _all_ occurrences of _rxp_ pattern
529
+ * replaced with either _replacement_ or the value of the block.
530
+ *
531
+ * If a string is used as the replacement, the sequences \1, \2,
532
+ * and so on may be used to interpolate successive groups in the match.
533
+ *
534
+ * In the block form, the current MatchData object is passed in as a
535
+ * parameter. The value returned by the block will be substituted for
536
+ * the match on each call.
537
+ *
538
+ **/
539
+ static VALUE oregexp_m_gsub(int argc, VALUE *argv, VALUE self) {
540
+ return oregexp_safe_gsub(self, argc, argv, 0, 0);
541
+ }
542
+
543
+ /**
544
+ * call-seq:
545
+ * rxp.sub(str, replacement)
546
+ * rxp.sub(str) {|match_data| ... }
547
+ *
548
+ * Returns a copy of _str_ with the _first_ occurrence of _rxp_ pattern
549
+ * replaced with either _replacement_ or the value of the block.
550
+ *
551
+ * If a string is used as the replacement, the sequences \1, \2,
552
+ * and so on may be used to interpolate successive groups in the match.
553
+ *
554
+ * In the block form, the current MatchData object is passed in as a
555
+ * parameter. The value returned by the block will be substituted for
556
+ * the match on each call.
557
+ *
558
+ **/
559
+ static VALUE oregexp_m_sub(int argc, VALUE *argv, VALUE self) {
560
+ return oregexp_safe_gsub(self, argc, argv, 0, 1);
561
+ }
562
+
563
+ /**
564
+ * call-seq:
565
+ * rxp.gsub!(str, replacement)
566
+ * rxp.gsub!(str) {|match_data| ... }
567
+ *
568
+ * Performs the substitutions of ORegexp#gsub in place, returning
569
+ * _str_, or _nil_ if no substitutions were performed.
570
+ *
571
+ **/
572
+ static VALUE oregexp_m_gsub_bang(int argc, VALUE *argv, VALUE self) {
573
+ return oregexp_safe_gsub(self, argc, argv, 1, 0);
574
+ }
575
+
576
+ /**
577
+ * call-seq:
578
+ * oregexp.sub!(str, replacement)
579
+ * oregexp.sub!(str) {|match_data| ... }
580
+ *
581
+ * Performs the substitutions of ORegexp#sub in place, returning
582
+ * _str_, or _nil_ if no substitutions were performed.
583
+ *
584
+ */
585
+ static VALUE oregexp_m_sub_bang(int argc, VALUE *argv, VALUE self) {
586
+ return oregexp_safe_gsub(self, argc, argv, 1, 1);
587
+ }
588
+
589
+ static VALUE
590
+ oregexp_scan(VALUE self, VALUE str, OnigRegion * region)
591
+ {
592
+ long beg,
593
+ len,
594
+ end;
595
+ int iter = 0;
596
+
597
+ VALUE matches;
598
+ ORegexp *oregexp;
599
+ OnigEncoding enc;
600
+
601
+ if ( rb_block_given_p()) {
602
+ iter = 1;
603
+ }
604
+ Data_Get_Struct( self, ORegexp, oregexp );
605
+
606
+ VALUE string_str = StringValue( str );
607
+ UChar* str_ptr = RSTRING(string_str)->ptr;
608
+ int str_len = RSTRING(string_str)->len;
609
+ beg = onig_search(oregexp->reg, str_ptr, str_ptr + str_len, str_ptr, str_ptr + str_len, region, ONIG_OPTION_NONE);
610
+ if (beg < 0) {
611
+ /* no match */
612
+ return Qnil;
613
+ }
614
+ matches = rb_ary_new();
615
+ enc = onig_get_encoding( oregexp -> reg );
616
+ do {
617
+ VALUE match_data = oregexp_make_match_data( oregexp, region, string_str );
618
+ end = region->end[0];
619
+ rb_ary_push( matches, match_data );
620
+ if ( iter )
621
+ rb_yield( match_data );
622
+ // find next match
623
+ if( end == beg) {
624
+ /*
625
+ * Always consume at least one character of the input string
626
+ * in order to prevent infinite loops.
627
+ */
628
+ if( str_len <= end )
629
+ break;
630
+ len = ONIGENC_MBC_ENC_LEN(enc, str_ptr + end);
631
+ end += len;
632
+ }
633
+
634
+ beg=onig_search(oregexp->reg, str_ptr, str_ptr + str_len,
635
+ str_ptr+end, str_ptr + str_len,
636
+ region, ONIG_OPTION_NONE);
637
+ } while ( beg >= 0);
638
+
639
+ return matches;
640
+ }
641
+
642
+ struct scan_packet {
643
+ VALUE self, str;
644
+ OnigRegion * region;
645
+ };
646
+ static VALUE oregexp_packed_scan( struct scan_packet * args) {
647
+ return oregexp_scan(args->self, args->str, args->region);
648
+ }
649
+
650
+ /**
651
+ * call-seq:
652
+ * rxp.scan(str) # => [matchdata1, matchdata2,...] or nil
653
+ * rxp.scan(str) {|match_data| ... } # => [matchdata1, matchdata2,...] or nil
654
+ *
655
+ * Both forms iterate through _str_, matching the pattern. For each match,
656
+ * a MatchData object is generated and passed to the block, and
657
+ * added to the resulting array of MatchData objects.
658
+ *
659
+ * If _str_ does not match pattern, _nil_ is returned.
660
+ *
661
+ **/
662
+ static VALUE oregexp_m_scan(VALUE self, VALUE str) {
663
+ OnigRegion * region = onig_region_new();
664
+ struct scan_packet call_args = {self, str, region};
665
+ return rb_ensure( oregexp_packed_scan, (VALUE)&call_args, oregexp_cleanup_region, (VALUE)region);
666
+ }
667
+
668
+ /**
669
+ * call-seq:
670
+ * rxp === str => true or false
671
+ *
672
+ * Case Equality---Synonym for <code>ORegexp#=~</code> used in case statements.
673
+ *
674
+ * a = "HELLO"
675
+ * case a
676
+ * when ORegexp.new('^[a-z]*$'); print "Lower case\n"
677
+ * when ORegexp.new('^[A-Z]*$'); print "Upper case\n"
678
+ * else; print "Mixed case\n"
679
+ * end
680
+ *
681
+ * <em>produces:</em>
682
+ *
683
+ * Upper case
684
+ *
685
+ **/
686
+
687
+ static VALUE oregexp_m_eqq(VALUE self, VALUE str) {
688
+ VALUE match;
689
+
690
+ if (TYPE(str) != T_STRING) {
691
+ str = rb_check_string_type(str);
692
+ if (NIL_P(str)) {
693
+ return Qfalse;
694
+ }
695
+ }
696
+ StringValue(str);
697
+ VALUE args[] = {str};
698
+ match = oregexp_match(1, args, self);
699
+ if (Qnil == match) {
700
+ return Qfalse;
701
+ }
702
+ return Qtrue;
703
+ }
704
+
705
+ /**
706
+ * call-seq:
707
+ * rxp =~ string => int or nil
708
+ *
709
+ * Matches <code>rxp</code> against <code>string</code>, returning the offset of the
710
+ * start of the match or <code>nil</code> if the match failed. Sets $~ to the corresponding
711
+ * <code>MatchData</code> or <code>nil</code>.
712
+ *
713
+ * ORegexp.new( 'SIT' ) =~ "insensitive" #=> nil
714
+ * ORegexp.new( 'SIT', :options => OPTION_IGNORECASE ) =~ "insensitive" #=> 5
715
+ */
716
+
717
+ static VALUE oregexp_match_op(VALUE self, VALUE str) {
718
+ VALUE args[] = {str};
719
+ VALUE ret = oregexp_match(1, args, self);
720
+ if(ret == Qnil)
721
+ return Qnil;
722
+ return INT2FIX(RMATCH(ret)->regs->beg[0]);
723
+ }
724
+
725
+ void Init_oregexp() {
726
+ mOniguruma = rb_define_module("Oniguruma");
727
+ VALUE cORegexp = rb_define_class_under(mOniguruma, "ORegexp", rb_cObject);
728
+ rb_define_alloc_func(cORegexp, oregexp_allocate);
729
+ rb_define_method( cORegexp, "initialize", oregexp_initialize, 2 );
730
+ rb_define_method( cORegexp, "match", oregexp_match, -1 );
731
+ rb_define_method( cORegexp, "=~", oregexp_match_op, 1 );
732
+ rb_define_method( cORegexp, "gsub", oregexp_m_gsub, -1 );
733
+ rb_define_method( cORegexp, "sub", oregexp_m_sub, -1 );
734
+ rb_define_method( cORegexp, "gsub!", oregexp_m_gsub_bang, -1 );
735
+ rb_define_method( cORegexp, "sub!", oregexp_m_sub_bang, -1 );
736
+ rb_define_method( cORegexp, "scan", oregexp_m_scan, 1 );
737
+ rb_define_method( cORegexp, "===", oregexp_m_eqq, 1 );
738
+ rb_define_const( mOniguruma, "VERSION", rb_str_new2(onig_version()) );
739
+ }