oniguruma 0.9.1 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +31 -0
- data/README.txt +3 -2
- data/Rakefile +1 -1
- data/ext/oregexp.c +259 -30
- data/lib/oniguruma.rb +226 -118
- data/test/test_oniguruma.rb +108 -24
- metadata +2 -2
data/History.txt
CHANGED
@@ -1,3 +1,34 @@
|
|
1
|
+
== 1.0.0 / 2007-03-27
|
2
|
+
* Added documentation for MatchData.
|
3
|
+
* Added ogsub, ogsub!, sub and sub! to ::String.
|
4
|
+
* Removed ::String definitions from tests.
|
5
|
+
* Now the minimal recommended version of oniglib is 5.5 or higher.
|
6
|
+
* Removed ugly #if statements from c code.
|
7
|
+
* Do not create @named_captures hash if there are no named groups for regexp -- somewhat improves speed for repetitive calls
|
8
|
+
* Fixed usage of named backreferences in gsub with non-ascii names
|
9
|
+
* Move ORegexp#=~ to C code, make it work just like Regexp#=~, i.e. set $~. Throw ArgumentError instead of Exception if pattern does not compile
|
10
|
+
* Fix implementation of ORegexp#===, so it now does not raise errors in case statement anymore
|
11
|
+
(resembles plain Ruby Regexp#=== behaviour)
|
12
|
+
* Modified begin, end and offset methods in MatchData to handle named groups and default to group 0.
|
13
|
+
* Exception is no longer thrown in oregexp_make_match_data.
|
14
|
+
* Removed references to MultiMatchData from documentation
|
15
|
+
* Removed class MultiMatchData
|
16
|
+
* Fix off by one error in region->num_regs usage
|
17
|
+
* Fix dumb bug with zero-width matches that made infinite loops. now consume at least one char in gsub and scan
|
18
|
+
* ORegexp API changes:
|
19
|
+
* Pass only MatchData to sub/gsub with blocks
|
20
|
+
oregexp.sub( str ) {|match_data| ... }
|
21
|
+
oregexp.gsub( str ) {|match_data| ... }
|
22
|
+
* Add ORegexp#scan instead of match_all
|
23
|
+
oregexp.scan(str) {|match_data| ... } # => Array of MatchData objects, or nil
|
24
|
+
* Friendly way to set options
|
25
|
+
ORegexp.new( pattern, options_str, encoding, syntax)
|
26
|
+
ORegexp.new('\w+', 'imsx', 'koi8r', 'perl')
|
27
|
+
* Named backreferences in substitutions
|
28
|
+
ORegexp.new('(?<pre>\w+)\d+(?<after>\w+)').sub('abc123def', '\<after>123\<pre>') #=> 'def123abc'
|
29
|
+
* couple of bugfixes with region's num_regs
|
30
|
+
* some docs for substitution methods added
|
31
|
+
|
1
32
|
== 0.9.1 / 2007-03-25
|
2
33
|
* FIX: Buggy resolution of numeric codes for encoding and syntax options (Nikolai Lugovoi)
|
3
34
|
* FIX: Buggy implementation of ORegexp#sub and ORegexp#gsub methods. Now code is all C (Nikolai Lugovoi)
|
data/README.txt
CHANGED
@@ -8,6 +8,7 @@ Ruby bindings to the Oniguruma[http://www.geocities.jp/kosako3/oniguruma/] regul
|
|
8
8
|
* Same interface as the standard Regexp class (easy transition!).
|
9
9
|
* Support for named groups, look-ahead, look-behind, and other
|
10
10
|
cool features!
|
11
|
+
* Support for other regexp syntaxes (Perl, Python, Java, etc.)
|
11
12
|
|
12
13
|
== SYNOPSIS:
|
13
14
|
|
@@ -23,7 +24,7 @@ Consult the Syntax.txt[link:files/Syntax_txt.html] page.
|
|
23
24
|
|
24
25
|
== REQUIREMENTS:
|
25
26
|
|
26
|
-
* Oniguruma[http://www.geocities.jp/kosako3/oniguruma/] library v.
|
27
|
+
* Oniguruma[http://www.geocities.jp/kosako3/oniguruma/] library v. 5.5 or higher
|
27
28
|
|
28
29
|
== INSTALL:
|
29
30
|
|
@@ -43,7 +44,7 @@ sudo gem install -r oniguruma
|
|
43
44
|
|
44
45
|
== CREDITS:
|
45
46
|
|
46
|
-
* N. Lugovoi. ORegexp.sub and ORegexp.gsub code
|
47
|
+
* N. Lugovoi. ORegexp.sub and ORegexp.gsub code and lots of other stuff.
|
47
48
|
* K. Kosako. For his great library.
|
48
49
|
* A lot of the documentation has been copied from the original Ruby Regex documentation.
|
49
50
|
|
data/Rakefile
CHANGED
@@ -3,7 +3,7 @@ require 'hoe'
|
|
3
3
|
|
4
4
|
class Hoe; def extra_deps; @extra_deps.reject { |x| Array(x).first == 'hoe' }; end end
|
5
5
|
|
6
|
-
Hoe.new('oniguruma', '0.
|
6
|
+
Hoe.new('oniguruma', '1.0.0') do |p|
|
7
7
|
p.rubyforge_name = 'oniguruma'
|
8
8
|
p.author = 'Dizan Vasquez'
|
9
9
|
p.email = 'dix_ans@yahoo.com'
|
data/ext/oregexp.c
CHANGED
@@ -101,15 +101,11 @@ static int name_callback(
|
|
101
101
|
regex_t* reg,
|
102
102
|
struct callback_packet* arg
|
103
103
|
) {
|
104
|
-
int i, gn
|
105
|
-
OnigRegion *region = arg->region;
|
104
|
+
int i, gn;
|
106
105
|
VALUE nameHash = arg->hash;
|
107
106
|
|
108
107
|
for (i = 0; i < ngroup_num; i++) {
|
109
108
|
gn = group_nums[i];
|
110
|
-
ref = onig_name_to_backref_number(reg, name, name_end, region);
|
111
|
-
if (ref != gn )
|
112
|
-
return 1;
|
113
109
|
rb_hash_aset( nameHash, ID2SYM(rb_intern(name)), INT2FIX( gn ) );
|
114
110
|
}
|
115
111
|
return 0;
|
@@ -124,10 +120,6 @@ static VALUE oregexp_initialize( VALUE self, VALUE pattern, VALUE options ) {
|
|
124
120
|
rb_iv_set( self, "@options", options );
|
125
121
|
UChar* pat_ptr = RSTRING(pattern_str)->ptr;
|
126
122
|
int pat_len = RSTRING(pattern_str)->len;
|
127
|
-
if( pat_len == 0 ) {
|
128
|
-
rb_raise(rb_eArgError, "Empty pattern makes no sense.");
|
129
|
-
}
|
130
|
-
|
131
123
|
VALUE rOptions = rb_hash_aref( options, ID2SYM( rb_intern( "options" ) ) );
|
132
124
|
VALUE rEncoding = rb_hash_aref( options, ID2SYM( rb_intern( "encoding" ) ) );
|
133
125
|
VALUE rSyntax = rb_hash_aref( options, ID2SYM( rb_intern( "syntax" ) ) );
|
@@ -142,16 +134,19 @@ static VALUE oregexp_initialize( VALUE self, VALUE pattern, VALUE options ) {
|
|
142
134
|
if (r != ONIG_NORMAL) {
|
143
135
|
char s[ONIG_MAX_ERROR_MESSAGE_LEN];
|
144
136
|
onig_error_code_to_str(s, r, &einfo);
|
145
|
-
rb_raise(
|
137
|
+
rb_raise(rb_eArgError, "Oniguruma Error: %s", s);
|
146
138
|
}
|
147
139
|
return self;
|
148
140
|
}
|
149
141
|
|
142
|
+
/* can't include re.h, since it conflicts with oniguruma typedefs */
|
150
143
|
struct RMatch {
|
151
144
|
struct RBasic basic;
|
152
145
|
VALUE str;
|
153
146
|
struct re_registers *regs;
|
154
147
|
};
|
148
|
+
#define RMATCH(obj) (R_CAST(RMatch)(obj))
|
149
|
+
void rb_match_busy _((VALUE));
|
155
150
|
|
156
151
|
static VALUE oregexp_make_match_data(ORegexp * oregexp, OnigRegion * region, VALUE string_str) {
|
157
152
|
VALUE rb_cMatch = rb_const_get(rb_cObject, rb_intern("MatchData")) ;
|
@@ -163,21 +158,22 @@ static VALUE oregexp_make_match_data(ORegexp * oregexp, OnigRegion * region, VAL
|
|
163
158
|
|
164
159
|
match->str = rb_str_new4(string_str);
|
165
160
|
match->regs = ALLOC(struct re_registers);
|
166
|
-
match->regs->allocated = count
|
161
|
+
match->regs->allocated = count;
|
167
162
|
match->regs->num_regs = count;
|
168
|
-
match->regs->beg = ALLOC_N(int,
|
169
|
-
match->regs->end = ALLOC_N(int,
|
163
|
+
match->regs->beg = ALLOC_N(int, count);
|
164
|
+
match->regs->end = ALLOC_N(int, count);
|
170
165
|
|
171
|
-
for ( i = 0; i
|
166
|
+
for ( i = 0; i < count; i++){
|
172
167
|
match->regs->beg[i] = region->beg[i];
|
173
168
|
match->regs->end[i] = region->end[i];
|
174
169
|
}
|
175
170
|
rb_cv_set( kORegexp, "@@last_match", (VALUE)match );
|
176
171
|
packet.region = region;
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
172
|
+
if( onig_number_of_names( oregexp->reg ) > 0 ) {
|
173
|
+
packet.hash = rb_hash_new();
|
174
|
+
onig_foreach_name(oregexp->reg, name_callback, &packet);
|
175
|
+
rb_iv_set((VALUE)match, "@named_captures", packet.hash);
|
176
|
+
}
|
181
177
|
return (VALUE)match;
|
182
178
|
}
|
183
179
|
|
@@ -201,9 +197,12 @@ static VALUE oregexp_match( VALUE self, VALUE string ) {
|
|
201
197
|
|
202
198
|
OnigRegion *region = onig_region_new();
|
203
199
|
int r = onig_search(oregexp->reg, str_ptr, str_ptr + str_len, str_ptr, str_ptr + str_len, region, ONIG_OPTION_NONE);
|
200
|
+
rb_backref_set(Qnil);
|
204
201
|
if (r >= 0) {
|
205
202
|
VALUE matchData = oregexp_make_match_data( oregexp, region, string_str);
|
206
203
|
onig_region_free(region, 1 );
|
204
|
+
rb_backref_set(matchData);
|
205
|
+
rb_match_busy(matchData);
|
207
206
|
return matchData;
|
208
207
|
} else if (r == ONIG_MISMATCH) {
|
209
208
|
onig_region_free(region, 1 );
|
@@ -212,7 +211,7 @@ static VALUE oregexp_match( VALUE self, VALUE string ) {
|
|
212
211
|
onig_region_free(region, 1 );
|
213
212
|
char s[ONIG_MAX_ERROR_MESSAGE_LEN];
|
214
213
|
onig_error_code_to_str(s, r);
|
215
|
-
rb_raise(
|
214
|
+
rb_raise(rb_eArgError, "Oniguruma Error: %s", s);
|
216
215
|
}
|
217
216
|
|
218
217
|
}
|
@@ -233,7 +232,7 @@ oregexp_get_replacement(pat, src_text, repl_text, region)
|
|
233
232
|
{
|
234
233
|
ORegexp *oregexp;
|
235
234
|
VALUE ret;
|
236
|
-
int32_t replIdx = 0;
|
235
|
+
int32_t replIdx = 0, name_pos, name_start, name_end ;
|
237
236
|
int32_t replacementLength = RSTRING(repl_text)->len;
|
238
237
|
UChar *replacementText = RSTRING(repl_text)->ptr;
|
239
238
|
UChar *replacementEnd = replacementText + (replacementLength-1);
|
@@ -254,6 +253,10 @@ oregexp_get_replacement(pat, src_text, repl_text, region)
|
|
254
253
|
while (replIdx < replacementLength) {
|
255
254
|
OnigCodePoint c = ONIGENC_MBC_TO_CODE(enc, replacementText+replIdx, replacementEnd);
|
256
255
|
int c_len =ONIGENC_MBC_ENC_LEN(enc, replacementText+replIdx) ;
|
256
|
+
if( c_len == 0 ) {
|
257
|
+
rb_warn("Strange, for %d enc_len is 0", c);
|
258
|
+
c_len = 1;
|
259
|
+
}
|
257
260
|
replIdx += c_len;
|
258
261
|
if ( c != BACKSLASH) {
|
259
262
|
/* Common case, no substitution, no escaping, */
|
@@ -311,7 +314,7 @@ oregexp_get_replacement(pat, src_text, repl_text, region)
|
|
311
314
|
break;
|
312
315
|
case '+': // last matched group
|
313
316
|
replIdx += c_len;
|
314
|
-
for(groupNum = region->num_regs; groupNum > 0; groupNum --) {
|
317
|
+
for(groupNum = region->num_regs-1; groupNum > 0; groupNum --) {
|
315
318
|
g_start = region->beg[ groupNum ];
|
316
319
|
g_end = region->end[ groupNum ];
|
317
320
|
if( g_start != -1 ) {
|
@@ -320,7 +323,35 @@ oregexp_get_replacement(pat, src_text, repl_text, region)
|
|
320
323
|
}
|
321
324
|
}
|
322
325
|
break;
|
323
|
-
|
326
|
+
case '<': // named group references \<name>
|
327
|
+
name_pos = replIdx+c_len;
|
328
|
+
name_end = name_start = replIdx+c_len;
|
329
|
+
while(name_pos < replacementLength) {
|
330
|
+
c = ONIGENC_MBC_TO_CODE(enc, replacementText+name_pos, replacementEnd);
|
331
|
+
c_len = ONIGENC_MBC_ENC_LEN(enc, replacementText+name_pos) ;
|
332
|
+
name_pos += c_len;
|
333
|
+
if( c == '>') break;
|
334
|
+
if( ONIGENC_IS_CODE_WORD(enc, c) ) {
|
335
|
+
name_end += c_len;
|
336
|
+
} else {
|
337
|
+
break;
|
338
|
+
}
|
339
|
+
}
|
340
|
+
if( c != '>' || name_end == name_start ) {
|
341
|
+
// place backslash and '<'
|
342
|
+
rb_str_buf_cat(ret, replacementText+(replIdx-p_len), p_len+c_len);
|
343
|
+
replIdx += c_len;
|
344
|
+
} else {
|
345
|
+
// lookup for group and subst for that value
|
346
|
+
groupNum = onig_name_to_backref_number( oregexp->reg,
|
347
|
+
replacementText+name_start, replacementText+name_end, region);
|
348
|
+
if( groupNum >= 0 ) {
|
349
|
+
rb_str_buf_cat(ret, matchText+region->beg[groupNum],
|
350
|
+
region->end[groupNum]-region->beg[groupNum]);
|
351
|
+
}
|
352
|
+
replIdx = name_pos;
|
353
|
+
}
|
354
|
+
break;
|
324
355
|
default:
|
325
356
|
rb_str_buf_cat(ret, replacementText+(replIdx-p_len), p_len+c_len);
|
326
357
|
replIdx += c_len;
|
@@ -328,7 +359,7 @@ oregexp_get_replacement(pat, src_text, repl_text, region)
|
|
328
359
|
}
|
329
360
|
} else {
|
330
361
|
/* Finally, append the capture group data to the destination. */
|
331
|
-
if( groupNum < region->num_regs && region->beg[groupNum] >= 0
|
362
|
+
if( groupNum < region->num_regs && region->beg[groupNum] >= 0 ) {
|
332
363
|
rb_str_buf_cat(ret, matchText+region->beg[groupNum], region->end[groupNum]-region->beg[groupNum]);
|
333
364
|
}
|
334
365
|
}
|
@@ -359,13 +390,15 @@ oregexp_gsub(self, argc, argv, bang, once, region)
|
|
359
390
|
VALUE repl;
|
360
391
|
long beg,
|
361
392
|
end,
|
393
|
+
len,
|
362
394
|
prev_end;
|
363
395
|
int tainted = 0,
|
364
396
|
iter = 0;
|
365
397
|
|
366
398
|
VALUE buf, curr_repl, block_res;
|
367
399
|
ORegexp *oregexp;
|
368
|
-
|
400
|
+
OnigEncoding enc;
|
401
|
+
|
369
402
|
if (argc == 1 && rb_block_given_p()) {
|
370
403
|
iter = 1;
|
371
404
|
} else if (argc == 2) {
|
@@ -392,6 +425,7 @@ oregexp_gsub(self, argc, argv, bang, once, region)
|
|
392
425
|
}
|
393
426
|
end = 0;
|
394
427
|
buf = rb_str_buf_new(str_len);
|
428
|
+
enc = onig_get_encoding( oregexp->reg );
|
395
429
|
do {
|
396
430
|
prev_end = end;
|
397
431
|
beg = region->beg[0];
|
@@ -400,12 +434,8 @@ oregexp_gsub(self, argc, argv, bang, once, region)
|
|
400
434
|
if ( iter ) {
|
401
435
|
VALUE match_data = oregexp_make_match_data( oregexp, region, string_str );
|
402
436
|
rb_backref_set(match_data);
|
403
|
-
|
404
|
-
|
405
|
-
else {
|
406
|
-
VALUE match_string = rb_str_new( str_ptr+beg, end-beg);
|
407
|
-
block_res = rb_yield_values(2, match_string, match_data );
|
408
|
-
}
|
437
|
+
rb_match_busy(match_data);
|
438
|
+
block_res = rb_yield( match_data );
|
409
439
|
str_mod_check( string_str, str_ptr, str_len);
|
410
440
|
curr_repl = rb_obj_as_string(block_res);
|
411
441
|
} else {
|
@@ -414,6 +444,17 @@ oregexp_gsub(self, argc, argv, bang, once, region)
|
|
414
444
|
rb_str_append(buf, curr_repl);
|
415
445
|
if( once ) break;
|
416
446
|
// find next match
|
447
|
+
if( end == beg) {
|
448
|
+
/*
|
449
|
+
* Always consume at least one character of the input string
|
450
|
+
* in order to prevent infinite loops.
|
451
|
+
*/
|
452
|
+
if( str_len <= end )
|
453
|
+
break;
|
454
|
+
len = ONIGENC_MBC_ENC_LEN(enc, str_ptr + end);
|
455
|
+
rb_str_buf_cat( buf, str_ptr+end, len);
|
456
|
+
end += len;
|
457
|
+
}
|
417
458
|
beg=onig_search(oregexp->reg, str_ptr, str_ptr + str_len,
|
418
459
|
str_ptr+end, str_ptr + str_len,
|
419
460
|
region, ONIG_OPTION_NONE);
|
@@ -456,28 +497,216 @@ static VALUE oregexp_safe_gsub(self, argc, argv, bang, once)
|
|
456
497
|
gsub_packet call_args = {self, argc, argv, bang, once, region};
|
457
498
|
return rb_ensure( oregexp_packed_gsub, (VALUE)&call_args, oregexp_cleanup_region, (VALUE)region);
|
458
499
|
}
|
500
|
+
|
501
|
+
/**
|
502
|
+
* call-seq:
|
503
|
+
* rxp.gsub(str, replacement)
|
504
|
+
* rxp.gsub(str) {|match_data| ... }
|
505
|
+
*
|
506
|
+
* Returns a copy of _str_ with _all_ occurrences of _rxp_ pattern
|
507
|
+
* replaced with either _replacement_ or the value of the block.
|
508
|
+
*
|
509
|
+
* If a string is used as the replacement, the sequences \1, \2,
|
510
|
+
* and so on may be used to interpolate successive groups in the match.
|
511
|
+
*
|
512
|
+
* In the block form, the current MatchData object is passed in as a
|
513
|
+
* parameter. The value returned by the block will be substituted for
|
514
|
+
* the match on each call.
|
515
|
+
*
|
516
|
+
**/
|
459
517
|
static VALUE oregexp_m_gsub(int argc, VALUE *argv, VALUE self) {
|
460
518
|
return oregexp_safe_gsub(self, argc, argv, 0, 0);
|
461
519
|
}
|
520
|
+
|
521
|
+
/**
|
522
|
+
* call-seq:
|
523
|
+
* rxp.sub(str, replacement)
|
524
|
+
* rxp.sub(str) {|match_data| ... }
|
525
|
+
*
|
526
|
+
* Returns a copy of _str_ with the _first_ occurrence of _rxp_ pattern
|
527
|
+
* replaced with either _replacement_ or the value of the block.
|
528
|
+
*
|
529
|
+
* If a string is used as the replacement, the sequences \1, \2,
|
530
|
+
* and so on may be used to interpolate successive groups in the match.
|
531
|
+
*
|
532
|
+
* In the block form, the current MatchData object is passed in as a
|
533
|
+
* parameter. The value returned by the block will be substituted for
|
534
|
+
* the match on each call.
|
535
|
+
*
|
536
|
+
**/
|
462
537
|
static VALUE oregexp_m_sub(int argc, VALUE *argv, VALUE self) {
|
463
538
|
return oregexp_safe_gsub(self, argc, argv, 0, 1);
|
464
539
|
}
|
465
540
|
|
541
|
+
/**
|
542
|
+
* call-seq:
|
543
|
+
* rxp.gsub!(str, replacement)
|
544
|
+
* rxp.gsub!(str) {|match_data| ... }
|
545
|
+
*
|
546
|
+
* Performs the substitutions of ORegexp#gsub in place, returning
|
547
|
+
* _str_, or _nil_ if no substitutions were performed.
|
548
|
+
*
|
549
|
+
**/
|
466
550
|
static VALUE oregexp_m_gsub_bang(int argc, VALUE *argv, VALUE self) {
|
467
551
|
return oregexp_safe_gsub(self, argc, argv, 1, 0);
|
468
552
|
}
|
553
|
+
|
554
|
+
/**
|
555
|
+
* call-seq:
|
556
|
+
* oregexp.sub!(str, replacement)
|
557
|
+
* oregexp.sub!(str) {|match_data| ... }
|
558
|
+
*
|
559
|
+
* Performs the substitutions of ORegexp#sub in place, returning
|
560
|
+
* _str_, or _nil_ if no substitutions were performed.
|
561
|
+
*
|
562
|
+
**/
|
469
563
|
static VALUE oregexp_m_sub_bang(int argc, VALUE *argv, VALUE self) {
|
470
564
|
return oregexp_safe_gsub(self, argc, argv, 1, 1);
|
471
565
|
}
|
472
566
|
|
567
|
+
static VALUE
|
568
|
+
oregexp_scan(VALUE self, VALUE str, OnigRegion * region)
|
569
|
+
{
|
570
|
+
long beg,
|
571
|
+
len,
|
572
|
+
end;
|
573
|
+
int iter = 0;
|
574
|
+
|
575
|
+
VALUE matches;
|
576
|
+
ORegexp *oregexp;
|
577
|
+
OnigEncoding enc;
|
578
|
+
|
579
|
+
if ( rb_block_given_p()) {
|
580
|
+
iter = 1;
|
581
|
+
}
|
582
|
+
Data_Get_Struct( self, ORegexp, oregexp );
|
583
|
+
|
584
|
+
VALUE string_str = StringValue( str );
|
585
|
+
UChar* str_ptr = RSTRING(string_str)->ptr;
|
586
|
+
int str_len = RSTRING(string_str)->len;
|
587
|
+
beg = onig_search(oregexp->reg, str_ptr, str_ptr + str_len, str_ptr, str_ptr + str_len, region, ONIG_OPTION_NONE);
|
588
|
+
if (beg < 0) {
|
589
|
+
/* no match */
|
590
|
+
return Qnil;
|
591
|
+
}
|
592
|
+
matches = rb_ary_new();
|
593
|
+
enc = onig_get_encoding( oregexp -> reg );
|
594
|
+
do {
|
595
|
+
VALUE match_data = oregexp_make_match_data( oregexp, region, string_str );
|
596
|
+
end = region->end[0];
|
597
|
+
rb_ary_push( matches, match_data );
|
598
|
+
if ( iter )
|
599
|
+
rb_yield( match_data );
|
600
|
+
// find next match
|
601
|
+
if( end == beg) {
|
602
|
+
/*
|
603
|
+
* Always consume at least one character of the input string
|
604
|
+
* in order to prevent infinite loops.
|
605
|
+
*/
|
606
|
+
if( str_len <= end )
|
607
|
+
break;
|
608
|
+
len = ONIGENC_MBC_ENC_LEN(enc, str_ptr + end);
|
609
|
+
end += len;
|
610
|
+
}
|
611
|
+
|
612
|
+
beg=onig_search(oregexp->reg, str_ptr, str_ptr + str_len,
|
613
|
+
str_ptr+end, str_ptr + str_len,
|
614
|
+
region, ONIG_OPTION_NONE);
|
615
|
+
} while ( beg >= 0);
|
616
|
+
|
617
|
+
return matches;
|
618
|
+
}
|
619
|
+
|
620
|
+
struct scan_packet {
|
621
|
+
VALUE self, str;
|
622
|
+
OnigRegion * region;
|
623
|
+
};
|
624
|
+
static VALUE oregexp_packed_scan( struct scan_packet * args) {
|
625
|
+
return oregexp_scan(args->self, args->str, args->region);
|
626
|
+
}
|
627
|
+
/**
|
628
|
+
* call-seq:
|
629
|
+
* rxp.scan(str) # => [matchdata1, matchdata2,...] or nil
|
630
|
+
* rxp.scan(str) {|match_data| ... } # => [matchdata1, matchdata2,...] or nil
|
631
|
+
*
|
632
|
+
* Both forms iterate through _str_, matching the pattern. For each match,
|
633
|
+
* a MatchData object is generated and passed to the block, and
|
634
|
+
* added to the resulting array of MatchData objects.
|
635
|
+
*
|
636
|
+
* If _str_ does not match pattern, _nil_ is returned.
|
637
|
+
*
|
638
|
+
**/
|
639
|
+
static VALUE oregexp_m_scan(VALUE self, VALUE str) {
|
640
|
+
OnigRegion * region = onig_region_new();
|
641
|
+
struct scan_packet call_args = {self, str, region};
|
642
|
+
return rb_ensure( oregexp_packed_scan, (VALUE)&call_args, oregexp_cleanup_region, (VALUE)region);
|
643
|
+
}
|
644
|
+
|
645
|
+
/**
|
646
|
+
* call-seq:
|
647
|
+
* rxp === str => true or false
|
648
|
+
*
|
649
|
+
* Case Equality---Synonym for <code>ORegexp#=~</code> used in case statements.
|
650
|
+
*
|
651
|
+
* a = "HELLO"
|
652
|
+
* case a
|
653
|
+
* when ORegexp.new('^[a-z]*$'); print "Lower case\n"
|
654
|
+
* when ORegexp.new('^[A-Z]*$'); print "Upper case\n"
|
655
|
+
* else; print "Mixed case\n"
|
656
|
+
* end
|
657
|
+
*
|
658
|
+
* <em>produces:</em>
|
659
|
+
*
|
660
|
+
* Upper case
|
661
|
+
*
|
662
|
+
**/
|
663
|
+
|
664
|
+
static VALUE oregexp_m_eqq(VALUE self, VALUE str) {
|
665
|
+
VALUE match;
|
666
|
+
|
667
|
+
if (TYPE(str) != T_STRING) {
|
668
|
+
str = rb_check_string_type(str);
|
669
|
+
if (NIL_P(str)) {
|
670
|
+
return Qfalse;
|
671
|
+
}
|
672
|
+
}
|
673
|
+
StringValue(str);
|
674
|
+
match = oregexp_match(self, str);
|
675
|
+
if (Qnil == match) {
|
676
|
+
return Qfalse;
|
677
|
+
}
|
678
|
+
return Qtrue;
|
679
|
+
}
|
680
|
+
/*
|
681
|
+
* call-seq:
|
682
|
+
* rxp =~ string => int or nil
|
683
|
+
*
|
684
|
+
* Matches <code>rxp</code> against <code>string</code>, returning the offset of the
|
685
|
+
* start of the match or <code>nil</code> if the match failed. Sets $~ to the corresponding
|
686
|
+
* <code>MatchData</code> or <code>nil</code>.
|
687
|
+
*
|
688
|
+
* ORegexp.new( 'SIT' ) =~ "insensitive" #=> nil
|
689
|
+
* ORegexp.new( 'SIT', :options => OPTION_IGNORECASE ) =~ "insensitive" #=> 5
|
690
|
+
**/
|
691
|
+
static VALUE oregexp_match_op(VALUE self, VALUE str) {
|
692
|
+
VALUE ret = oregexp_match(self, str);
|
693
|
+
if(ret == Qnil)
|
694
|
+
return Qnil;
|
695
|
+
return INT2FIX(RMATCH(ret)->regs->beg[0]);
|
696
|
+
}
|
697
|
+
|
473
698
|
void Init_oregexp() {
|
474
699
|
mOniguruma = rb_define_module("Oniguruma");
|
475
700
|
VALUE cORegexp = rb_define_class_under(mOniguruma, "ORegexp", rb_cObject);
|
476
701
|
rb_define_alloc_func(cORegexp, oregexp_allocate);
|
477
702
|
rb_define_method( cORegexp, "initialize", oregexp_initialize, 2 );
|
478
703
|
rb_define_method( cORegexp, "match", oregexp_match, 1 );
|
704
|
+
rb_define_method( cORegexp, "=~", oregexp_match_op, 1 );
|
479
705
|
rb_define_method( cORegexp, "gsub", oregexp_m_gsub, -1 );
|
480
706
|
rb_define_method( cORegexp, "sub", oregexp_m_sub, -1 );
|
481
707
|
rb_define_method( cORegexp, "gsub!", oregexp_m_gsub_bang, -1 );
|
482
708
|
rb_define_method( cORegexp, "sub!", oregexp_m_sub_bang, -1 );
|
709
|
+
rb_define_method( cORegexp, "scan", oregexp_m_scan, 1 );
|
710
|
+
rb_define_method( cORegexp, "===", oregexp_m_eqq, 1 );
|
711
|
+
rb_define_const( mOniguruma, "VERSION", rb_str_new2(onig_version()) );
|
483
712
|
}
|
data/lib/oniguruma.rb
CHANGED
@@ -17,6 +17,20 @@ module Oniguruma
|
|
17
17
|
OPTION_MAXBIT = OPTION_POSIX_REGION
|
18
18
|
OPTION_DEFAULT = OPTION_NONE
|
19
19
|
|
20
|
+
OPTIONS_SHORTCUTS = {
|
21
|
+
'i' => OPTION_IGNORECASE,
|
22
|
+
'x' => OPTION_EXTEND,
|
23
|
+
'm' => OPTION_MULTILINE,
|
24
|
+
's' => OPTION_SINGLELINE,
|
25
|
+
'l' => OPTION_FIND_LONGEST,
|
26
|
+
'E' => OPTION_FIND_NOT_EMPTY,
|
27
|
+
'S' => OPTION_NEGATE_SINGLELINE,
|
28
|
+
'G' => OPTION_DONT_CAPTURE_GROUP,
|
29
|
+
'g' => OPTION_CAPTURE_GROUP,
|
30
|
+
'B' => OPTION_NOTBOL,
|
31
|
+
'E' => OPTION_NOTEOL,
|
32
|
+
}
|
33
|
+
|
20
34
|
SYNTAX_ASIS = 0
|
21
35
|
SYNTAX_POSIX_BASIC = 1
|
22
36
|
SYNTAX_POSIX_EXTENDED = 2
|
@@ -117,8 +131,12 @@ module Oniguruma
|
|
117
131
|
alias old_initialize initialize
|
118
132
|
# :startdoc:
|
119
133
|
|
134
|
+
# call-seq:
|
135
|
+
# ORegexp.new( pattern, options_hash )
|
136
|
+
# ORegexp.new( pattern, option_str, encoding_str=nil, syntax_str=nil)
|
137
|
+
#
|
120
138
|
# Constructs a new regular expression from <i>pattern</i>, which is a
|
121
|
-
# <code>String</code>. The
|
139
|
+
# <code>String</code>. The second parameter may be a <code>Hash</code>
|
122
140
|
# of the form:
|
123
141
|
#
|
124
142
|
# <code>{ :options => option_value, :encoding => encoding_value, :syntax => syntax_value }</code>
|
@@ -135,9 +153,27 @@ module Oniguruma
|
|
135
153
|
#
|
136
154
|
# #Accept java syntax on SJIS encoding:
|
137
155
|
# r4 = ORegexp.new('ape', :syntax => SYNTAX_JAVA, :encoding => ENCODING_SJIS) #=> /ape/
|
156
|
+
#
|
157
|
+
# Second form uses string shortcuts to set options and encoding:
|
158
|
+
# r = ORegexp.new('cat', 'i', 'utf8', 'java')
|
138
159
|
|
139
|
-
def initialize( pattern,
|
160
|
+
def initialize( pattern, *args )
|
140
161
|
defaults = { :options => OPTION_DEFAULT, :encoding => ENCODING_ASCII, :syntax => SYNTAX_DEFAULT}
|
162
|
+
if args[0].is_a?(String)
|
163
|
+
options = {}
|
164
|
+
option_str, encoding_str, syntax_str = *args
|
165
|
+
opt = 0
|
166
|
+
option_str.each_byte {|x| opt |= (OPTIONS_SHORTCUTS[x.chr] || 0) }
|
167
|
+
options[:options] = opt
|
168
|
+
if encoding_str && Oniguruma::const_defined?("ENCODING_#{encoding_str.upcase}")
|
169
|
+
options[:encoding] = Oniguruma::const_get("ENCODING_#{encoding_str.upcase}")
|
170
|
+
end
|
171
|
+
if syntax_str && Oniguruma::const_defined?("SYNTAX_#{syntax_str.upcase}")
|
172
|
+
options[:syntax] = Oniguruma::const_get("SYNTAX_#{syntax_str.upcase}")
|
173
|
+
end
|
174
|
+
else
|
175
|
+
options = args[0] || {}
|
176
|
+
end
|
141
177
|
old_initialize( pattern, defaults.merge( options ).freeze )
|
142
178
|
end
|
143
179
|
|
@@ -241,131 +277,203 @@ module Oniguruma
|
|
241
277
|
end
|
242
278
|
|
243
279
|
# call-seq:
|
244
|
-
# rxp
|
245
|
-
#
|
246
|
-
# Matches <code>rxp</code> against <code>string</code>, returning the offset of the
|
247
|
-
# start of the match or <code>nil</code> if the match failed. Sets $~ to the corresponding
|
248
|
-
# <code>MatchData</code> or <code>nil</code>.
|
249
|
-
#
|
250
|
-
# ORegexp.new( 'SIT' ) =~ "insensitive" #=> nil
|
251
|
-
# ORegexp.new( 'SIT', :options => OPTION_IGNORECASE ) =~ "insensitive" #=> 5
|
252
|
-
|
253
|
-
def =~ string
|
254
|
-
return nil unless string
|
255
|
-
m = match( string )
|
256
|
-
return nil unless m
|
257
|
-
m.begin(0)
|
258
|
-
end
|
259
|
-
|
260
|
-
# call-seq:
|
261
|
-
# rxp === str => true or false
|
280
|
+
# rxp.source => str
|
262
281
|
#
|
263
|
-
#
|
264
|
-
#
|
265
|
-
#
|
266
|
-
# case a
|
267
|
-
# when ORegexp.new('^[a-z]*$'); print "Lower case\n"
|
268
|
-
# when ORegexp.new('^[A-Z]*$'); print "Upper case\n"
|
269
|
-
# else; print "Mixed case\n"
|
270
|
-
# end
|
271
|
-
#
|
272
|
-
# <em>produces:</em>
|
273
|
-
#
|
274
|
-
# Upper case
|
275
|
-
|
276
|
-
alias === =~
|
277
|
-
|
282
|
+
# Returns the original string of the pattern.
|
283
|
+
#
|
284
|
+
# ORegexp.new( 'ab+c', 'ix' ).source #=> "ab+c"
|
278
285
|
def source
|
279
286
|
@pattern.freeze
|
280
287
|
end
|
281
288
|
|
282
|
-
|
283
|
-
|
284
|
-
positions = []
|
285
|
-
position = 0
|
286
|
-
tmp_string = string
|
287
|
-
while tmp_string != ""
|
288
|
-
if m = match( tmp_string )
|
289
|
-
matches << m
|
290
|
-
positions << position
|
291
|
-
tmp_string = m.post_match
|
292
|
-
position += m.end(0)
|
293
|
-
#if m.end == m.begin
|
294
|
-
# tmp_string = tmp_string[1..-1]
|
295
|
-
# position += 1
|
296
|
-
#end
|
297
|
-
else
|
298
|
-
break
|
299
|
-
end
|
300
|
-
end
|
301
|
-
if matches.size > 0
|
302
|
-
MultiMatchData.new( string, matches, positions )
|
303
|
-
else
|
304
|
-
nil
|
305
|
-
end
|
306
|
-
end
|
289
|
+
alias match_all scan
|
290
|
+
|
307
291
|
end
|
308
292
|
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
def position index
|
317
|
-
@positions[index]
|
318
|
-
end
|
319
|
-
|
320
|
-
def [] ( value1, value2 = nil )
|
321
|
-
unless value2
|
322
|
-
@matches[value1]
|
323
|
-
else
|
324
|
-
@matches[value1, value2]
|
325
|
-
end
|
326
|
-
end
|
327
|
-
|
328
|
-
def begin index
|
329
|
-
@matches[index].begin(0) + @positions[index]
|
330
|
-
end
|
331
|
-
|
332
|
-
def end index
|
333
|
-
@matches[index].end(0) + @positions[index]
|
334
|
-
end
|
335
|
-
|
336
|
-
def length
|
337
|
-
@matches.size
|
338
|
-
end
|
339
|
-
alias size length
|
340
|
-
|
341
|
-
def offset index
|
342
|
-
[self.begin(index), self.end(index) ]
|
343
|
-
end
|
344
|
-
|
345
|
-
def string
|
346
|
-
@string.freeze
|
347
|
-
end
|
348
|
-
|
349
|
-
def to_a
|
350
|
-
@matches
|
351
|
-
end
|
352
|
-
|
353
|
-
def each
|
354
|
-
@matches.size.times do |i|
|
355
|
-
yield @matches[i], @positions[i]
|
356
|
-
end
|
357
|
-
end
|
293
|
+
end
|
294
|
+
|
295
|
+
class ::String
|
296
|
+
# Calls <code>Oniguruma::ORegexp#gsub</code> on this string.
|
297
|
+
def ogsub(*args)
|
298
|
+
Oniguruma::ORegexp.new(args.shift).gsub(self, *args)
|
358
299
|
end
|
359
300
|
|
301
|
+
# Calls <code>Oniguruma::ORegexp#gsub!</code> on this string.
|
302
|
+
def ogsub!(*args)
|
303
|
+
Oniguruma::ORegexp.new(args.shift).gsub!(self, *args)
|
304
|
+
end
|
305
|
+
|
306
|
+
# Calls <code>Oniguruma::ORegexp#sub</code> on this string.
|
307
|
+
def osub(re, *args)
|
308
|
+
Oniguruma::ORegexp.new( re ).sub(self, *args)
|
309
|
+
end
|
310
|
+
|
311
|
+
# Calls <code>Oniguruma::ORegexp#sub!</code> on this string.
|
312
|
+
def osub!(re, *args)
|
313
|
+
Oniguruma::ORegexp.new( re ).sub(self, *args)
|
314
|
+
end
|
360
315
|
end
|
316
|
+
|
361
317
|
class ::MatchData
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
318
|
+
# call-seq:
|
319
|
+
# to_index[symbol] => int or nil
|
320
|
+
#
|
321
|
+
# Returns the group index for the corresponding named group, or
|
322
|
+
# <code>nil</code> if the group does not exist.
|
323
|
+
#
|
324
|
+
# m = ORegexp.new( '(?<begin>^.*?)(?<middle>\d)(?<end>.*)' ).match("THX1138")
|
325
|
+
# m.to_index[:begin] #=> 1
|
326
|
+
# m.to_index[:unknown] #=> nil
|
327
|
+
def to_index symbol
|
328
|
+
@named_captures && @named_captures[symbol]
|
329
|
+
end
|
330
|
+
|
331
|
+
alias old_aref :[]
|
332
|
+
|
333
|
+
# call-seq:
|
334
|
+
# mtch[i] => obj
|
335
|
+
# mtch[start, length] => array
|
336
|
+
# mtch[range] => array
|
337
|
+
# mtch[symbol] => obj
|
338
|
+
#
|
339
|
+
# <code>MatchData</code> acts as an array, and may be
|
340
|
+
# accessed using the normal array indexing techniques. <i>mtch</i>[0] is
|
341
|
+
# equivalent to the special variable <code>$&</code>, and returns the entire
|
342
|
+
# matched string. <i>mtch</i>[1], <i>mtch</i>[2], and so on return the values
|
343
|
+
# of the matched backreferences (portions of the pattern between parentheses).
|
344
|
+
#
|
345
|
+
# m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
|
346
|
+
# m[0] #=> "HX1138"
|
347
|
+
# m[1, 2] #=> ["H", "X"]
|
348
|
+
# m[1..3] #=> ["H", "X", "113"]
|
349
|
+
# m[-3, 2] #=> ["X", "113"]
|
350
|
+
#
|
351
|
+
# If a symbol is used as index, the corresponding named group is returned,
|
352
|
+
# or <code>nil</code> if such a group does not exist.
|
353
|
+
#
|
354
|
+
# m = ORegexp.new( '(?<begin>^.*?)(?<middle>\d)(?<end>.*)' ).match("THX1138")
|
355
|
+
# m[:begin] #=> "THX"
|
356
|
+
# m[:middle] #=> "1"
|
357
|
+
# m[:end] #=> "138"
|
358
|
+
|
359
|
+
def [](*idx)
|
360
|
+
if idx[0].is_a?(Symbol)
|
361
|
+
k = to_index( idx[0] )
|
362
|
+
k && old_aref(k)
|
363
|
+
else
|
364
|
+
old_aref(*idx)
|
365
|
+
end
|
366
|
+
end
|
367
|
+
|
368
|
+
alias old_begin :begin
|
369
|
+
|
370
|
+
# call-seq:
|
371
|
+
# mtch.begin(n) => integer
|
372
|
+
# mtch.begin => integer
|
373
|
+
# mtch.begin(symbol) => integer
|
374
|
+
#
|
375
|
+
# Returns the offset of the start of the <em>n</em>th element of the match
|
376
|
+
# array in the string.
|
377
|
+
#
|
378
|
+
# m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
|
379
|
+
# m.begin(0) #=> 1
|
380
|
+
# m.begin(2) #=> 2
|
381
|
+
#
|
382
|
+
# If no arguments are given, the index of the
|
383
|
+
# first matching character is returned.
|
384
|
+
#
|
385
|
+
# m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
|
386
|
+
# m.begin #=> 1
|
387
|
+
#
|
388
|
+
# If the argument is a symbol, then the beginning of the
|
389
|
+
# corresponding named group is returned, or <code>nil</code>
|
390
|
+
# if the group does not exist.
|
391
|
+
#
|
392
|
+
# m = ORegexp.new( '(?<begin>^.*?)(?<middle>\d)(?<end>.*)' ).match("THX1138")
|
393
|
+
# m.begin(:middle) #=> 3
|
394
|
+
|
395
|
+
def begin(*idx)
|
396
|
+
if idx[0].is_a?(Symbol)
|
397
|
+
k = to_index( idx[0] )
|
398
|
+
k && old_begin(k)
|
399
|
+
elsif idx.empty?
|
400
|
+
old_begin( 0 )
|
401
|
+
else
|
402
|
+
old_begin(*idx)
|
403
|
+
end
|
404
|
+
end
|
405
|
+
|
406
|
+
alias old_end :end
|
407
|
+
|
408
|
+
# call-seq:
|
409
|
+
# mtch.end(n) => integer
|
410
|
+
#
|
411
|
+
# Returns the offset of the character immediately following the end of the
|
412
|
+
# <em>n</em>th element of the match array in the string.
|
413
|
+
#
|
414
|
+
# m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
|
415
|
+
# m.end(0) #=> 7
|
416
|
+
# m.end(2) #=> 3
|
417
|
+
#
|
418
|
+
# If no arguments are given, the index of the
|
419
|
+
# character immediately following the last matching character is returned.
|
420
|
+
#
|
421
|
+
# m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
|
422
|
+
# m.end #=> 7
|
423
|
+
#
|
424
|
+
# If the argument is a symbol, then the end of the
|
425
|
+
# corresponding named group is returned, or <code>nil</code>
|
426
|
+
# if the group does not exist.
|
427
|
+
#
|
428
|
+
# m = ORegexp.new( '(?<begin>^.*?)(?<middle>\d)(?<end>.*)' ).match("THX1138")
|
429
|
+
# m.end(:middle) #=> 4
|
430
|
+
|
431
|
+
def end(*idx)
|
432
|
+
if idx[0].is_a?(Symbol)
|
433
|
+
k = to_index( idx[0] )
|
434
|
+
k && old_end(k)
|
435
|
+
elsif idx.empty?
|
436
|
+
old_end( 0 )
|
437
|
+
else
|
438
|
+
old_end(*idx)
|
439
|
+
end
|
440
|
+
end
|
441
|
+
|
442
|
+
alias old_offset :offset
|
443
|
+
|
444
|
+
# call-seq:
|
445
|
+
# mtch.offset(n) => array
|
446
|
+
# mtch.offset => array
|
447
|
+
# mtch.offset(symbol) => array
|
448
|
+
#
|
449
|
+
# Returns a two-element array containing the beginning and ending offsets of
|
450
|
+
# the <em>n</em>th match.
|
451
|
+
#
|
452
|
+
# m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
|
453
|
+
# m.offset(0) #=> [1, 7]
|
454
|
+
# m.offset(4) #=> [6, 7]
|
455
|
+
#
|
456
|
+
# If no arguments are given, the offsets of the entire
|
457
|
+
# sequence are returned.
|
458
|
+
#
|
459
|
+
# m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
|
460
|
+
# m.offset #=> [1, 7]
|
461
|
+
#
|
462
|
+
# If the argument is a symbol, then the offsets of the
|
463
|
+
# corresponding named group are returned, or <code>nil</code>
|
464
|
+
# if the group does not exist.
|
465
|
+
#
|
466
|
+
# m = ORegexp.new( '(?<begin>^.*?)(?<middle>\d)(?<end>.*)' ).match("THX1138")
|
467
|
+
# m.offset(:middle) #=> [3, 4]
|
468
|
+
|
469
|
+
def offset(*idx)
|
470
|
+
if idx[0].is_a?(Symbol)
|
471
|
+
k = to_index( idx[0] )
|
472
|
+
k && old_offset(k)
|
473
|
+
elsif idx.empty?
|
474
|
+
old_offset( 0 )
|
475
|
+
else
|
476
|
+
old_offset(*idx)
|
477
|
+
end
|
478
|
+
end
|
371
479
|
end
|
data/test/test_oniguruma.rb
CHANGED
@@ -27,7 +27,7 @@ class ORegexpTestCase < Test::Unit::TestCase
|
|
27
27
|
end
|
28
28
|
|
29
29
|
def test_bad_initialization
|
30
|
-
assert_raises(
|
30
|
+
assert_raises(ArgumentError) do
|
31
31
|
reg = Oniguruma::ORegexp.new( "(3.)(.*)(3.))" )
|
32
32
|
end
|
33
33
|
end
|
@@ -53,7 +53,7 @@ class ORegexpTestCase < Test::Unit::TestCase
|
|
53
53
|
string = 'My favorite fruits are (?#fruit1), (?#fruit2), and (?#fruit3)'
|
54
54
|
assert_equal( "My favorite fruits are *, *, and *", reg.gsub( string, '*' ) )
|
55
55
|
fruits = { "fruit1" => "apples", "fruit2" => "bananas", "fruit3" => "grapes" }
|
56
|
-
assert_equal( "My favorite fruits are apples, bananas, and grapes", reg.gsub( string ) { |
|
56
|
+
assert_equal( "My favorite fruits are apples, bananas, and grapes", reg.gsub( string ) { |match| fruits[match[1]]} )
|
57
57
|
end
|
58
58
|
|
59
59
|
def test_eql
|
@@ -74,10 +74,23 @@ class ORegexpTestCase < Test::Unit::TestCase
|
|
74
74
|
|
75
75
|
assert_equal( "Upper case\n", result )
|
76
76
|
end
|
77
|
-
|
77
|
+
|
78
|
+
def test_case_eql_compat
|
79
|
+
# === method should not raise when used in case statements
|
80
|
+
a = Time.now
|
81
|
+
result = ""
|
82
|
+
case a
|
83
|
+
when /./ ; result = "rgx"
|
84
|
+
when Oniguruma::ORegexp.new('.'); result = "ore"
|
85
|
+
else; result = "else"
|
86
|
+
end
|
87
|
+
assert_equal( "else", result )
|
88
|
+
end
|
89
|
+
|
78
90
|
def test_operator_match
|
79
91
|
assert_equal( nil, Oniguruma::ORegexp.new( 'SIT' ) =~ "insensitive" )
|
80
92
|
assert_equal( 5, Oniguruma::ORegexp.new( 'SIT', :options => Oniguruma::OPTION_IGNORECASE ) =~ "insensitive" )
|
93
|
+
assert_equal( 5, Oniguruma::ORegexp.new( 'SIT', 'i' ) =~ "insensitive" )
|
81
94
|
end
|
82
95
|
|
83
96
|
# def test_operator_match_2
|
@@ -96,6 +109,8 @@ class ORegexpTestCase < Test::Unit::TestCase
|
|
96
109
|
def test_kcode
|
97
110
|
reg = Oniguruma::ORegexp.new( "(3.)(.*)(3.)" )
|
98
111
|
assert_equal( Oniguruma::ENCODING_ASCII, reg.kcode )
|
112
|
+
reg = Oniguruma::ORegexp.new( "(3.)(.*)(3.)", '', 'SJIS' )
|
113
|
+
assert_equal( Oniguruma::ENCODING_SJIS, reg.kcode )
|
99
114
|
end
|
100
115
|
|
101
116
|
def test_options
|
@@ -106,6 +121,40 @@ class ORegexpTestCase < Test::Unit::TestCase
|
|
106
121
|
string = '(?<=\n)\\.*ocatarinetabelachitchix'
|
107
122
|
assert_equal( string, Oniguruma::ORegexp.new( string ).source )
|
108
123
|
end
|
124
|
+
|
125
|
+
def test_named_sub_backrefs
|
126
|
+
re = Oniguruma::ORegexp.new('(?<pre>\w+?)\d+(?<after>\w+)')
|
127
|
+
assert_equal(' def123abc ', re.sub('abc123def', ' \<after>123\<pre> ') )
|
128
|
+
end
|
129
|
+
|
130
|
+
def test_named_sub_backrefs_dupes
|
131
|
+
re = Oniguruma::ORegexp.new('(?<pre>\w+?)\d+(?<pre>\w+)')
|
132
|
+
assert_equal('123def', re.sub('abc123def', '123\<pre>') )
|
133
|
+
end
|
134
|
+
|
135
|
+
def test_backref_set_for_match
|
136
|
+
re = Oniguruma::ORegexp.new('Date:(\d{4})/(\d{2})/(\d{2})')
|
137
|
+
assert re.match( "Date:2007/03/25" )
|
138
|
+
assert_not_nil $~
|
139
|
+
assert_equal "2007", $1
|
140
|
+
assert_equal "03", $2
|
141
|
+
assert_equal "25", $3
|
142
|
+
end
|
143
|
+
|
144
|
+
def test_backref_set_for_match_op
|
145
|
+
re = Oniguruma::ORegexp.new('Date:(\d{4})/(\d{2})/(\d{2})')
|
146
|
+
assert re =~ "Date:2007/03/25"
|
147
|
+
assert_not_nil $~
|
148
|
+
assert_equal "2007", $1
|
149
|
+
assert_equal "03", $2
|
150
|
+
assert_equal "25", $3
|
151
|
+
end
|
152
|
+
|
153
|
+
def test_multibyte_named_backrefs
|
154
|
+
r = Oniguruma::ORegexp.new('(?<группа>test).+(\k<группа>)', :encoding => Oniguruma::ENCODING_UTF8)
|
155
|
+
assert_equal "should !test!", r.sub("should test this damned test", '!\<группа>!')
|
156
|
+
end
|
157
|
+
|
109
158
|
end
|
110
159
|
|
111
160
|
class MatchDataTestCase < Test::Unit::TestCase
|
@@ -123,6 +172,7 @@ class MatchDataTestCase < Test::Unit::TestCase
|
|
123
172
|
|
124
173
|
def test_begin
|
125
174
|
matches = @reg.match( "THX1138." )
|
175
|
+
assert_equal( 1, matches.begin )
|
126
176
|
assert_equal( 1, matches.begin(0) )
|
127
177
|
assert_equal( 2, matches.begin(2) )
|
128
178
|
end
|
@@ -134,6 +184,7 @@ class MatchDataTestCase < Test::Unit::TestCase
|
|
134
184
|
|
135
185
|
def test_end
|
136
186
|
matches = @reg.match( "THX1138." )
|
187
|
+
assert_equal( 7, matches.end )
|
137
188
|
assert_equal( 7, matches.end(0) )
|
138
189
|
assert_equal( 3, matches.end(2) )
|
139
190
|
end
|
@@ -146,6 +197,7 @@ class MatchDataTestCase < Test::Unit::TestCase
|
|
146
197
|
|
147
198
|
def test_offset
|
148
199
|
matches = @reg.match( "THX1138." )
|
200
|
+
assert_equal( [1, 7], matches.offset )
|
149
201
|
assert_equal( [1, 7], matches.offset(0) )
|
150
202
|
assert_equal( [6, 7], matches.offset(4) )
|
151
203
|
end
|
@@ -189,9 +241,20 @@ class MatchDataTestCase < Test::Unit::TestCase
|
|
189
241
|
def test_match_all
|
190
242
|
reg = Oniguruma::ORegexp.new( 'ca' )
|
191
243
|
matches = reg.match_all( 'ocatacachaca' )
|
244
|
+
a = []
|
245
|
+
matches.each { |m| a << m.offset(0) }
|
246
|
+
assert_equal( [ [1,3], [5,7], [10,12] ], a)
|
192
247
|
assert_equal( 3, matches.size )
|
193
|
-
assert_equal(
|
194
|
-
assert_equal( "ca", matches.string[matches.begin(
|
248
|
+
assert_equal( 10, matches[2].begin( 0 ) )
|
249
|
+
assert_equal( "ca", matches[1].string[matches[1].begin( 0 )...matches[1].end( 0 )])
|
250
|
+
end
|
251
|
+
|
252
|
+
def test_scan
|
253
|
+
reg = Oniguruma::ORegexp.new( 'ca' )
|
254
|
+
a = []
|
255
|
+
matches = reg.match_all( 'ocatacachaca' ) { |m| a << m.offset(0) }
|
256
|
+
#assert_kind_of(Oniguruma::MultiMatchData, matches)
|
257
|
+
assert_equal( [ [1,3], [5,7], [10,12] ], a)
|
195
258
|
end
|
196
259
|
|
197
260
|
def test_match_empty_string
|
@@ -205,12 +268,32 @@ class MatchDataTestCase < Test::Unit::TestCase
|
|
205
268
|
reg = Oniguruma::ORegexp.new( '(?<begin>\()(?<body>.*)(?<end>\))', :options => Oniguruma::OPTION_MULTILINE )
|
206
269
|
matches = reg.match( "blah (content) blah" )
|
207
270
|
assert_not_nil( matches )
|
271
|
+
assert_equal $~, matches
|
208
272
|
assert_equal( '(', matches[:begin] )
|
209
273
|
assert_equal( 'content', matches[:body] )
|
210
274
|
assert_equal( ')', matches[:end] )
|
211
275
|
assert_equal( nil, matches[:inexistent])
|
212
276
|
end
|
213
277
|
|
278
|
+
def test_multibyte_named_backrefs
|
279
|
+
r = Oniguruma::ORegexp.new('(?<имя>test).+(\k<имя>)', :encoding => Oniguruma::ENCODING_UTF8)
|
280
|
+
assert_equal "should TEST", r.sub("should test this damned test") {|m| m[:"имя"].upcase }
|
281
|
+
end
|
282
|
+
|
283
|
+
def test_no_named_backrefs
|
284
|
+
r = Oniguruma::ORegexp.new('(.+).+(.+)')
|
285
|
+
r.match("text")
|
286
|
+
assert_not_nil $~
|
287
|
+
assert_equal 0, $~.instance_variables.size
|
288
|
+
r = Oniguruma::ORegexp.new('(?<a>.+).+(?<b>.+)')
|
289
|
+
r.match("text")
|
290
|
+
assert_not_nil $~
|
291
|
+
assert_equal 1, $~.instance_variables.size
|
292
|
+
|
293
|
+
end
|
294
|
+
|
295
|
+
# casefolding for full Unicode set is not present in versions prior to 5.
|
296
|
+
if Oniguruma::VERSION >= '5.0.0'
|
214
297
|
def test_utf8_ignore_case
|
215
298
|
reg = Oniguruma::ORegexp.new( '([а-я])+', :options => Oniguruma::OPTION_IGNORECASE, :encoding => Oniguruma::ENCODING_UTF8 )
|
216
299
|
matches = reg.match("Text: Ехал Грека Через Реку")
|
@@ -222,16 +305,17 @@ class MatchDataTestCase < Test::Unit::TestCase
|
|
222
305
|
|
223
306
|
def test_utf8_gsub
|
224
307
|
reg = Oniguruma::ORegexp.new( '([а-я])([а-я])([а-я]+)', :options => Oniguruma::OPTION_IGNORECASE, :encoding => Oniguruma::ENCODING_UTF8 )
|
225
|
-
new_str = reg.gsub("Text: Ехал Грека Через Реку") {|
|
308
|
+
new_str = reg.gsub("Text: Ехал Грека Через Реку") {|m| m[1]*2+m[2]*2+m[3] }
|
226
309
|
assert_equal("Text: ЕЕххал ГГррека ЧЧеерез РРееку", new_str)
|
227
310
|
end
|
228
311
|
|
229
312
|
def test_utf8_gsub2
|
230
313
|
reg = Oniguruma::ORegexp.new( '[а-я]', :options => Oniguruma::OPTION_IGNORECASE, :encoding => Oniguruma::ENCODING_UTF8 )
|
231
|
-
new_str = reg.gsub("Text: Ехал Грека Через Реку") {|
|
314
|
+
new_str = reg.gsub("Text: Ехал Грека Через Реку") {|m| m[0]*2 }
|
232
315
|
assert_equal("Text: ЕЕххаалл ГГррееккаа ЧЧеерреезз РРееккуу", new_str)
|
233
316
|
end
|
234
|
-
|
317
|
+
end
|
318
|
+
|
235
319
|
def test_sub_compatibility
|
236
320
|
$x = "a.gif"
|
237
321
|
assert_equal("b.gif", $x.osub('.*\.([^\.]+)$', 'b.\1'))
|
@@ -242,36 +326,36 @@ class MatchDataTestCase < Test::Unit::TestCase
|
|
242
326
|
assert_equal("<a.gif>", $x.osub('.*\.([^\.]+)$', '<\&>'))
|
243
327
|
assert_equal("a.a.", $x.osub('(gif)', '\`') )
|
244
328
|
end
|
245
|
-
|
246
|
-
class ::String
|
247
|
-
def ogsub(*args)
|
248
|
-
Oniguruma::ORegexp.new(args.shift).gsub(self, *args)
|
249
|
-
end
|
250
|
-
def ogsub!(*args)
|
251
|
-
Oniguruma::ORegexp.new(args.shift).gsub!(self, *args)
|
252
|
-
end
|
253
|
-
def osub(re, *args)
|
254
|
-
Oniguruma::ORegexp.new( re ).sub(self, *args)
|
255
|
-
end
|
256
|
-
end
|
257
329
|
|
258
330
|
def test_gsub_compat
|
259
331
|
assert_equal("hello".ogsub('[aeiou]', '*') , "h*ll*")
|
260
332
|
assert_equal("hello".ogsub('([aeiou])', '<\1>') , "h<e>ll<o>")
|
261
333
|
i = 0
|
262
|
-
assert_equal("12345" , Oniguruma::ORegexp.new('.').gsub("hello") {|
|
263
|
-
assert_equal("214365", Oniguruma::ORegexp.new('(.)(.)').gsub("123456") {|
|
334
|
+
assert_equal("12345" , Oniguruma::ORegexp.new('.').gsub("hello") {|m| i+=1; i.to_s})
|
335
|
+
assert_equal("214365", Oniguruma::ORegexp.new('(.)(.)').gsub("123456") {|m| m[2] + m[1] })
|
264
336
|
a = "test"
|
265
337
|
a.ogsub!('t', a)
|
266
338
|
assert_equal("testestest", a)
|
267
339
|
end
|
268
340
|
|
269
341
|
def test_match_compat
|
270
|
-
t = Oniguruma::ORegexp.new('(.)(.)').gsub("123456") {|
|
342
|
+
t = Oniguruma::ORegexp.new('(.)(.)').gsub("123456") {|m| "#$2#$1" }
|
271
343
|
assert_equal("214365", t )
|
272
|
-
t = Oniguruma::ORegexp.new('([aeiou])').gsub("hello") {|
|
344
|
+
t = Oniguruma::ORegexp.new('([aeiou])').gsub("hello") {|m| "<#$1>" }
|
273
345
|
assert_equal( "h<e>ll<o>", t)
|
274
346
|
end
|
275
347
|
|
348
|
+
def _u16(str)
|
349
|
+
str.unpack("U*").pack("n*")
|
350
|
+
end
|
351
|
+
puts Oniguruma::VERSION
|
352
|
+
if Oniguruma::VERSION >= '4.0.0'
|
353
|
+
def test_utf16_gsub
|
354
|
+
r = Oniguruma::ORegexp.new( _u16('[aeiou]'), :encoding => Oniguruma::ENCODING_UTF16_BE)
|
355
|
+
assert_equal( _u16("h*ll*"), r.gsub( _u16("hello"), _u16('*')) )
|
356
|
+
r = Oniguruma::ORegexp.new( _u16('([aeiou])'), :encoding => Oniguruma::ENCODING_UTF16_BE)
|
357
|
+
assert_equal( _u16("h<e>\\ll<o>\\"), r.gsub( _u16("hello"), _u16('<\1>\\')) )
|
358
|
+
end
|
359
|
+
end
|
276
360
|
|
277
361
|
end
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
|
|
3
3
|
specification_version: 1
|
4
4
|
name: oniguruma
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.
|
7
|
-
date: 2007-03-
|
6
|
+
version: 1.0.0
|
7
|
+
date: 2007-03-27 00:00:00 +02:00
|
8
8
|
summary: Bindings for the oniguruma regular expression library
|
9
9
|
require_paths:
|
10
10
|
- lib
|