oniguruma 0.9.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +31 -0
- data/README.txt +3 -2
- data/Rakefile +1 -1
- data/ext/oregexp.c +259 -30
- data/lib/oniguruma.rb +226 -118
- data/test/test_oniguruma.rb +108 -24
- metadata +2 -2
data/History.txt
CHANGED
@@ -1,3 +1,34 @@
|
|
1
|
+
== 1.0.0 / 2007-03-27
|
2
|
+
* Added documentation for MatchData.
|
3
|
+
* Added ogsub, ogsub!, osub and osub! to ::String.
|
4
|
+
* Removed ::String definitions from tests.
|
5
|
+
* Now the minimal recommended version of oniglib is 5.5 or higher.
|
6
|
+
* Removed ugly #if statements from c code.
|
7
|
+
* Do not create @named_captures hash if there are no named groups for regexp -- somewhat improves speed for repetitive calls
|
8
|
+
* Fixed usage of named backreferences in gsub with non-ascii names
|
9
|
+
* Move ORegexp#=~ to C code, make it work just like Regexp#=~, i.e. set $~. Throw ArgumentError instead of Exception if pattern does not compile
|
10
|
+
* Fix implementation of ORegexp#===, so it now does not raise errors in case statement anymore
|
11
|
+
(resembles plain Ruby Regexp#=== behaviour)
|
12
|
+
* Modified begin, end and offset methods in MatchData to handle named groups and default to group 0.
|
13
|
+
* Exception is no longer thrown in oregexp_make_match_data.
|
14
|
+
* Removed references to MultiMatchData from documentation
|
15
|
+
* Removed class MultiMatchData
|
16
|
+
* Fix off by one error in region->num_regs usage
|
17
|
+
* Fix dumb bug with zero-width matches that caused infinite loops. Now consume at least one char in gsub and scan
|
18
|
+
* ORegexp API changes:
|
19
|
+
* Pass only MatchData to sub/gsub with blocks
|
20
|
+
oregexp.sub( str ) {|match_data| ... }
|
21
|
+
oregexp.gsub( str ) {|match_data| ... }
|
22
|
+
* Add ORegexp#scan instead of match_all
|
23
|
+
oregexp.scan(str) {|match_data| ... } # => array of MatchData
|
24
|
+
* Friendly way to set options
|
25
|
+
ORegexp.new( pattern, options_str, encoding, syntax)
|
26
|
+
ORegexp.new('\w+', 'imsx', 'koi8r', 'perl')
|
27
|
+
* Named backreferences in substitutions
|
28
|
+
ORegexp.new('(?<pre>\w+)\d+(?<after>\w+)').sub('abc123def', '\<after>123\<pre>') #=> 'def123abc'
|
29
|
+
* couple of bugfixes with region's num_regs
|
30
|
+
* some docs for substitution methods added
|
31
|
+
|
1
32
|
== 0.9.1 / 2007-03-25
|
2
33
|
* FIX: Buggy resolution of numeric codes for encoding and syntax options (Nikolai Lugovoi)
|
3
34
|
* FIX: Buggy implementation of ORegexp#sub and ORegexp#gsub methods. Now code is all C (Nikolai Lugovoi)
|
data/README.txt
CHANGED
@@ -8,6 +8,7 @@ Ruby bindings to the Oniguruma[http://www.geocities.jp/kosako3/oniguruma/] regul
|
|
8
8
|
* Same interface as the standard Regexp class (easy transition!).
|
9
9
|
* Support for named groups, look-ahead, look-behind, and other
|
10
10
|
cool features!
|
11
|
+
* Support for other regexp syntaxes (Perl, Python, Java, etc.)
|
11
12
|
|
12
13
|
== SYNOPSIS:
|
13
14
|
|
@@ -23,7 +24,7 @@ Consult the Syntax.txt[link:files/Syntax_txt.html] page.
|
|
23
24
|
|
24
25
|
== REQUIREMENTS:
|
25
26
|
|
26
|
-
* Oniguruma[http://www.geocities.jp/kosako3/oniguruma/] library v.
|
27
|
+
* Oniguruma[http://www.geocities.jp/kosako3/oniguruma/] library v. 5.5 or higher
|
27
28
|
|
28
29
|
== INSTALL:
|
29
30
|
|
@@ -43,7 +44,7 @@ sudo gem install -r oniguruma
|
|
43
44
|
|
44
45
|
== CREDITS:
|
45
46
|
|
46
|
-
* N. Lugovoi. ORegexp.sub and ORegexp.gsub code
|
47
|
+
* N. Lugovoi. ORegexp.sub and ORegexp.gsub code and lots of other stuff.
|
47
48
|
* K. Kosako. For his great library.
|
48
49
|
* A lot of the documentation has been copied from the original Ruby Regex documentation.
|
49
50
|
|
data/Rakefile
CHANGED
@@ -3,7 +3,7 @@ require 'hoe'
|
|
3
3
|
|
4
4
|
class Hoe; def extra_deps; @extra_deps.reject { |x| Array(x).first == 'hoe' }; end end
|
5
5
|
|
6
|
-
Hoe.new('oniguruma', '0.
|
6
|
+
Hoe.new('oniguruma', '1.0.0') do |p|
|
7
7
|
p.rubyforge_name = 'oniguruma'
|
8
8
|
p.author = 'Dizan Vasquez'
|
9
9
|
p.email = 'dix_ans@yahoo.com'
|
data/ext/oregexp.c
CHANGED
@@ -101,15 +101,11 @@ static int name_callback(
|
|
101
101
|
regex_t* reg,
|
102
102
|
struct callback_packet* arg
|
103
103
|
) {
|
104
|
-
int i, gn
|
105
|
-
OnigRegion *region = arg->region;
|
104
|
+
int i, gn;
|
106
105
|
VALUE nameHash = arg->hash;
|
107
106
|
|
108
107
|
for (i = 0; i < ngroup_num; i++) {
|
109
108
|
gn = group_nums[i];
|
110
|
-
ref = onig_name_to_backref_number(reg, name, name_end, region);
|
111
|
-
if (ref != gn )
|
112
|
-
return 1;
|
113
109
|
rb_hash_aset( nameHash, ID2SYM(rb_intern(name)), INT2FIX( gn ) );
|
114
110
|
}
|
115
111
|
return 0;
|
@@ -124,10 +120,6 @@ static VALUE oregexp_initialize( VALUE self, VALUE pattern, VALUE options ) {
|
|
124
120
|
rb_iv_set( self, "@options", options );
|
125
121
|
UChar* pat_ptr = RSTRING(pattern_str)->ptr;
|
126
122
|
int pat_len = RSTRING(pattern_str)->len;
|
127
|
-
if( pat_len == 0 ) {
|
128
|
-
rb_raise(rb_eArgError, "Empty pattern makes no sense.");
|
129
|
-
}
|
130
|
-
|
131
123
|
VALUE rOptions = rb_hash_aref( options, ID2SYM( rb_intern( "options" ) ) );
|
132
124
|
VALUE rEncoding = rb_hash_aref( options, ID2SYM( rb_intern( "encoding" ) ) );
|
133
125
|
VALUE rSyntax = rb_hash_aref( options, ID2SYM( rb_intern( "syntax" ) ) );
|
@@ -142,16 +134,19 @@ static VALUE oregexp_initialize( VALUE self, VALUE pattern, VALUE options ) {
|
|
142
134
|
if (r != ONIG_NORMAL) {
|
143
135
|
char s[ONIG_MAX_ERROR_MESSAGE_LEN];
|
144
136
|
onig_error_code_to_str(s, r, &einfo);
|
145
|
-
rb_raise(
|
137
|
+
rb_raise(rb_eArgError, "Oniguruma Error: %s", s);
|
146
138
|
}
|
147
139
|
return self;
|
148
140
|
}
|
149
141
|
|
142
|
+
/* can't include re.h, since it conflicts with oniguruma typedefs */
|
150
143
|
struct RMatch {
|
151
144
|
struct RBasic basic;
|
152
145
|
VALUE str;
|
153
146
|
struct re_registers *regs;
|
154
147
|
};
|
148
|
+
#define RMATCH(obj) (R_CAST(RMatch)(obj))
|
149
|
+
void rb_match_busy _((VALUE));
|
155
150
|
|
156
151
|
static VALUE oregexp_make_match_data(ORegexp * oregexp, OnigRegion * region, VALUE string_str) {
|
157
152
|
VALUE rb_cMatch = rb_const_get(rb_cObject, rb_intern("MatchData")) ;
|
@@ -163,21 +158,22 @@ static VALUE oregexp_make_match_data(ORegexp * oregexp, OnigRegion * region, VAL
|
|
163
158
|
|
164
159
|
match->str = rb_str_new4(string_str);
|
165
160
|
match->regs = ALLOC(struct re_registers);
|
166
|
-
match->regs->allocated = count
|
161
|
+
match->regs->allocated = count;
|
167
162
|
match->regs->num_regs = count;
|
168
|
-
match->regs->beg = ALLOC_N(int,
|
169
|
-
match->regs->end = ALLOC_N(int,
|
163
|
+
match->regs->beg = ALLOC_N(int, count);
|
164
|
+
match->regs->end = ALLOC_N(int, count);
|
170
165
|
|
171
|
-
for ( i = 0; i
|
166
|
+
for ( i = 0; i < count; i++){
|
172
167
|
match->regs->beg[i] = region->beg[i];
|
173
168
|
match->regs->end[i] = region->end[i];
|
174
169
|
}
|
175
170
|
rb_cv_set( kORegexp, "@@last_match", (VALUE)match );
|
176
171
|
packet.region = region;
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
172
|
+
if( onig_number_of_names( oregexp->reg ) > 0 ) {
|
173
|
+
packet.hash = rb_hash_new();
|
174
|
+
onig_foreach_name(oregexp->reg, name_callback, &packet);
|
175
|
+
rb_iv_set((VALUE)match, "@named_captures", packet.hash);
|
176
|
+
}
|
181
177
|
return (VALUE)match;
|
182
178
|
}
|
183
179
|
|
@@ -201,9 +197,12 @@ static VALUE oregexp_match( VALUE self, VALUE string ) {
|
|
201
197
|
|
202
198
|
OnigRegion *region = onig_region_new();
|
203
199
|
int r = onig_search(oregexp->reg, str_ptr, str_ptr + str_len, str_ptr, str_ptr + str_len, region, ONIG_OPTION_NONE);
|
200
|
+
rb_backref_set(Qnil);
|
204
201
|
if (r >= 0) {
|
205
202
|
VALUE matchData = oregexp_make_match_data( oregexp, region, string_str);
|
206
203
|
onig_region_free(region, 1 );
|
204
|
+
rb_backref_set(matchData);
|
205
|
+
rb_match_busy(matchData);
|
207
206
|
return matchData;
|
208
207
|
} else if (r == ONIG_MISMATCH) {
|
209
208
|
onig_region_free(region, 1 );
|
@@ -212,7 +211,7 @@ static VALUE oregexp_match( VALUE self, VALUE string ) {
|
|
212
211
|
onig_region_free(region, 1 );
|
213
212
|
char s[ONIG_MAX_ERROR_MESSAGE_LEN];
|
214
213
|
onig_error_code_to_str(s, r);
|
215
|
-
rb_raise(
|
214
|
+
rb_raise(rb_eArgError, "Oniguruma Error: %s", s);
|
216
215
|
}
|
217
216
|
|
218
217
|
}
|
@@ -233,7 +232,7 @@ oregexp_get_replacement(pat, src_text, repl_text, region)
|
|
233
232
|
{
|
234
233
|
ORegexp *oregexp;
|
235
234
|
VALUE ret;
|
236
|
-
int32_t replIdx = 0;
|
235
|
+
int32_t replIdx = 0, name_pos, name_start, name_end ;
|
237
236
|
int32_t replacementLength = RSTRING(repl_text)->len;
|
238
237
|
UChar *replacementText = RSTRING(repl_text)->ptr;
|
239
238
|
UChar *replacementEnd = replacementText + (replacementLength-1);
|
@@ -254,6 +253,10 @@ oregexp_get_replacement(pat, src_text, repl_text, region)
|
|
254
253
|
while (replIdx < replacementLength) {
|
255
254
|
OnigCodePoint c = ONIGENC_MBC_TO_CODE(enc, replacementText+replIdx, replacementEnd);
|
256
255
|
int c_len =ONIGENC_MBC_ENC_LEN(enc, replacementText+replIdx) ;
|
256
|
+
if( c_len == 0 ) {
|
257
|
+
rb_warn("Strange, for %d enc_len is 0", c);
|
258
|
+
c_len = 1;
|
259
|
+
}
|
257
260
|
replIdx += c_len;
|
258
261
|
if ( c != BACKSLASH) {
|
259
262
|
/* Common case, no substitution, no escaping, */
|
@@ -311,7 +314,7 @@ oregexp_get_replacement(pat, src_text, repl_text, region)
|
|
311
314
|
break;
|
312
315
|
case '+': // last matched group
|
313
316
|
replIdx += c_len;
|
314
|
-
for(groupNum = region->num_regs; groupNum > 0; groupNum --) {
|
317
|
+
for(groupNum = region->num_regs-1; groupNum > 0; groupNum --) {
|
315
318
|
g_start = region->beg[ groupNum ];
|
316
319
|
g_end = region->end[ groupNum ];
|
317
320
|
if( g_start != -1 ) {
|
@@ -320,7 +323,35 @@ oregexp_get_replacement(pat, src_text, repl_text, region)
|
|
320
323
|
}
|
321
324
|
}
|
322
325
|
break;
|
323
|
-
|
326
|
+
case '<': // named group references \<name>
|
327
|
+
name_pos = replIdx+c_len;
|
328
|
+
name_end = name_start = replIdx+c_len;
|
329
|
+
while(name_pos < replacementLength) {
|
330
|
+
c = ONIGENC_MBC_TO_CODE(enc, replacementText+name_pos, replacementEnd);
|
331
|
+
c_len = ONIGENC_MBC_ENC_LEN(enc, replacementText+name_pos) ;
|
332
|
+
name_pos += c_len;
|
333
|
+
if( c == '>') break;
|
334
|
+
if( ONIGENC_IS_CODE_WORD(enc, c) ) {
|
335
|
+
name_end += c_len;
|
336
|
+
} else {
|
337
|
+
break;
|
338
|
+
}
|
339
|
+
}
|
340
|
+
if( c != '>' || name_end == name_start ) {
|
341
|
+
// place backslash and '<'
|
342
|
+
rb_str_buf_cat(ret, replacementText+(replIdx-p_len), p_len+c_len);
|
343
|
+
replIdx += c_len;
|
344
|
+
} else {
|
345
|
+
// lookup for group and subst for that value
|
346
|
+
groupNum = onig_name_to_backref_number( oregexp->reg,
|
347
|
+
replacementText+name_start, replacementText+name_end, region);
|
348
|
+
if( groupNum >= 0 ) {
|
349
|
+
rb_str_buf_cat(ret, matchText+region->beg[groupNum],
|
350
|
+
region->end[groupNum]-region->beg[groupNum]);
|
351
|
+
}
|
352
|
+
replIdx = name_pos;
|
353
|
+
}
|
354
|
+
break;
|
324
355
|
default:
|
325
356
|
rb_str_buf_cat(ret, replacementText+(replIdx-p_len), p_len+c_len);
|
326
357
|
replIdx += c_len;
|
@@ -328,7 +359,7 @@ oregexp_get_replacement(pat, src_text, repl_text, region)
|
|
328
359
|
}
|
329
360
|
} else {
|
330
361
|
/* Finally, append the capture group data to the destination. */
|
331
|
-
if( groupNum < region->num_regs && region->beg[groupNum] >= 0
|
362
|
+
if( groupNum < region->num_regs && region->beg[groupNum] >= 0 ) {
|
332
363
|
rb_str_buf_cat(ret, matchText+region->beg[groupNum], region->end[groupNum]-region->beg[groupNum]);
|
333
364
|
}
|
334
365
|
}
|
@@ -359,13 +390,15 @@ oregexp_gsub(self, argc, argv, bang, once, region)
|
|
359
390
|
VALUE repl;
|
360
391
|
long beg,
|
361
392
|
end,
|
393
|
+
len,
|
362
394
|
prev_end;
|
363
395
|
int tainted = 0,
|
364
396
|
iter = 0;
|
365
397
|
|
366
398
|
VALUE buf, curr_repl, block_res;
|
367
399
|
ORegexp *oregexp;
|
368
|
-
|
400
|
+
OnigEncoding enc;
|
401
|
+
|
369
402
|
if (argc == 1 && rb_block_given_p()) {
|
370
403
|
iter = 1;
|
371
404
|
} else if (argc == 2) {
|
@@ -392,6 +425,7 @@ oregexp_gsub(self, argc, argv, bang, once, region)
|
|
392
425
|
}
|
393
426
|
end = 0;
|
394
427
|
buf = rb_str_buf_new(str_len);
|
428
|
+
enc = onig_get_encoding( oregexp->reg );
|
395
429
|
do {
|
396
430
|
prev_end = end;
|
397
431
|
beg = region->beg[0];
|
@@ -400,12 +434,8 @@ oregexp_gsub(self, argc, argv, bang, once, region)
|
|
400
434
|
if ( iter ) {
|
401
435
|
VALUE match_data = oregexp_make_match_data( oregexp, region, string_str );
|
402
436
|
rb_backref_set(match_data);
|
403
|
-
|
404
|
-
|
405
|
-
else {
|
406
|
-
VALUE match_string = rb_str_new( str_ptr+beg, end-beg);
|
407
|
-
block_res = rb_yield_values(2, match_string, match_data );
|
408
|
-
}
|
437
|
+
rb_match_busy(match_data);
|
438
|
+
block_res = rb_yield( match_data );
|
409
439
|
str_mod_check( string_str, str_ptr, str_len);
|
410
440
|
curr_repl = rb_obj_as_string(block_res);
|
411
441
|
} else {
|
@@ -414,6 +444,17 @@ oregexp_gsub(self, argc, argv, bang, once, region)
|
|
414
444
|
rb_str_append(buf, curr_repl);
|
415
445
|
if( once ) break;
|
416
446
|
// find next match
|
447
|
+
if( end == beg) {
|
448
|
+
/*
|
449
|
+
* Always consume at least one character of the input string
|
450
|
+
* in order to prevent infinite loops.
|
451
|
+
*/
|
452
|
+
if( str_len <= end )
|
453
|
+
break;
|
454
|
+
len = ONIGENC_MBC_ENC_LEN(enc, str_ptr + end);
|
455
|
+
rb_str_buf_cat( buf, str_ptr+end, len);
|
456
|
+
end += len;
|
457
|
+
}
|
417
458
|
beg=onig_search(oregexp->reg, str_ptr, str_ptr + str_len,
|
418
459
|
str_ptr+end, str_ptr + str_len,
|
419
460
|
region, ONIG_OPTION_NONE);
|
@@ -456,28 +497,216 @@ static VALUE oregexp_safe_gsub(self, argc, argv, bang, once)
|
|
456
497
|
gsub_packet call_args = {self, argc, argv, bang, once, region};
|
457
498
|
return rb_ensure( oregexp_packed_gsub, (VALUE)&call_args, oregexp_cleanup_region, (VALUE)region);
|
458
499
|
}
|
500
|
+
|
501
|
+
/**
|
502
|
+
* call-seq:
|
503
|
+
* rxp.gsub(str, replacement)
|
504
|
+
* rxp.gsub(str) {|match_data| ... }
|
505
|
+
*
|
506
|
+
* Returns a copy of _str_ with _all_ occurrences of _rxp_ pattern
|
507
|
+
* replaced with either _replacement_ or the value of the block.
|
508
|
+
*
|
509
|
+
* If a string is used as the replacement, the sequences \1, \2,
|
510
|
+
* and so on may be used to interpolate successive groups in the match.
|
511
|
+
*
|
512
|
+
* In the block form, the current MatchData object is passed in as a
|
513
|
+
* parameter. The value returned by the block will be substituted for
|
514
|
+
* the match on each call.
|
515
|
+
*
|
516
|
+
**/
|
459
517
|
static VALUE oregexp_m_gsub(int argc, VALUE *argv, VALUE self) {
|
460
518
|
return oregexp_safe_gsub(self, argc, argv, 0, 0);
|
461
519
|
}
|
520
|
+
|
521
|
+
/**
|
522
|
+
* call-seq:
|
523
|
+
* rxp.sub(str, replacement)
|
524
|
+
* rxp.sub(str) {|match_data| ... }
|
525
|
+
*
|
526
|
+
* Returns a copy of _str_ with the _first_ occurrence of _rxp_ pattern
|
527
|
+
* replaced with either _replacement_ or the value of the block.
|
528
|
+
*
|
529
|
+
* If a string is used as the replacement, the sequences \1, \2,
|
530
|
+
* and so on may be used to interpolate successive groups in the match.
|
531
|
+
*
|
532
|
+
* In the block form, the current MatchData object is passed in as a
|
533
|
+
* parameter. The value returned by the block will be substituted for
|
534
|
+
* the match on each call.
|
535
|
+
*
|
536
|
+
**/
|
462
537
|
static VALUE oregexp_m_sub(int argc, VALUE *argv, VALUE self) {
|
463
538
|
return oregexp_safe_gsub(self, argc, argv, 0, 1);
|
464
539
|
}
|
465
540
|
|
541
|
+
/**
|
542
|
+
* call-seq:
|
543
|
+
* rxp.gsub!(str, replacement)
|
544
|
+
* rxp.gsub!(str) {|match_data| ... }
|
545
|
+
*
|
546
|
+
* Performs the substitutions of ORegexp#gsub in place, returning
|
547
|
+
* _str_, or _nil_ if no substitutions were performed.
|
548
|
+
*
|
549
|
+
**/
|
466
550
|
static VALUE oregexp_m_gsub_bang(int argc, VALUE *argv, VALUE self) {
|
467
551
|
return oregexp_safe_gsub(self, argc, argv, 1, 0);
|
468
552
|
}
|
553
|
+
|
554
|
+
/**
|
555
|
+
* call-seq:
|
556
|
+
* oregexp.sub!(str, replacement)
|
557
|
+
* oregexp.sub!(str) {|match_data| ... }
|
558
|
+
*
|
559
|
+
* Performs the substitutions of ORegexp#sub in place, returning
|
560
|
+
* _str_, or _nil_ if no substitutions were performed.
|
561
|
+
*
|
562
|
+
**/
|
469
563
|
static VALUE oregexp_m_sub_bang(int argc, VALUE *argv, VALUE self) {
|
470
564
|
return oregexp_safe_gsub(self, argc, argv, 1, 1);
|
471
565
|
}
|
472
566
|
|
567
|
+
static VALUE
|
568
|
+
oregexp_scan(VALUE self, VALUE str, OnigRegion * region)
|
569
|
+
{
|
570
|
+
long beg,
|
571
|
+
len,
|
572
|
+
end;
|
573
|
+
int iter = 0;
|
574
|
+
|
575
|
+
VALUE matches;
|
576
|
+
ORegexp *oregexp;
|
577
|
+
OnigEncoding enc;
|
578
|
+
|
579
|
+
if ( rb_block_given_p()) {
|
580
|
+
iter = 1;
|
581
|
+
}
|
582
|
+
Data_Get_Struct( self, ORegexp, oregexp );
|
583
|
+
|
584
|
+
VALUE string_str = StringValue( str );
|
585
|
+
UChar* str_ptr = RSTRING(string_str)->ptr;
|
586
|
+
int str_len = RSTRING(string_str)->len;
|
587
|
+
beg = onig_search(oregexp->reg, str_ptr, str_ptr + str_len, str_ptr, str_ptr + str_len, region, ONIG_OPTION_NONE);
|
588
|
+
if (beg < 0) {
|
589
|
+
/* no match */
|
590
|
+
return Qnil;
|
591
|
+
}
|
592
|
+
matches = rb_ary_new();
|
593
|
+
enc = onig_get_encoding( oregexp -> reg );
|
594
|
+
do {
|
595
|
+
VALUE match_data = oregexp_make_match_data( oregexp, region, string_str );
|
596
|
+
end = region->end[0];
|
597
|
+
rb_ary_push( matches, match_data );
|
598
|
+
if ( iter )
|
599
|
+
rb_yield( match_data );
|
600
|
+
// find next match
|
601
|
+
if( end == beg) {
|
602
|
+
/*
|
603
|
+
* Always consume at least one character of the input string
|
604
|
+
* in order to prevent infinite loops.
|
605
|
+
*/
|
606
|
+
if( str_len <= end )
|
607
|
+
break;
|
608
|
+
len = ONIGENC_MBC_ENC_LEN(enc, str_ptr + end);
|
609
|
+
end += len;
|
610
|
+
}
|
611
|
+
|
612
|
+
beg=onig_search(oregexp->reg, str_ptr, str_ptr + str_len,
|
613
|
+
str_ptr+end, str_ptr + str_len,
|
614
|
+
region, ONIG_OPTION_NONE);
|
615
|
+
} while ( beg >= 0);
|
616
|
+
|
617
|
+
return matches;
|
618
|
+
}
|
619
|
+
|
620
|
+
struct scan_packet {
|
621
|
+
VALUE self, str;
|
622
|
+
OnigRegion * region;
|
623
|
+
};
|
624
|
+
static VALUE oregexp_packed_scan( struct scan_packet * args) {
|
625
|
+
return oregexp_scan(args->self, args->str, args->region);
|
626
|
+
}
|
627
|
+
/**
|
628
|
+
* call-seq:
|
629
|
+
* rxp.scan(str) # => [matchdata1, matchdata2,...] or nil
|
630
|
+
* rxp.scan(str) {|match_data| ... } # => [matchdata1, matchdata2,...] or nil
|
631
|
+
*
|
632
|
+
* Both forms iterate through _str_, matching the pattern. For each match,
|
633
|
+
* a MatchData object is generated and passed to the block, and
|
634
|
+
* added to the resulting array of MatchData objects.
|
635
|
+
*
|
636
|
+
* If _str_ does not match pattern, _nil_ is returned.
|
637
|
+
*
|
638
|
+
**/
|
639
|
+
static VALUE oregexp_m_scan(VALUE self, VALUE str) {
|
640
|
+
OnigRegion * region = onig_region_new();
|
641
|
+
struct scan_packet call_args = {self, str, region};
|
642
|
+
return rb_ensure( oregexp_packed_scan, (VALUE)&call_args, oregexp_cleanup_region, (VALUE)region);
|
643
|
+
}
|
644
|
+
|
645
|
+
/**
|
646
|
+
* call-seq:
|
647
|
+
* rxp === str => true or false
|
648
|
+
*
|
649
|
+
* Case Equality---Synonym for <code>ORegexp#=~</code> used in case statements.
|
650
|
+
*
|
651
|
+
* a = "HELLO"
|
652
|
+
* case a
|
653
|
+
* when ORegexp.new('^[a-z]*$'); print "Lower case\n"
|
654
|
+
* when ORegexp.new('^[A-Z]*$'); print "Upper case\n"
|
655
|
+
* else; print "Mixed case\n"
|
656
|
+
* end
|
657
|
+
*
|
658
|
+
* <em>produces:</em>
|
659
|
+
*
|
660
|
+
* Upper case
|
661
|
+
*
|
662
|
+
**/
|
663
|
+
|
664
|
+
static VALUE oregexp_m_eqq(VALUE self, VALUE str) {
|
665
|
+
VALUE match;
|
666
|
+
|
667
|
+
if (TYPE(str) != T_STRING) {
|
668
|
+
str = rb_check_string_type(str);
|
669
|
+
if (NIL_P(str)) {
|
670
|
+
return Qfalse;
|
671
|
+
}
|
672
|
+
}
|
673
|
+
StringValue(str);
|
674
|
+
match = oregexp_match(self, str);
|
675
|
+
if (Qnil == match) {
|
676
|
+
return Qfalse;
|
677
|
+
}
|
678
|
+
return Qtrue;
|
679
|
+
}
|
680
|
+
/*
|
681
|
+
* call-seq:
|
682
|
+
* rxp =~ string => int or nil
|
683
|
+
*
|
684
|
+
* Matches <code>rxp</code> against <code>string</code>, returning the offset of the
|
685
|
+
* start of the match or <code>nil</code> if the match failed. Sets $~ to the corresponding
|
686
|
+
* <code>MatchData</code> or <code>nil</code>.
|
687
|
+
*
|
688
|
+
* ORegexp.new( 'SIT' ) =~ "insensitive" #=> nil
|
689
|
+
* ORegexp.new( 'SIT', :options => OPTION_IGNORECASE ) =~ "insensitive" #=> 5
|
690
|
+
**/
|
691
|
+
static VALUE oregexp_match_op(VALUE self, VALUE str) {
|
692
|
+
VALUE ret = oregexp_match(self, str);
|
693
|
+
if(ret == Qnil)
|
694
|
+
return Qnil;
|
695
|
+
return INT2FIX(RMATCH(ret)->regs->beg[0]);
|
696
|
+
}
|
697
|
+
|
473
698
|
void Init_oregexp() {
|
474
699
|
mOniguruma = rb_define_module("Oniguruma");
|
475
700
|
VALUE cORegexp = rb_define_class_under(mOniguruma, "ORegexp", rb_cObject);
|
476
701
|
rb_define_alloc_func(cORegexp, oregexp_allocate);
|
477
702
|
rb_define_method( cORegexp, "initialize", oregexp_initialize, 2 );
|
478
703
|
rb_define_method( cORegexp, "match", oregexp_match, 1 );
|
704
|
+
rb_define_method( cORegexp, "=~", oregexp_match_op, 1 );
|
479
705
|
rb_define_method( cORegexp, "gsub", oregexp_m_gsub, -1 );
|
480
706
|
rb_define_method( cORegexp, "sub", oregexp_m_sub, -1 );
|
481
707
|
rb_define_method( cORegexp, "gsub!", oregexp_m_gsub_bang, -1 );
|
482
708
|
rb_define_method( cORegexp, "sub!", oregexp_m_sub_bang, -1 );
|
709
|
+
rb_define_method( cORegexp, "scan", oregexp_m_scan, 1 );
|
710
|
+
rb_define_method( cORegexp, "===", oregexp_m_eqq, 1 );
|
711
|
+
rb_define_const( mOniguruma, "VERSION", rb_str_new2(onig_version()) );
|
483
712
|
}
|
data/lib/oniguruma.rb
CHANGED
@@ -17,6 +17,20 @@ module Oniguruma
|
|
17
17
|
OPTION_MAXBIT = OPTION_POSIX_REGION
|
18
18
|
OPTION_DEFAULT = OPTION_NONE
|
19
19
|
|
20
|
+
OPTIONS_SHORTCUTS = {
|
21
|
+
'i' => OPTION_IGNORECASE,
|
22
|
+
'x' => OPTION_EXTEND,
|
23
|
+
'm' => OPTION_MULTILINE,
|
24
|
+
's' => OPTION_SINGLELINE,
|
25
|
+
'l' => OPTION_FIND_LONGEST,
|
26
|
+
'E' => OPTION_FIND_NOT_EMPTY,
|
27
|
+
'S' => OPTION_NEGATE_SINGLELINE,
|
28
|
+
'G' => OPTION_DONT_CAPTURE_GROUP,
|
29
|
+
'g' => OPTION_CAPTURE_GROUP,
|
30
|
+
'B' => OPTION_NOTBOL,
|
31
|
+
'E' => OPTION_NOTEOL,
|
32
|
+
}
|
33
|
+
|
20
34
|
SYNTAX_ASIS = 0
|
21
35
|
SYNTAX_POSIX_BASIC = 1
|
22
36
|
SYNTAX_POSIX_EXTENDED = 2
|
@@ -117,8 +131,12 @@ module Oniguruma
|
|
117
131
|
alias old_initialize initialize
|
118
132
|
# :startdoc:
|
119
133
|
|
134
|
+
# call-seq:
|
135
|
+
# ORegexp.new( pattern, options_hash )
|
136
|
+
# ORegexp.new( pattern, option_str, encoding_str=nil, syntax_str=nil)
|
137
|
+
#
|
120
138
|
# Constructs a new regular expression from <i>pattern</i>, which is a
|
121
|
-
# <code>String</code>. The
|
139
|
+
# <code>String</code>. The second parameter <i>options_hash</i> may be a <code>Hash</code>
|
122
140
|
# of the form:
|
123
141
|
#
|
124
142
|
# <code>{ :options => option_value, :encoding => encoding_value, :syntax => syntax_value }</code>
|
@@ -135,9 +153,27 @@ module Oniguruma
|
|
135
153
|
#
|
136
154
|
# #Accept java syntax on SJIS encoding:
|
137
155
|
# r4 = ORegexp.new('ape', :syntax => SYNTAX_JAVA, :encoding => ENCODING_SJIS) #=> /ape/
|
156
|
+
#
|
157
|
+
# Second form uses string shortcuts to set options and encoding:
|
158
|
+
# r = ORegexp.new('cat', 'i', 'utf8', 'java')
|
138
159
|
|
139
|
-
def initialize( pattern,
|
160
|
+
def initialize( pattern, *args )
|
140
161
|
defaults = { :options => OPTION_DEFAULT, :encoding => ENCODING_ASCII, :syntax => SYNTAX_DEFAULT}
|
162
|
+
if args[0].is_a?(String)
|
163
|
+
options = {}
|
164
|
+
option_str, encoding_str, syntax_str = *args
|
165
|
+
opt = 0
|
166
|
+
option_str.each_byte {|x| opt |= (OPTIONS_SHORTCUTS[x.chr] || 0) }
|
167
|
+
options[:options] = opt
|
168
|
+
if encoding_str && Oniguruma::const_defined?("ENCODING_#{encoding_str.upcase}")
|
169
|
+
options[:encoding] = Oniguruma::const_get("ENCODING_#{encoding_str.upcase}")
|
170
|
+
end
|
171
|
+
if syntax_str && Oniguruma::const_defined?("SYNTAX_#{syntax_str.upcase}")
|
172
|
+
options[:syntax] = Oniguruma::const_get("SYNTAX_#{syntax_str.upcase}")
|
173
|
+
end
|
174
|
+
else
|
175
|
+
options = args[0] || {}
|
176
|
+
end
|
141
177
|
old_initialize( pattern, defaults.merge( options ).freeze )
|
142
178
|
end
|
143
179
|
|
@@ -241,131 +277,203 @@ module Oniguruma
|
|
241
277
|
end
|
242
278
|
|
243
279
|
# call-seq:
|
244
|
-
# rxp
|
245
|
-
#
|
246
|
-
# Matches <code>rxp</code> against <code>string</code>, returning the offset of the
|
247
|
-
# start of the match or <code>nil</code> if the match failed. Sets $~ to the corresponding
|
248
|
-
# <code>MatchData</code> or <code>nil</code>.
|
249
|
-
#
|
250
|
-
# ORegexp.new( 'SIT' ) =~ "insensitive" #=> nil
|
251
|
-
# ORegexp.new( 'SIT', :options => OPTION_IGNORECASE ) =~ "insensitive" #=> 5
|
252
|
-
|
253
|
-
def =~ string
|
254
|
-
return nil unless string
|
255
|
-
m = match( string )
|
256
|
-
return nil unless m
|
257
|
-
m.begin(0)
|
258
|
-
end
|
259
|
-
|
260
|
-
# call-seq:
|
261
|
-
# rxp === str => true or false
|
280
|
+
# rxp.source => str
|
262
281
|
#
|
263
|
-
#
|
264
|
-
#
|
265
|
-
#
|
266
|
-
# case a
|
267
|
-
# when ORegexp.new('^[a-z]*$'); print "Lower case\n"
|
268
|
-
# when ORegexp.new('^[A-Z]*$'); print "Upper case\n"
|
269
|
-
# else; print "Mixed case\n"
|
270
|
-
# end
|
271
|
-
#
|
272
|
-
# <em>produces:</em>
|
273
|
-
#
|
274
|
-
# Upper case
|
275
|
-
|
276
|
-
alias === =~
|
277
|
-
|
282
|
+
# Returns the original string of the pattern.
|
283
|
+
#
|
284
|
+
# ORegexp.new( 'ab+c', 'ix' ).source #=> "ab+c"
|
278
285
|
def source
|
279
286
|
@pattern.freeze
|
280
287
|
end
|
281
288
|
|
282
|
-
|
283
|
-
|
284
|
-
positions = []
|
285
|
-
position = 0
|
286
|
-
tmp_string = string
|
287
|
-
while tmp_string != ""
|
288
|
-
if m = match( tmp_string )
|
289
|
-
matches << m
|
290
|
-
positions << position
|
291
|
-
tmp_string = m.post_match
|
292
|
-
position += m.end(0)
|
293
|
-
#if m.end == m.begin
|
294
|
-
# tmp_string = tmp_string[1..-1]
|
295
|
-
# position += 1
|
296
|
-
#end
|
297
|
-
else
|
298
|
-
break
|
299
|
-
end
|
300
|
-
end
|
301
|
-
if matches.size > 0
|
302
|
-
MultiMatchData.new( string, matches, positions )
|
303
|
-
else
|
304
|
-
nil
|
305
|
-
end
|
306
|
-
end
|
289
|
+
alias match_all scan
|
290
|
+
|
307
291
|
end
|
308
292
|
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
def position index
|
317
|
-
@positions[index]
|
318
|
-
end
|
319
|
-
|
320
|
-
def [] ( value1, value2 = nil )
|
321
|
-
unless value2
|
322
|
-
@matches[value1]
|
323
|
-
else
|
324
|
-
@matches[value1, value2]
|
325
|
-
end
|
326
|
-
end
|
327
|
-
|
328
|
-
def begin index
|
329
|
-
@matches[index].begin(0) + @positions[index]
|
330
|
-
end
|
331
|
-
|
332
|
-
def end index
|
333
|
-
@matches[index].end(0) + @positions[index]
|
334
|
-
end
|
335
|
-
|
336
|
-
def length
|
337
|
-
@matches.size
|
338
|
-
end
|
339
|
-
alias size length
|
340
|
-
|
341
|
-
def offset index
|
342
|
-
[self.begin(index), self.end(index) ]
|
343
|
-
end
|
344
|
-
|
345
|
-
def string
|
346
|
-
@string.freeze
|
347
|
-
end
|
348
|
-
|
349
|
-
def to_a
|
350
|
-
@matches
|
351
|
-
end
|
352
|
-
|
353
|
-
def each
|
354
|
-
@matches.size.times do |i|
|
355
|
-
yield @matches[i], @positions[i]
|
356
|
-
end
|
357
|
-
end
|
293
|
+
end
|
294
|
+
|
295
|
+
class ::String
|
296
|
+
# Calls <code>Oniguruma::ORegexp#gsub</code> on this string.
|
297
|
+
def ogsub(*args)
|
298
|
+
Oniguruma::ORegexp.new(args.shift).gsub(self, *args)
|
358
299
|
end
|
359
300
|
|
301
|
+
# Calls <code>Oniguruma::ORegexp#gsub!</code> on this string.
|
302
|
+
def ogsub!(*args)
|
303
|
+
Oniguruma::ORegexp.new(args.shift).gsub!(self, *args)
|
304
|
+
end
|
305
|
+
|
306
|
+
# Calls <code>Oniguruma::ORegexp#sub</code> on this string.
|
307
|
+
def osub(re, *args)
|
308
|
+
Oniguruma::ORegexp.new( re ).sub(self, *args)
|
309
|
+
end
|
310
|
+
|
311
|
+
# Calls <code>Oniguruma::ORegexp#sub!</code> on this string.
|
312
|
+
def osub!(re, *args)
|
313
|
+
Oniguruma::ORegexp.new( re ).sub(self, *args)
|
314
|
+
end
|
360
315
|
end
|
316
|
+
|
361
317
|
class ::MatchData
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
318
|
+
# call-seq:
|
319
|
+
# to_index(symbol) => int or nil
|
320
|
+
#
|
321
|
+
# Returns the group index for the corresponding named group, or
|
322
|
+
# <code>nil</code> if the group does not exist.
|
323
|
+
#
|
324
|
+
# m = ORegexp.new( '(?<begin>^.*?)(?<middle>\d)(?<end>.*)' ).match("THX1138")
|
325
|
+
# m.to_index(:begin) #=> 1
|
326
|
+
# m.to_index(:unknown) #=> nil
|
327
|
+
def to_index symbol
|
328
|
+
@named_captures && @named_captures[symbol]
|
329
|
+
end
|
330
|
+
|
331
|
+
alias old_aref :[]
|
332
|
+
|
333
|
+
# call-seq:
|
334
|
+
# mtch[i] => obj
|
335
|
+
# mtch[start, length] => array
|
336
|
+
# mtch[range] => array
|
337
|
+
# mtch[symbol] => obj
|
338
|
+
#
|
339
|
+
# <code>MatchData</code> acts as an array, and may be
|
340
|
+
# accessed using the normal array indexing techniques. <i>mtch</i>[0] is
|
341
|
+
# equivalent to the special variable <code>$&</code>, and returns the entire
|
342
|
+
# matched string. <i>mtch</i>[1], <i>mtch</i>[2], and so on return the values
|
343
|
+
# of the matched backreferences (portions of the pattern between parentheses).
|
344
|
+
#
|
345
|
+
# m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
|
346
|
+
# m[0] #=> "HX1138"
|
347
|
+
# m[1, 2] #=> ["H", "X"]
|
348
|
+
# m[1..3] #=> ["H", "X", "113"]
|
349
|
+
# m[-3, 2] #=> ["X", "113"]
|
350
|
+
#
|
351
|
+
# If a symbol is used as index, the corresponding named group is returned,
|
352
|
+
# or <code>nil</code> if such a group does not exist.
|
353
|
+
#
|
354
|
+
# m = ORegexp.new( '(?<begin>^.*?)(?<middle>\d)(?<end>.*)' ).match("THX1138")
|
355
|
+
# m[:begin] #=> "THX"
|
356
|
+
# m[:middle] #=> "1"
|
357
|
+
# m[:end] #=> "138"
|
358
|
+
|
359
|
+
def [](*idx)
|
360
|
+
if idx[0].is_a?(Symbol)
|
361
|
+
k = to_index( idx[0] )
|
362
|
+
k && old_aref(k)
|
363
|
+
else
|
364
|
+
old_aref(*idx)
|
365
|
+
end
|
366
|
+
end
|
367
|
+
|
368
|
+
alias old_begin :begin
|
369
|
+
|
370
|
+
# call-seq:
|
371
|
+
# mtch.begin(n) => integer
|
372
|
+
# mtch.begin => integer
|
373
|
+
# mtch.begin(symbol) => integer
|
374
|
+
#
|
375
|
+
# Returns the offset of the start of the <em>n</em>th element of the match
|
376
|
+
# array in the string.
|
377
|
+
#
|
378
|
+
# m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
|
379
|
+
# m.begin(0) #=> 1
|
380
|
+
# m.begin(2) #=> 2
|
381
|
+
#
|
382
|
+
# If no arguments are given, the index of the
|
383
|
+
# first matching character is returned.
|
384
|
+
#
|
385
|
+
# m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
|
386
|
+
# m.begin #=> 1
|
387
|
+
#
|
388
|
+
# If the argument is a symbol, then the beginning of the
|
389
|
+
# corresponding named group is returned, or <code>nil</code>
|
390
|
+
# if the group does not exist.
|
391
|
+
#
|
392
|
+
# m = ORegexp.new( '(?<begin>^.*?)(?<middle>\d)(?<end>.*)' ).match("THX1138")
|
393
|
+
# m.begin(:middle) #=> 3
|
394
|
+
|
395
|
+
def begin(*idx)
|
396
|
+
if idx[0].is_a?(Symbol)
|
397
|
+
k = to_index( idx[0] )
|
398
|
+
k && old_begin(k)
|
399
|
+
elsif idx.empty?
|
400
|
+
old_begin( 0 )
|
401
|
+
else
|
402
|
+
old_begin(*idx)
|
403
|
+
end
|
404
|
+
end
|
405
|
+
|
406
|
+
alias old_end :end
|
407
|
+
|
408
|
+
# call-seq:
|
409
|
+
# mtch.end(n) => integer
|
410
|
+
#
|
411
|
+
# Returns the offset of the character immediately following the end of the
|
412
|
+
# <em>n</em>th element of the match array in the string.
|
413
|
+
#
|
414
|
+
# m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
|
415
|
+
# m.end(0) #=> 7
|
416
|
+
# m.end(2) #=> 3
|
417
|
+
#
|
418
|
+
# If no arguments are given, the index of the
|
419
|
+
# character immediately following the entire match is returned.
|
420
|
+
#
|
421
|
+
# m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
|
422
|
+
# m.end #=> 7
|
423
|
+
#
|
424
|
+
# If the argument is a symbol, then the end of the
|
425
|
+
# corresponding named group is returned, or <code>nil</code>
|
426
|
+
# if the group does not exist.
|
427
|
+
#
|
428
|
+
# m = ORegexp.new( '(?<begin>^.*?)(?<middle>\d)(?<end>.*)' ).match("THX1138")
|
429
|
+
# m.end(:middle) #=> 4
|
430
|
+
|
431
|
+
def end(*idx)
|
432
|
+
if idx[0].is_a?(Symbol)
|
433
|
+
k = to_index( idx[0] )
|
434
|
+
k && old_end(k)
|
435
|
+
elsif idx.empty?
|
436
|
+
old_end( 0 )
|
437
|
+
else
|
438
|
+
old_end(*idx)
|
439
|
+
end
|
440
|
+
end
|
441
|
+
|
442
|
+
alias old_offset :offset
|
443
|
+
|
444
|
+
# call-seq:
|
445
|
+
# mtch.offset(n) => array
|
446
|
+
# mtch.offset => array
|
447
|
+
# mtch.offset(symbol) => array
|
448
|
+
#
|
449
|
+
# Returns a two-element array containing the beginning and ending offsets of
|
450
|
+
# the <em>n</em>th match.
|
451
|
+
#
|
452
|
+
# m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
|
453
|
+
# m.offset(0) #=> [1, 7]
|
454
|
+
# m.offset(4) #=> [6, 7]
|
455
|
+
#
|
456
|
+
# If no arguments are given, the offsets of the entire
|
457
|
+
# sequence are returned.
|
458
|
+
#
|
459
|
+
# m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
|
460
|
+
# m.offset #=> [1, 7]
|
461
|
+
#
|
462
|
+
# If the argument is a symbol, then the offsets of the
|
463
|
+
# corresponding named group are returned, or <code>nil</code>
|
464
|
+
# if the group does not exist.
|
465
|
+
#
|
466
|
+
# m = ORegexp.new( '(?<begin>^.*?)(?<middle>\d)(?<end>.*)' ).match("THX1138")
|
467
|
+
# m.offset(:middle) #=> [3, 4]
|
468
|
+
|
469
|
+
def offset(*idx)
|
470
|
+
if idx[0].is_a?(Symbol)
|
471
|
+
k = to_index( idx[0] )
|
472
|
+
k && old_offset(k)
|
473
|
+
elsif idx.empty?
|
474
|
+
old_offset( 0 )
|
475
|
+
else
|
476
|
+
old_offset(*idx)
|
477
|
+
end
|
478
|
+
end
|
371
479
|
end
|
data/test/test_oniguruma.rb
CHANGED
@@ -27,7 +27,7 @@ class ORegexpTestCase < Test::Unit::TestCase
|
|
27
27
|
end
|
28
28
|
|
29
29
|
def test_bad_initialization
|
30
|
-
assert_raises(
|
30
|
+
assert_raises(ArgumentError) do
|
31
31
|
reg = Oniguruma::ORegexp.new( "(3.)(.*)(3.))" )
|
32
32
|
end
|
33
33
|
end
|
@@ -53,7 +53,7 @@ class ORegexpTestCase < Test::Unit::TestCase
|
|
53
53
|
string = 'My favorite fruits are (?#fruit1), (?#fruit2), and (?#fruit3)'
|
54
54
|
assert_equal( "My favorite fruits are *, *, and *", reg.gsub( string, '*' ) )
|
55
55
|
fruits = { "fruit1" => "apples", "fruit2" => "bananas", "fruit3" => "grapes" }
|
56
|
-
assert_equal( "My favorite fruits are apples, bananas, and grapes", reg.gsub( string ) { |
|
56
|
+
assert_equal( "My favorite fruits are apples, bananas, and grapes", reg.gsub( string ) { |match| fruits[match[1]]} )
|
57
57
|
end
|
58
58
|
|
59
59
|
def test_eql
|
@@ -74,10 +74,23 @@ class ORegexpTestCase < Test::Unit::TestCase
|
|
74
74
|
|
75
75
|
assert_equal( "Upper case\n", result )
|
76
76
|
end
|
77
|
-
|
77
|
+
|
78
|
+
def test_case_eql_compat
|
79
|
+
# === method should not raise when used in case statements
|
80
|
+
a = Time.now
|
81
|
+
result = ""
|
82
|
+
case a
|
83
|
+
when /./ ; result = "rgx"
|
84
|
+
when Oniguruma::ORegexp.new('.'); result = "ore"
|
85
|
+
else; result = "else"
|
86
|
+
end
|
87
|
+
assert_equal( "else", result )
|
88
|
+
end
|
89
|
+
|
78
90
|
def test_operator_match
|
79
91
|
assert_equal( nil, Oniguruma::ORegexp.new( 'SIT' ) =~ "insensitive" )
|
80
92
|
assert_equal( 5, Oniguruma::ORegexp.new( 'SIT', :options => Oniguruma::OPTION_IGNORECASE ) =~ "insensitive" )
|
93
|
+
assert_equal( 5, Oniguruma::ORegexp.new( 'SIT', 'i' ) =~ "insensitive" )
|
81
94
|
end
|
82
95
|
|
83
96
|
# def test_operator_match_2
|
@@ -96,6 +109,8 @@ class ORegexpTestCase < Test::Unit::TestCase
|
|
96
109
|
def test_kcode
|
97
110
|
reg = Oniguruma::ORegexp.new( "(3.)(.*)(3.)" )
|
98
111
|
assert_equal( Oniguruma::ENCODING_ASCII, reg.kcode )
|
112
|
+
reg = Oniguruma::ORegexp.new( "(3.)(.*)(3.)", '', 'SJIS' )
|
113
|
+
assert_equal( Oniguruma::ENCODING_SJIS, reg.kcode )
|
99
114
|
end
|
100
115
|
|
101
116
|
def test_options
|
@@ -106,6 +121,40 @@ class ORegexpTestCase < Test::Unit::TestCase
|
|
106
121
|
string = '(?<=\n)\\.*ocatarinetabelachitchix'
|
107
122
|
assert_equal( string, Oniguruma::ORegexp.new( string ).source )
|
108
123
|
end
|
124
|
+
|
125
|
+
def test_named_sub_backrefs
|
126
|
+
re = Oniguruma::ORegexp.new('(?<pre>\w+?)\d+(?<after>\w+)')
|
127
|
+
assert_equal(' def123abc ', re.sub('abc123def', ' \<after>123\<pre> ') )
|
128
|
+
end
|
129
|
+
|
130
|
+
def test_named_sub_backrefs_dupes
|
131
|
+
re = Oniguruma::ORegexp.new('(?<pre>\w+?)\d+(?<pre>\w+)')
|
132
|
+
assert_equal('123def', re.sub('abc123def', '123\<pre>') )
|
133
|
+
end
|
134
|
+
|
135
|
+
def test_backref_set_for_match
|
136
|
+
re = Oniguruma::ORegexp.new('Date:(\d{4})/(\d{2})/(\d{2})')
|
137
|
+
assert re.match( "Date:2007/03/25" )
|
138
|
+
assert_not_nil $~
|
139
|
+
assert_equal "2007", $1
|
140
|
+
assert_equal "03", $2
|
141
|
+
assert_equal "25", $3
|
142
|
+
end
|
143
|
+
|
144
|
+
def test_backref_set_for_match_op
|
145
|
+
re = Oniguruma::ORegexp.new('Date:(\d{4})/(\d{2})/(\d{2})')
|
146
|
+
assert re =~ "Date:2007/03/25"
|
147
|
+
assert_not_nil $~
|
148
|
+
assert_equal "2007", $1
|
149
|
+
assert_equal "03", $2
|
150
|
+
assert_equal "25", $3
|
151
|
+
end
|
152
|
+
|
153
|
+
def test_multibyte_named_backrefs
|
154
|
+
r = Oniguruma::ORegexp.new('(?<группа>test).+(\k<группа>)', :encoding => Oniguruma::ENCODING_UTF8)
|
155
|
+
assert_equal "should !test!", r.sub("should test this damned test", '!\<группа>!')
|
156
|
+
end
|
157
|
+
|
109
158
|
end
|
110
159
|
|
111
160
|
class MatchDataTestCase < Test::Unit::TestCase
|
@@ -123,6 +172,7 @@ class MatchDataTestCase < Test::Unit::TestCase
|
|
123
172
|
|
124
173
|
def test_begin
|
125
174
|
matches = @reg.match( "THX1138." )
|
175
|
+
assert_equal( 1, matches.begin )
|
126
176
|
assert_equal( 1, matches.begin(0) )
|
127
177
|
assert_equal( 2, matches.begin(2) )
|
128
178
|
end
|
@@ -134,6 +184,7 @@ class MatchDataTestCase < Test::Unit::TestCase
|
|
134
184
|
|
135
185
|
def test_end
|
136
186
|
matches = @reg.match( "THX1138." )
|
187
|
+
assert_equal( 7, matches.end )
|
137
188
|
assert_equal( 7, matches.end(0) )
|
138
189
|
assert_equal( 3, matches.end(2) )
|
139
190
|
end
|
@@ -146,6 +197,7 @@ class MatchDataTestCase < Test::Unit::TestCase
|
|
146
197
|
|
147
198
|
def test_offset
|
148
199
|
matches = @reg.match( "THX1138." )
|
200
|
+
assert_equal( [1, 7], matches.offset )
|
149
201
|
assert_equal( [1, 7], matches.offset(0) )
|
150
202
|
assert_equal( [6, 7], matches.offset(4) )
|
151
203
|
end
|
@@ -189,9 +241,20 @@ class MatchDataTestCase < Test::Unit::TestCase
|
|
189
241
|
def test_match_all
|
190
242
|
reg = Oniguruma::ORegexp.new( 'ca' )
|
191
243
|
matches = reg.match_all( 'ocatacachaca' )
|
244
|
+
a = []
|
245
|
+
matches.each { |m| a << m.offset(0) }
|
246
|
+
assert_equal( [ [1,3], [5,7], [10,12] ], a)
|
192
247
|
assert_equal( 3, matches.size )
|
193
|
-
assert_equal(
|
194
|
-
assert_equal( "ca", matches.string[matches.begin(
|
248
|
+
assert_equal( 10, matches[2].begin( 0 ) )
|
249
|
+
assert_equal( "ca", matches[1].string[matches[1].begin( 0 )...matches[1].end( 0 )])
|
250
|
+
end
|
251
|
+
|
252
|
+
def test_scan
|
253
|
+
reg = Oniguruma::ORegexp.new( 'ca' )
|
254
|
+
a = []
|
255
|
+
matches = reg.match_all( 'ocatacachaca' ) { |m| a << m.offset(0) }
|
256
|
+
#assert_kind_of(Oniguruma::MultiMatchData, matches)
|
257
|
+
assert_equal( [ [1,3], [5,7], [10,12] ], a)
|
195
258
|
end
|
196
259
|
|
197
260
|
def test_match_empty_string
|
@@ -205,12 +268,32 @@ class MatchDataTestCase < Test::Unit::TestCase
|
|
205
268
|
reg = Oniguruma::ORegexp.new( '(?<begin>\()(?<body>.*)(?<end>\))', :options => Oniguruma::OPTION_MULTILINE )
|
206
269
|
matches = reg.match( "blah (content) blah" )
|
207
270
|
assert_not_nil( matches )
|
271
|
+
assert_equal $~, matches
|
208
272
|
assert_equal( '(', matches[:begin] )
|
209
273
|
assert_equal( 'content', matches[:body] )
|
210
274
|
assert_equal( ')', matches[:end] )
|
211
275
|
assert_equal( nil, matches[:inexistent])
|
212
276
|
end
|
213
277
|
|
278
|
+
def test_multibyte_named_backrefs
|
279
|
+
r = Oniguruma::ORegexp.new('(?<имя>test).+(\k<имя>)', :encoding => Oniguruma::ENCODING_UTF8)
|
280
|
+
assert_equal "should TEST", r.sub("should test this damned test") {|m| m[:"имя"].upcase }
|
281
|
+
end
|
282
|
+
|
283
|
+
def test_no_named_backrefs
|
284
|
+
r = Oniguruma::ORegexp.new('(.+).+(.+)')
|
285
|
+
r.match("text")
|
286
|
+
assert_not_nil $~
|
287
|
+
assert_equal 0, $~.instance_variables.size
|
288
|
+
r = Oniguruma::ORegexp.new('(?<a>.+).+(?<b>.+)')
|
289
|
+
r.match("text")
|
290
|
+
assert_not_nil $~
|
291
|
+
assert_equal 1, $~.instance_variables.size
|
292
|
+
|
293
|
+
end
|
294
|
+
|
295
|
+
# casefolding for full Unicode set is not present in versions prior to 5.
|
296
|
+
if Oniguruma::VERSION >= '5.0.0'
|
214
297
|
def test_utf8_ignore_case
|
215
298
|
reg = Oniguruma::ORegexp.new( '([а-я])+', :options => Oniguruma::OPTION_IGNORECASE, :encoding => Oniguruma::ENCODING_UTF8 )
|
216
299
|
matches = reg.match("Text: Ехал Грека Через Реку")
|
@@ -222,16 +305,17 @@ class MatchDataTestCase < Test::Unit::TestCase
|
|
222
305
|
|
223
306
|
def test_utf8_gsub
|
224
307
|
reg = Oniguruma::ORegexp.new( '([а-я])([а-я])([а-я]+)', :options => Oniguruma::OPTION_IGNORECASE, :encoding => Oniguruma::ENCODING_UTF8 )
|
225
|
-
new_str = reg.gsub("Text: Ехал Грека Через Реку") {|
|
308
|
+
new_str = reg.gsub("Text: Ехал Грека Через Реку") {|m| m[1]*2+m[2]*2+m[3] }
|
226
309
|
assert_equal("Text: ЕЕххал ГГррека ЧЧеерез РРееку", new_str)
|
227
310
|
end
|
228
311
|
|
229
312
|
def test_utf8_gsub2
|
230
313
|
reg = Oniguruma::ORegexp.new( '[а-я]', :options => Oniguruma::OPTION_IGNORECASE, :encoding => Oniguruma::ENCODING_UTF8 )
|
231
|
-
new_str = reg.gsub("Text: Ехал Грека Через Реку") {|
|
314
|
+
new_str = reg.gsub("Text: Ехал Грека Через Реку") {|m| m[0]*2 }
|
232
315
|
assert_equal("Text: ЕЕххаалл ГГррееккаа ЧЧеерреезз РРееккуу", new_str)
|
233
316
|
end
|
234
|
-
|
317
|
+
end
|
318
|
+
|
235
319
|
def test_sub_compatibility
|
236
320
|
$x = "a.gif"
|
237
321
|
assert_equal("b.gif", $x.osub('.*\.([^\.]+)$', 'b.\1'))
|
@@ -242,36 +326,36 @@ class MatchDataTestCase < Test::Unit::TestCase
|
|
242
326
|
assert_equal("<a.gif>", $x.osub('.*\.([^\.]+)$', '<\&>'))
|
243
327
|
assert_equal("a.a.", $x.osub('(gif)', '\`') )
|
244
328
|
end
|
245
|
-
|
246
|
-
class ::String
|
247
|
-
def ogsub(*args)
|
248
|
-
Oniguruma::ORegexp.new(args.shift).gsub(self, *args)
|
249
|
-
end
|
250
|
-
def ogsub!(*args)
|
251
|
-
Oniguruma::ORegexp.new(args.shift).gsub!(self, *args)
|
252
|
-
end
|
253
|
-
def osub(re, *args)
|
254
|
-
Oniguruma::ORegexp.new( re ).sub(self, *args)
|
255
|
-
end
|
256
|
-
end
|
257
329
|
|
258
330
|
def test_gsub_compat
|
259
331
|
assert_equal("hello".ogsub('[aeiou]', '*') , "h*ll*")
|
260
332
|
assert_equal("hello".ogsub('([aeiou])', '<\1>') , "h<e>ll<o>")
|
261
333
|
i = 0
|
262
|
-
assert_equal("12345" , Oniguruma::ORegexp.new('.').gsub("hello") {|
|
263
|
-
assert_equal("214365", Oniguruma::ORegexp.new('(.)(.)').gsub("123456") {|
|
334
|
+
assert_equal("12345" , Oniguruma::ORegexp.new('.').gsub("hello") {|m| i+=1; i.to_s})
|
335
|
+
assert_equal("214365", Oniguruma::ORegexp.new('(.)(.)').gsub("123456") {|m| m[2] + m[1] })
|
264
336
|
a = "test"
|
265
337
|
a.ogsub!('t', a)
|
266
338
|
assert_equal("testestest", a)
|
267
339
|
end
|
268
340
|
|
269
341
|
def test_match_compat
|
270
|
-
t = Oniguruma::ORegexp.new('(.)(.)').gsub("123456") {|
|
342
|
+
t = Oniguruma::ORegexp.new('(.)(.)').gsub("123456") {|m| "#$2#$1" }
|
271
343
|
assert_equal("214365", t )
|
272
|
-
t = Oniguruma::ORegexp.new('([aeiou])').gsub("hello") {|
|
344
|
+
t = Oniguruma::ORegexp.new('([aeiou])').gsub("hello") {|m| "<#$1>" }
|
273
345
|
assert_equal( "h<e>ll<o>", t)
|
274
346
|
end
|
275
347
|
|
348
|
+
def _u16(str)
|
349
|
+
str.unpack("U*").pack("n*")
|
350
|
+
end
|
351
|
+
puts Oniguruma::VERSION
|
352
|
+
if Oniguruma::VERSION >= '4.0.0'
|
353
|
+
def test_utf16_gsub
|
354
|
+
r = Oniguruma::ORegexp.new( _u16('[aeiou]'), :encoding => Oniguruma::ENCODING_UTF16_BE)
|
355
|
+
assert_equal( _u16("h*ll*"), r.gsub( _u16("hello"), _u16('*')) )
|
356
|
+
r = Oniguruma::ORegexp.new( _u16('([aeiou])'), :encoding => Oniguruma::ENCODING_UTF16_BE)
|
357
|
+
assert_equal( _u16("h<e>\\ll<o>\\"), r.gsub( _u16("hello"), _u16('<\1>\\')) )
|
358
|
+
end
|
359
|
+
end
|
276
360
|
|
277
361
|
end
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
|
|
3
3
|
specification_version: 1
|
4
4
|
name: oniguruma
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.
|
7
|
-
date: 2007-03-
|
6
|
+
version: 1.0.0
|
7
|
+
date: 2007-03-27 00:00:00 +02:00
|
8
8
|
summary: Bindings for the oniguruma regular expression library
|
9
9
|
require_paths:
|
10
10
|
- lib
|