oniguruma 0.9.0 → 0.9.1

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,4 +1,10 @@
1
- == 1.0.0 / 2007-03-19
1
+ == 0.9.1 / 2007-03-25
2
+ * FIX: Buggy resolution of numeric codes for encoding and syntax options (Nikolai Lugovoi)
3
+ * FIX: Buggy implementation of ORegexp#sub and ORegexp#gsub methods. Now code is all C (Nikolai Lugovoi)
4
+ * Added documentation for class ORegexp
5
+ * Added regexp syntax documentation.
6
+
7
+ == 0.9.0 / 2007-03-19
2
8
 
3
9
  * 1 major enhancement
4
10
  * Birthday!
data/Manifest.txt CHANGED
@@ -1,6 +1,7 @@
1
1
  History.txt
2
2
  Manifest.txt
3
3
  README.txt
4
+ Syntax.txt
4
5
  Rakefile
5
6
  lib/oniguruma.rb
6
7
  ext/oregexp.c
data/README.txt CHANGED
@@ -16,6 +16,10 @@ Ruby bindings to the Oniguruma[http://www.geocities.jp/kosako3/oniguruma/] regul
16
16
  puts match[0] <= 'terraforming'
17
17
  puts match[:before] <= 'terr'
18
18
  puts match[:after] <= 'forming'
19
+
20
+ == SYNTAX
21
+
22
+ Consult the Syntax.txt[link:files/Syntax_txt.html] page.
19
23
 
20
24
  == REQUIREMENTS:
21
25
 
@@ -39,8 +43,9 @@ sudo gem install -r oniguruma
39
43
 
40
44
  == CREDITS:
41
45
 
42
- * K.Kosako, for his great library.
43
- * A lot of the documentation has been copied from the orininal Ruby Regex documentation.
46
+ * N. Lugovoi. ORegexp.sub and ORegexp.gsub code, plus other patches.
47
+ * K. Kosako. For his great library.
48
+ * A lot of the documentation has been copied from the original Ruby Regex documentation.
44
49
 
45
50
  == LICENSE:
46
51
 
data/Rakefile CHANGED
@@ -3,7 +3,7 @@ require 'hoe'
3
3
 
4
4
  class Hoe; def extra_deps; @extra_deps.reject { |x| Array(x).first == 'hoe' }; end end
5
5
 
6
- Hoe.new('oniguruma', '0.9.0') do |p|
6
+ Hoe.new('oniguruma', '0.9.1') do |p|
7
7
  p.rubyforge_name = 'oniguruma'
8
8
  p.author = 'Dizan Vasquez'
9
9
  p.email = 'dix_ans@yahoo.com'
data/Syntax.txt ADDED
@@ -0,0 +1,396 @@
1
+ = RUBY REGULAR EXPRESSION SYNTAX
2
+
3
+
4
+ == Syntax Elements
5
+
6
+ [\] escape (enable or disable meta character meaning)
7
+ [|] alternation
8
+ [(...)] group
9
+ [[...]] character class
10
+
11
+
12
+ == Characters
13
+
14
+ [\t] horizontal tab (0x09)
15
+ [\v] vertical tab (0x0B)
16
+ [\n] newline (0x0A)
17
+ [\r] return (0x0D)
18
+ [\b] back space (0x08)
19
+
20
+ \b is effective in character class [...] only
21
+ [\f] form feed (0x0C)
22
+ [\a] bell (0x07)
23
+ [\e] escape (0x1B)
24
+ [\nnn] octal char (encoded byte value)
25
+ [\xHH] hexadecimal char (encoded byte value)
26
+ [\x{7HHHHHHH}] wide hexadecimal char (character code point value)
27
+ [\cx] control char (character code point value)
28
+ [\C-x] control char (character code point value)
29
+ [\M-x] meta (x|0x80) (character code point value)
30
+ [\M-\C-x] meta control char (character code point value)
31
+
32
+
33
+
34
+ == Character types
35
+
36
+ [.] any character (except newline)
37
+ [\w] word character
38
+
39
+ Not Unicode:
40
+ * alphanumeric, "_" and multibyte char.
41
+ Unicode:
42
+ * General_Category -- (Letter|Mark|Number|Connector_Punctuation)
43
+ [\W] non word char
44
+ [\s] whitespace char
45
+
46
+ Not Unicode:
47
+ * \t, \n, \v, \f, \r, \x20
48
+ Unicode:
49
+ * 0009, 000A, 000B, 000C, 000D, 0085(NEL),
50
+ * General_Category:
51
+ * -- Line_Separator
52
+ * -- Paragraph_Separator
53
+ * -- Space_Separator
54
+ [\S] non whitespace char
55
+ [\d] decimal digit char
56
+
57
+ Unicode: General_Category -- Decimal_Number
58
+ [\D] non decimal digit char
59
+ [\h] hexadecimal digit char [0-9a-fA-F]
60
+ [\H] non hexadecimal digit char
61
+
62
+
63
+ == Character Properties
64
+
65
+ \p{property-name}
66
+ \p{^property-name} (negative)
67
+ \P{property-name} (negative)
68
+
69
+ === property-name:
70
+
71
+ Works on all encodings:
72
+ * Alnum, Alpha, Blank, Cntrl, Digit, Graph, Lower,
73
+ Print, Punct, Space, Upper, XDigit, Word, ASCII,
74
+ Works on EUC_JP, Shift_JIS:
75
+ * Hiragana, Katakana
76
+ Works on UTF8, UTF16, UTF32:
77
+ * Any, Assigned, C, Cc, Cf, Cn, Co, Cs, L, Ll, Lm, Lo, Lt, Lu,
78
+ M, Mc, Me, Mn, N, Nd, Nl, No, P, Pc, Pd, Pe, Pf, Pi, Po, Ps,
79
+ S, Sc, Sk, Sm, So, Z, Zl, Zp, Zs,
80
+ Arabic, Armenian, Bengali, Bopomofo, Braille, Buginese,
81
+ Buhid, Canadian_Aboriginal, Cherokee, Common, Coptic,
82
+ Cypriot, Cyrillic, Deseret, Devanagari, Ethiopic, Georgian,
83
+ Glagolitic, Gothic, Greek, Gujarati, Gurmukhi, Han, Hangul,
84
+ Hanunoo, Hebrew, Hiragana, Inherited, Kannada, Katakana,
85
+ Kharoshthi, Khmer, Lao, Latin, Limbu, Linear_B, Malayalam,
86
+ Mongolian, Myanmar, New_Tai_Lue, Ogham, Old_Italic, Old_Persian,
87
+ Oriya, Osmanya, Runic, Shavian, Sinhala, Syloti_Nagri, Syriac,
88
+ Tagalog, Tagbanwa, Tai_Le, Tamil, Telugu, Thaana, Thai, Tibetan,
89
+ Tifinagh, Ugaritic, Yi
90
+
91
+ == Quantifiers
92
+
93
+ === Greedy
94
+
95
+ [?] 1 or 0 times
96
+ [*] 0 or more times
97
+ [+] 1 or more times
98
+ [{n,m}] at least n but not more than m times
99
+ [{n,}] at least n times
100
+ [{,n}] at least 0 but not more than n times ({0,n})
101
+ [{n}] n times
102
+
103
+ === Reluctant
104
+
105
+ [??] 1 or 0 times
106
+ [*?] 0 or more times
107
+ [+?] 1 or more times
108
+ [{n,m}?] at least n but not more than m times
109
+ [{n,}?] at least n times
110
+ [{,n}?] at least 0 but not more than n times (== {0,n}?)
111
+
112
+ === Possessive (greedy and does not backtrack after repeated)
113
+
114
+ [?+] 1 or 0 times
115
+ [*+] 0 or more times
116
+ [++] 1 or more times
117
+
118
+ ({n,m}+, {n,}+, {n}+ are possessive op. in ONIG_SYNTAX_JAVA only)
119
+
120
+
121
+ == Anchors
122
+
123
+ [^] beginning of the line
124
+ [$] end of the line
125
+ [\b] word boundary
126
+ [\B] not word boundary
127
+ [\A] beginning of string
128
+ [\Z] end of string, or before newline at the end
129
+ [\z] end of string
130
+ [\G] matching start position
131
+
132
+
133
+ == Character class
134
+
135
+ [^...] negative class (lowest precedence operator)
136
+ [x-y] range from x to y
137
+ [[...]] set (character class in character class)
138
+ [..&&..] intersection (low precedence at the next of ^)
139
+
140
+ If you want to use '[', '-', ']' as a normal character
141
+ in a character class, you should escape these characters by '\'.
142
+
143
+
144
+ POSIX bracket ([:xxxxx:], negate [:^xxxxx:])
145
+
146
+ === Not Unicode Case:
147
+
148
+ [alnum] alphabet or digit char
149
+ [alpha] alphabet
150
+ [ascii] code value: [0 - 127]
151
+ [blank] \t, \x20
152
+ [cntrl] control
153
+ [digit] 0-9
154
+ [graph] include all of multibyte encoded characters
155
+ [lower] lower case
156
+ [print] include all of multibyte encoded characters
157
+ [punct] punctuation
158
+ [space] \t, \n, \v, \f, \r, \x20
159
+ [upper] upper case
160
+ [xdigit] 0-9, a-f, A-F
161
+ [word] alphanumeric, "_" and multibyte characters
162
+
163
+
164
+ === Unicode Case:
165
+
166
+ [alnum] Letter | Mark | Decimal_Number
167
+ [alpha] Letter | Mark
168
+ [ascii] 0000 - 007F
169
+ [blank] Space_Separator | 0009
170
+ [cntrl] Control | Format | Unassigned | Private_Use | Surrogate
171
+ [digit] Decimal_Number
172
+ [graph] [[:^space:]] && ^Control && ^Unassigned && ^Surrogate
173
+ [lower] Lowercase_Letter
174
+ [print] [[:graph:]] | [[:space:]]
175
+ [punct] Connector_Punctuation | Dash_Punctuation | Close_Punctuation |
176
+ Final_Punctuation | Initial_Punctuation | Other_Punctuation |
177
+ Open_Punctuation
178
+ [space] Space_Separator | Line_Separator | Paragraph_Separator |
179
+ 0009 | 000A | 000B | 000C | 000D | 0085
180
+ [upper] Uppercase_Letter
181
+ [xdigit] 0030 - 0039 | 0041 - 0046 | 0061 - 0066
182
+ (0-9, a-f, A-F)
183
+ [word] Letter | Mark | Decimal_Number | Connector_Punctuation
184
+
185
+
186
+
187
+ == Extended groups
188
+
189
+ [(?#...)] comment
190
+ [(?imx-imx)] option on/off:
191
+ * i: ignore case
192
+ * m: multi-line (dot(.) match newline)
193
+ * x: extended form
194
+ [(?imx-imx:subexp)] option on/off for subexp
195
+ [(?:subexp)] not captured group
196
+ [(subexp)] captured group
197
+ [(?=subexp)] look-ahead
198
+ [(?!subexp)] negative look-ahead
199
+ [(?<=subexp)] look-behind
200
+ [(?<!subexp)] negative look-behind
201
+
202
+ Subexp of look-behind must be fixed character length.
203
+ But different character length is allowed in top level
204
+ alternatives only.
205
+ ex. (?<=a|bc) is OK. (?<=aaa(?:b|cd)) is not allowed.
206
+
207
+ In negative-look-behind, captured group isn't allowed,
208
+ but shy group(?:) is allowed.
209
+ [(?>subexp)] atomic group
210
+ don't backtrack in subexp.
211
+ [(?<name>subexp)] define named group
212
+ (All characters of the name must be a word character.)
213
+
214
+ Not only a name but a number is assigned like a captured
215
+ group.
216
+
217
+ Assigning the same name as two or more subexps is allowed.
218
+ In this case, a subexp call can not be performed although
219
+ the back reference is possible.
220
+
221
+
222
+ == Back reference
223
+
224
+ [\n] back reference by group number (n >= 1)
225
+ [\k<name>] back reference by group name
226
+ In the back reference by the multiplex definition name,
227
+ a subexp with a large number is referred to preferentially.
228
+ (When not matched, a group of the small number is referred to.)
229
+
230
+ * Back reference by group number is forbidden if named group is defined
231
+ in the pattern and ONIG_OPTION_CAPTURE_GROUP is not set.
232
+
233
+
234
+ === Back reference with nest level
235
+
236
+ [\k<name+n>] n: 0, 1, 2, ...
237
+ [\k<name-n>] n: 0, 1, 2, ...
238
+
239
+ Designates the nest level relative to the back reference position.
240
+
241
+ Examples:
242
+ /\A(?<a>|.|(?:(?<b>.)\g<a>\k<b+0>))\z/.match("reer")
243
+
244
+ r = ORegexp.compile(<<'__REGEXP__'.strip, :options => Oniguruma::EXTENDED)
245
+ (?<element> \g<stag> \g<content>* \g<etag> ){0}
246
+ (?<stag> < \g<name> \s* > ){0}
247
+ (?<name> [a-zA-Z_:]+ ){0}
248
+ (?<content> [^<&]+ (\g<element> | [^<&]+)* ){0}
249
+ (?<etag> </ \k<name+1> >){0}
250
+ \g<element>
251
+ __REGEXP__
252
+
253
+ p r.match('<foo>f<bar>bbb</bar>f</foo>').captures
254
+
255
+
256
+
257
+ === Subexp call ("Tanaka Akira special")
258
+
259
+ [\g<name>] call by group name
260
+ [\g<n>] call by group number (n >= 1)
261
+
262
+ * left-most recursive call is not allowed.
263
+
264
+ Example:
265
+ (?<name>a|\g<name>b) => error
266
+ (?<name>a|b\g<name>c) => OK
267
+ * Call by group number is forbidden if named group is defined in the pattern
268
+ and Oniguruma::OPTION_CAPTURE_GROUP is not set.
269
+ * If the option status of called group is different from calling position
270
+ then the group's option is effective.
271
+
272
+ Example:
273
+ (?-i:\g<name>)(?i:(?<name>a)){0} <i>matches "A"</i>
274
+
275
+
276
+ == Captured group
277
+
278
+ Behavior of the no-named group (...) changes with the following conditions.
279
+ (But named group is not changed.)
280
+
281
+ [case 1] <code>ORegexp.new( '...' )</code> (named group is not used, no option)
282
+
283
+ (...) is treated as a captured group.
284
+ [case 2] <code>ORegexp.new( '...', :options => OPTION_DONT_CAPTURE_GROUP )</code> (named group is not used, 'g' option)
285
+
286
+ (...) is treated as a no-captured group (?:...).
287
+
288
+ [case 3] <code>ORegexp.new( '...(?<name>...)...' )</code> (named group is used, no option)
289
+
290
+ (...) is treated as a no-captured group (?:...).
291
+
292
+ numbered-backref/call is not allowed.
293
+
294
+ [case 4] <code>ORegexp.new( '...(?<name>...)...', :options => OPTION_CAPTURE_GROUP )</code> (named group is used, 'G' option)
295
+
296
+ (...) is treated as a captured group.
297
+
298
+ numbered-backref/call is allowed.
299
+
300
+ where
301
+ * g: OPTION_DONT_CAPTURE_GROUP
302
+ * G: OPTION_CAPTURE_GROUP
303
+
304
+ ('g' and 'G' options are discussed in the ruby-dev ML)
305
+
306
+
307
+ == Syntax dependent options
308
+
309
+ === ONIG_SYNTAX_RUBY
310
+
311
+ [(?m)] dot(.) match newline
312
+
313
+ === ONIG_SYNTAX_PERL and ONIG_SYNTAX_JAVA
314
+
315
+ [(?s)] dot(.) match newline
316
+ [(?m)] ^ match after newline, $ match before newline
317
+
318
+ == Original extensions
319
+
320
+ * hexadecimal digit char type \h, \H
321
+ * named group (?<name>...)
322
+ * named backref \k<name>
323
+ * subexp call \g<name>, \g<group-num>
324
+
325
+
326
+ == Lacking features compared with Perl 5.8.0
327
+
328
+ * \N{name}
329
+ * \l,\u,\L,\U, \X, \C
330
+ * (?{code})
331
+ * (??{code})
332
+ * (?(condition)yes-pat|no-pat)
333
+ * \Q...\E
334
+
335
+ This is effective on ONIG_SYNTAX_PERL and ONIG_SYNTAX_JAVA.
336
+
337
+
338
+ == Differences with Japanized GNU regex(version 0.12) of Ruby 1.8
339
+
340
+ * add character property (\p{property}, \P{property})
341
+ * add hexadecimal digit char type (\h, \H)
342
+ * add look-behind
343
+
344
+ (?<=fixed-char-length-pattern), (?<!fixed-char-length-pattern)
345
+ * add possessive quantifier. ?+, *+, ++
346
+ * add operations in character class. [], &&
347
+
348
+ ('[' must be escaped to be used as a usual char in a character class.)
349
+ * add named group and subexp call.
350
+ * octal or hexadecimal number sequence can be treated as
351
+ a multibyte code char in character class if multibyte encoding
352
+ is specified.
353
+
354
+ (ex. <code>[\xa1\xa2], [\xa1\xa7-\xa4\xa1]</code>)
355
+ * allow the range of single byte char and multibyte char in character
356
+ class.
357
+
358
+ ex. <code>[a-<<any EUC-JP character>>]</code> in EUC-JP encoding.
359
+ * effect range of isolated option is to next ')'.
360
+ ex. (?:(?i)a|b) is interpreted as (?:(?i:a|b)), not (?:(?i:a)|b).
361
+ * isolated option is not transparent to previous pattern.
362
+ ex. <code>a(?i)*</code> is a syntax error pattern.
363
+ * allow an incomplete left brace as a usual string.
364
+ ex. /{/, /({)/, /a{2,3/ etc...
365
+ * negative POSIX bracket [:^xxxx:] is supported.
366
+ * POSIX bracket [:ascii:] is added.
367
+ * repeat of look-ahead is not allowed.
368
+ ex. <code>(?=a)*</code>, <code>(?!b){5}</code>
369
+ * Ignore case option is effective to numbered character.
370
+ ex. <code>/\x61/i =~ "A"</code>
371
+ * In the range quantifier, the number of the minimum is omissible.
372
+
373
+ <code>/a{,n}/ == /a{0,n}/</code>
374
+
375
+ The simultaneous abbreviation of the number of times of the minimum
376
+ and the maximum is not allowed. (/a{,}/)
377
+ * <code>a{n}?</code> is not a non-greedy operator.
378
+ <code>/a{n}?/ == /(?:a{n})?/</code>
379
+ * invalid back reference is checked and causes an error.
380
+ /\1/, /(a)\2/
381
+ * Zero-length match in infinite repeat stops the repeat,
382
+ then changes of the capture group status are checked as stop condition.
383
+ /(?:()|())*\1\2/ =~ ""
384
+ /(?:\1a|())*/ =~ "a"
385
+
386
+
387
+ == Problems
388
+
389
+ * Invalid encoding byte sequence is not checked in UTF-8.
390
+
391
+ * Invalid first byte is treated as a character.
392
+ /./u =~ "\xa3"
393
+
394
+ * Incomplete byte sequence is not checked.
395
+ /\w+/ =~ "a\xf3\x8ec"
396
+
data/ext/extconf.rb CHANGED
@@ -1,3 +1,4 @@
1
1
  require 'mkmf'
2
2
  have_library("onig")
3
+ $CFLAGS='-Wall'
3
4
  create_makefile( "oregexp" )
data/ext/oregexp.c CHANGED
@@ -24,7 +24,10 @@ static VALUE oregexp_allocate( VALUE klass ) {
24
24
  }
25
25
 
26
26
 
27
- static OnigEncodingType * int2encoding( int index ) {
27
+ static OnigEncodingType * int2encoding( VALUE v_index ) {
28
+ int index;
29
+ if( ! NIL_P(v_index) ) {
30
+ index = FIX2INT(v_index);
28
31
  switch( index ) {
29
32
  case 0: return ONIG_ENCODING_ASCII;
30
33
  case 1: return ONIG_ENCODING_ISO_8859_1;
@@ -60,10 +63,14 @@ static OnigEncodingType * int2encoding( int index ) {
60
63
  case 31: return ONIG_ENCODING_GB18030;
61
64
  case 32: return ONIG_ENCODING_UNDEF;
62
65
  }
66
+ }
63
67
  return ONIG_ENCODING_UNDEF;
64
68
  }
65
69
 
66
- static OnigSyntaxType * int2syntax( int index ) {
70
+ static OnigSyntaxType * int2syntax( VALUE v_index ) {
71
+ int index;
72
+ if( ! NIL_P(v_index) ) {
73
+ index = FIX2INT(v_index);
67
74
  switch( index ) {
68
75
  case 0: return ONIG_SYNTAX_ASIS;
69
76
  case 1: return ONIG_SYNTAX_POSIX_BASIC;
@@ -77,25 +84,32 @@ static OnigSyntaxType * int2syntax( int index ) {
77
84
  case 9: return ONIG_SYNTAX_RUBY;
78
85
  case 10: return ONIG_SYNTAX_DEFAULT;
79
86
  }
87
+ }
80
88
  return ONIG_SYNTAX_DEFAULT;
81
89
  }
82
90
 
91
+ struct callback_packet {
92
+ VALUE hash;
93
+ OnigRegion * region;
94
+ };
95
+
83
96
  static int name_callback(
84
97
  const UChar* name,
85
98
  const UChar* name_end,
86
99
  int ngroup_num,
87
100
  int* group_nums,
88
101
  regex_t* reg,
89
- void* arg
102
+ struct callback_packet* arg
90
103
  ) {
91
104
  int i, gn, ref;
92
- OnigRegion *region = (OnigRegion* )arg;
105
+ OnigRegion *region = arg->region;
106
+ VALUE nameHash = arg->hash;
93
107
 
94
108
  for (i = 0; i < ngroup_num; i++) {
95
109
  gn = group_nums[i];
96
110
  ref = onig_name_to_backref_number(reg, name, name_end, region);
97
111
  if (ref != gn )
98
- rb_raise(rb_eException, "Oniguruma Error: group and backreference names are different");
112
+ return 1;
99
113
  rb_hash_aset( nameHash, ID2SYM(rb_intern(name)), INT2FIX( gn ) );
100
114
  }
101
115
  return 0;
@@ -110,13 +124,16 @@ static VALUE oregexp_initialize( VALUE self, VALUE pattern, VALUE options ) {
110
124
  rb_iv_set( self, "@options", options );
111
125
  UChar* pat_ptr = RSTRING(pattern_str)->ptr;
112
126
  int pat_len = RSTRING(pattern_str)->len;
127
+ if( pat_len == 0 ) {
128
+ rb_raise(rb_eArgError, "Empty pattern makes no sense.");
129
+ }
113
130
 
114
131
  VALUE rOptions = rb_hash_aref( options, ID2SYM( rb_intern( "options" ) ) );
115
132
  VALUE rEncoding = rb_hash_aref( options, ID2SYM( rb_intern( "encoding" ) ) );
116
133
  VALUE rSyntax = rb_hash_aref( options, ID2SYM( rb_intern( "syntax" ) ) );
117
134
  int iOptions = NUM2INT( rOptions );
118
- int iEncoding = int2encoding( rEncoding );
119
- int iSyntax = int2syntax( rSyntax );
135
+ OnigEncodingType * iEncoding = int2encoding( rEncoding );
136
+ OnigSyntaxType * iSyntax = int2syntax( rSyntax );
120
137
 
121
138
 
122
139
  int r;
@@ -130,6 +147,40 @@ static VALUE oregexp_initialize( VALUE self, VALUE pattern, VALUE options ) {
130
147
  return self;
131
148
  }
132
149
 
150
+ struct RMatch {
151
+ struct RBasic basic;
152
+ VALUE str;
153
+ struct re_registers *regs;
154
+ };
155
+
156
+ static VALUE oregexp_make_match_data(ORegexp * oregexp, OnigRegion * region, VALUE string_str) {
157
+ VALUE rb_cMatch = rb_const_get(rb_cObject, rb_intern("MatchData")) ;
158
+ NEWOBJ(match, struct RMatch);
159
+ OBJSETUP(match, rb_cMatch, T_MATCH);
160
+ VALUE kORegexp = rb_const_get( mOniguruma, rb_intern( "ORegexp" ) ) ;
161
+ int i , count = region->num_regs;
162
+ struct callback_packet packet;
163
+
164
+ match->str = rb_str_new4(string_str);
165
+ match->regs = ALLOC(struct re_registers);
166
+ match->regs->allocated = count+1;
167
+ match->regs->num_regs = count;
168
+ match->regs->beg = ALLOC_N(int, (count+1));
169
+ match->regs->end = ALLOC_N(int, (count+1));
170
+
171
+ for ( i = 0; i <= count; i++){
172
+ match->regs->beg[i] = region->beg[i];
173
+ match->regs->end[i] = region->end[i];
174
+ }
175
+ rb_cv_set( kORegexp, "@@last_match", (VALUE)match );
176
+ packet.region = region;
177
+ packet.hash = rb_hash_new();
178
+ if( onig_foreach_name(oregexp->reg, name_callback, &packet) )
179
+ rb_raise(rb_eException, "Oniguruma Error: group and backreference names are different");
180
+ rb_iv_set((VALUE)match, "@named_captures", packet.hash);
181
+ return (VALUE)match;
182
+ }
183
+
133
184
  /*
134
185
  * call-seq:
135
186
  * rxp.match(str) => matchdata or nil
@@ -151,25 +202,7 @@ static VALUE oregexp_match( VALUE self, VALUE string ) {
151
202
  OnigRegion *region = onig_region_new();
152
203
  int r = onig_search(oregexp->reg, str_ptr, str_ptr + str_len, str_ptr, str_ptr + str_len, region, ONIG_OPTION_NONE);
153
204
  if (r >= 0) {
154
-
155
- VALUE begins = rb_ary_new();
156
- VALUE ends = rb_ary_new();
157
- nameHash = rb_hash_new();
158
-
159
- onig_foreach_name(oregexp->reg, name_callback, (void* )region);
160
-
161
-
162
- int i;
163
-
164
- for (i = 0; i < region->num_regs; i++) {
165
- rb_ary_push( begins, INT2FIX( region->beg[i] ) );
166
- rb_ary_push( ends, INT2FIX( region->end[i] ) );
167
- }
168
- VALUE kMatchData = rb_const_get( mOniguruma, rb_intern( "MatchData" ) );
169
- VALUE kORegexp = rb_const_get( mOniguruma, rb_intern( "ORegexp" ) );
170
- VALUE matchData = rb_funcall(kMatchData, rb_intern("new"), 4, string_str, begins, ends, nameHash );
171
- rb_cv_set( kORegexp, "@@last_match", matchData );
172
-
205
+ VALUE matchData = oregexp_make_match_data( oregexp, region, string_str);
173
206
  onig_region_free(region, 1 );
174
207
  return matchData;
175
208
  } else if (r == ONIG_MISMATCH) {
@@ -184,11 +217,267 @@ static VALUE oregexp_match( VALUE self, VALUE string ) {
184
217
 
185
218
  }
186
219
 
220
+ static const UChar BACKSLASH = 0x5c;
221
+
222
+ /* Additional backslash sequences work in substitution strings: \& (last match), \+ (last
223
+ matched group), \` (string prior to match), \' (string after match), and \\ (a literal
224
+ backslash). */
225
+
226
+ /* scan the replacement text, looking for substitutions (\n) and \escapes. */
227
+ static VALUE
228
+ oregexp_get_replacement(pat, src_text, repl_text, region)
229
+ VALUE pat,
230
+ src_text,
231
+ repl_text;
232
+ OnigRegion * region;
233
+ {
234
+ ORegexp *oregexp;
235
+ VALUE ret;
236
+ int32_t replIdx = 0;
237
+ int32_t replacementLength = RSTRING(repl_text)->len;
238
+ UChar *replacementText = RSTRING(repl_text)->ptr;
239
+ UChar *replacementEnd = replacementText + (replacementLength-1);
240
+ long numDigits = 0;
241
+ long groupNum = 0, g_start, g_end;
242
+ OnigCodePoint digitC;
243
+ OnigEncoding enc;
244
+ const UChar * matchText;
245
+ long matchLen;
246
+
247
+ matchText = RSTRING(src_text)->ptr;
248
+ matchLen = RSTRING(src_text)->len;
249
+ Data_Get_Struct( pat, ORegexp, oregexp );
250
+ enc = onig_get_encoding( oregexp->reg );
251
+
252
+ ret = rb_str_buf_new(RSTRING(repl_text)->len);
253
+
254
+ while (replIdx < replacementLength) {
255
+ OnigCodePoint c = ONIGENC_MBC_TO_CODE(enc, replacementText+replIdx, replacementEnd);
256
+ int c_len =ONIGENC_MBC_ENC_LEN(enc, replacementText+replIdx) ;
257
+ replIdx += c_len;
258
+ if ( c != BACKSLASH) {
259
+ /* Common case, no substitution, no escaping, */
260
+ /* just copy the char to the dest buf. */
261
+ rb_str_buf_cat( ret, replacementText+replIdx-c_len, c_len);
262
+ continue;
263
+ }
264
+ if (replIdx >= replacementLength) {
265
+ rb_str_buf_cat(ret, replacementText+(replIdx-c_len), c_len);
266
+ break;
267
+ }
268
+ /* Pick up a capture group number if one follows. */
269
+ numDigits = 0;
270
+ groupNum = 0;
271
+ for (;;) {
272
+ if (replIdx >= replacementLength) {
273
+ break;
274
+ }
275
+ digitC = ONIGENC_MBC_TO_CODE(enc, replacementText+replIdx, replacementEnd);
276
+ c_len = ONIGENC_MBC_ENC_LEN(enc, replacementText+replIdx) ;
277
+ if ( ! ONIGENC_IS_CODE_DIGIT(enc, digitC) ) {
278
+ break;
279
+ }
280
+ replIdx += c_len;
281
+ groupNum=groupNum*10 + (digitC - '0');
282
+ numDigits++;
283
+ if (numDigits >= 2) { /* limit 99 groups */
284
+ break;
285
+ }
286
+ }
287
+ if (numDigits == 0) {
288
+ /* Additional backslash sequences work in substitution strings: \& (last match), \+ (last
289
+ matched group), \` (string prior to match), \' (string after match), and \\ (a literal
290
+ backslash). */
291
+ int p_len = c_len;
292
+ c = ONIGENC_MBC_TO_CODE(enc, replacementText+replIdx, replacementEnd);
293
+ c_len = ONIGENC_MBC_ENC_LEN(enc, replacementText+replIdx) ;
294
+ switch(c) {
295
+ case '&' : // matched substring
296
+ rb_str_buf_cat(ret, matchText+region->beg[0], region->end[0] - region->beg[0]);
297
+ replIdx += c_len;
298
+ break;
299
+ case '`' : // prematch
300
+ rb_str_buf_cat(ret, matchText, region->beg[0]);
301
+ replIdx += c_len;
302
+ break;
303
+ case '\'': // postmatch
304
+ rb_str_buf_cat(ret, matchText+region->end[0], matchLen - region->end[0]);
305
+ replIdx += c_len;
306
+ break;
307
+ case '\\': // literal backslash
308
+ // place single backslash
309
+ rb_str_buf_cat(ret, replacementText+replIdx, c_len);
310
+ replIdx += c_len;
311
+ break;
312
+ case '+': // last matched group
313
+ replIdx += c_len;
314
+ for(groupNum = region->num_regs; groupNum > 0; groupNum --) {
315
+ g_start = region->beg[ groupNum ];
316
+ g_end = region->end[ groupNum ];
317
+ if( g_start != -1 ) {
318
+ rb_str_buf_cat(ret, matchText+g_start, g_end-g_start);
319
+ break;
320
+ }
321
+ }
322
+ break;
323
+
324
+ default:
325
+ rb_str_buf_cat(ret, replacementText+(replIdx-p_len), p_len+c_len);
326
+ replIdx += c_len;
327
+
328
+ }
329
+ } else {
330
+ /* Finally, append the capture group data to the destination. */
331
+ if( groupNum < region->num_regs && region->beg[groupNum] >= 0 && region->end[groupNum]>= region->beg[groupNum] ) {
332
+ rb_str_buf_cat(ret, matchText+region->beg[groupNum], region->end[groupNum]-region->beg[groupNum]);
333
+ }
334
+ }
335
+ }
336
+ return ret;
337
+ }
338
+
339
+ static inline void
340
+ str_mod_check(s, p, len)
341
+ VALUE s;
342
+ char *p;
343
+ long len;
344
+ {
345
+ if (RSTRING(s)->ptr != p || RSTRING(s)->len != len) {
346
+ rb_raise(rb_eRuntimeError, "string modified");
347
+ }
348
+ }
349
+
350
+ static VALUE
351
+ oregexp_gsub(self, argc, argv, bang, once, region)
352
+ VALUE self; // pattern
353
+ int argc; // should be 1 if block given
354
+ VALUE *argv; // either replacement string
355
+ int bang;
356
+ int once;
357
+ OnigRegion *region;
358
+ {
359
+ VALUE repl;
360
+ long beg,
361
+ end,
362
+ prev_end;
363
+ int tainted = 0,
364
+ iter = 0;
365
+
366
+ VALUE buf, curr_repl, block_res;
367
+ ORegexp *oregexp;
368
+
369
+ if (argc == 1 && rb_block_given_p()) {
370
+ iter = 1;
371
+ } else if (argc == 2) {
372
+ repl = argv[1];
373
+ Check_Type(repl, T_STRING);
374
+ if (OBJ_TAINTED(argv[1]))
375
+ tainted = 1;
376
+ } else {
377
+ rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
378
+ }
379
+ Data_Get_Struct( self, ORegexp, oregexp );
380
+
381
+ VALUE string_str = StringValue( argv[0] );
382
+ UChar* str_ptr = RSTRING(string_str)->ptr;
383
+ int str_len = RSTRING(string_str)->len;
384
+
385
+ beg = onig_search(oregexp->reg, str_ptr, str_ptr + str_len, str_ptr, str_ptr + str_len, region, ONIG_OPTION_NONE);
386
+
387
+ if (beg < 0) {
388
+ /* no match */
389
+ if (bang)
390
+ return Qnil;
391
+ return rb_str_dup(string_str);
392
+ }
393
+ end = 0;
394
+ buf = rb_str_buf_new(str_len);
395
+ do {
396
+ prev_end = end;
397
+ beg = region->beg[0];
398
+ end = region->end[0];
399
+ rb_str_buf_cat(buf, str_ptr+prev_end, beg-prev_end);
400
+ if ( iter ) {
401
+ VALUE match_data = oregexp_make_match_data( oregexp, region, string_str );
402
+ rb_backref_set(match_data);
403
+ if( once )
404
+ block_res = rb_yield( match_data );
405
+ else {
406
+ VALUE match_string = rb_str_new( str_ptr+beg, end-beg);
407
+ block_res = rb_yield_values(2, match_string, match_data );
408
+ }
409
+ str_mod_check( string_str, str_ptr, str_len);
410
+ curr_repl = rb_obj_as_string(block_res);
411
+ } else {
412
+ curr_repl = oregexp_get_replacement(self, string_str, repl, region);
413
+ }
414
+ rb_str_append(buf, curr_repl);
415
+ if( once ) break;
416
+ // find next match
417
+ beg=onig_search(oregexp->reg, str_ptr, str_ptr + str_len,
418
+ str_ptr+end, str_ptr + str_len,
419
+ region, ONIG_OPTION_NONE);
420
+ } while ( beg >= 0);
421
+ rb_str_buf_cat( buf, str_ptr+end, str_len - end);
422
+
423
+ if(tainted)
424
+ OBJ_INFECT(buf, repl);
425
+ OBJ_INFECT(buf, string_str);
426
+ if (bang) {
427
+ rb_funcall(string_str, rb_intern("replace"), 1, buf);
428
+ return string_str;
429
+ } else {
430
+ return buf;
431
+ }
432
+ }
433
+
434
+ typedef struct gsub_packet_t {
435
+ VALUE self; // pattern
436
+ int argc; // should be 1 if block given
437
+ VALUE *argv; // either replacement string
438
+ int bang;
439
+ int once;
440
+ OnigRegion *region;
441
+ } gsub_packet;
442
+ static VALUE oregexp_packed_gsub( gsub_packet* args ) {
443
+ return oregexp_gsub(args->self, args->argc, args->argv, args->bang, args->once, args->region);
444
+ }
445
+ void oregexp_cleanup_region(OnigRegion * region){
446
+ onig_region_free(region, 1);
447
+ }
448
+ static VALUE oregexp_safe_gsub(self, argc, argv, bang, once)
449
+ VALUE self; // pattern
450
+ int argc; // should be 1 if block given
451
+ VALUE *argv; // either replacement string
452
+ int bang;
453
+ int once;
454
+ {
455
+ OnigRegion * region = onig_region_new();
456
+ gsub_packet call_args = {self, argc, argv, bang, once, region};
457
+ return rb_ensure( oregexp_packed_gsub, (VALUE)&call_args, oregexp_cleanup_region, (VALUE)region);
458
+ }
459
+ static VALUE oregexp_m_gsub(int argc, VALUE *argv, VALUE self) {
460
+ return oregexp_safe_gsub(self, argc, argv, 0, 0);
461
+ }
462
+ static VALUE oregexp_m_sub(int argc, VALUE *argv, VALUE self) {
463
+ return oregexp_safe_gsub(self, argc, argv, 0, 1);
464
+ }
465
+
466
+ static VALUE oregexp_m_gsub_bang(int argc, VALUE *argv, VALUE self) {
467
+ return oregexp_safe_gsub(self, argc, argv, 1, 0);
468
+ }
469
+ static VALUE oregexp_m_sub_bang(int argc, VALUE *argv, VALUE self) {
470
+ return oregexp_safe_gsub(self, argc, argv, 1, 1);
471
+ }
472
+
187
473
  void Init_oregexp() {
188
474
  mOniguruma = rb_define_module("Oniguruma");
189
475
  VALUE cORegexp = rb_define_class_under(mOniguruma, "ORegexp", rb_cObject);
190
476
  rb_define_alloc_func(cORegexp, oregexp_allocate);
191
477
  rb_define_method( cORegexp, "initialize", oregexp_initialize, 2 );
192
478
  rb_define_method( cORegexp, "match", oregexp_match, 1 );
193
-
479
+ rb_define_method( cORegexp, "gsub", oregexp_m_gsub, -1 );
480
+ rb_define_method( cORegexp, "sub", oregexp_m_sub, -1 );
481
+ rb_define_method( cORegexp, "gsub!", oregexp_m_gsub_bang, -1 );
482
+ rb_define_method( cORegexp, "sub!", oregexp_m_sub_bang, -1 );
194
483
  }
data/lib/oniguruma.rb CHANGED
@@ -254,7 +254,7 @@ module Oniguruma
254
254
  return nil unless string
255
255
  m = match( string )
256
256
  return nil unless m
257
- m.begin
257
+ m.begin(0)
258
258
  end
259
259
 
260
260
  # call-seq:
@@ -289,7 +289,7 @@ module Oniguruma
289
289
  matches << m
290
290
  positions << position
291
291
  tmp_string = m.post_match
292
- position += m.end
292
+ position += m.end(0)
293
293
  #if m.end == m.begin
294
294
  # tmp_string = tmp_string[1..-1]
295
295
  # position += 1
@@ -304,51 +304,6 @@ module Oniguruma
304
304
  nil
305
305
  end
306
306
  end
307
-
308
- def sub string, replacement = nil
309
- matches = match( string )
310
- if matches
311
- replacement = yield matches[0] unless replacement
312
- string.sub( matches[0], replacement )
313
- else
314
- return string
315
- end
316
- end
317
-
318
- def gsub string, replacement = nil
319
- result = string
320
- matches = match_all( string )
321
- string_replace = replacement
322
- if matches
323
- matches.each do |m, p|
324
- replacement = yield( m[0], m ) unless string_replace
325
- result = result.sub( m[0], replacement )
326
- end
327
- end
328
- result
329
- end
330
-
331
- def sub! string, replacement = nil
332
- matches = match( string )
333
- if matches
334
- replacement = yield matches[0] unless replacement
335
- string.sub!( matches[0], replacement )
336
- else
337
- return string
338
- end
339
- end
340
-
341
- def gsub! string, replacement = nil
342
- matches = match_all( string )
343
- string_replace = replacement
344
- if matches
345
- matches.each do |m, p|
346
- replacement = yield( m[0], m ) unless string_replace
347
- string.sub!( m[0], replacement )
348
- end
349
- end
350
- string
351
- end
352
307
  end
353
308
 
354
309
  class MultiMatchData
@@ -371,11 +326,11 @@ module Oniguruma
371
326
  end
372
327
 
373
328
  def begin index
374
- @matches[index].begin + @positions[index]
329
+ @matches[index].begin(0) + @positions[index]
375
330
  end
376
331
 
377
332
  def end index
378
- @matches[index].end + @positions[index]
333
+ @matches[index].end(0) + @positions[index]
379
334
  end
380
335
 
381
336
  def length
@@ -402,90 +357,15 @@ module Oniguruma
402
357
  end
403
358
  end
404
359
 
405
- class MatchData
406
- def initialize( string, starts, ends, names )
407
- @string = string
408
- @starts = starts
409
- @ends = ends
410
- @matches = []
411
- @starts.size.times do |i|
412
- @matches << @string[@starts[i]...@ends[i]]
413
- end
414
- @match_count = @matches.size
415
- @start_pos = 0
416
- @names = names
417
- end
418
-
419
- def [] ( value1, value2 = nil )
420
- unless value2
421
- if index = to_index( value1 )
422
- @matches[index]
423
- else
424
- nil
425
- end
426
- else
427
- @matches[value1, value2]
428
- end
429
- end
430
-
431
- def to_index name
432
- if name.is_a? Symbol
433
- @names[name]
434
- else
435
- name
436
- end
437
- end
438
-
439
- def begin index = 0
440
- @starts[to_index( index )]
441
- end
442
-
443
- def end index = 0
444
- @ends[to_index( index )]
445
- end
446
-
447
- def captures
448
- @matches[1..-1]
449
- end
450
-
451
- def length
452
- @match_count
453
- end
454
- alias size length
455
-
456
- def offset index = 0
457
- [@starts[to_index( index )], @ends[to_index( index )]]
458
- end
459
-
460
- def post_match
461
- @string[@ends[0], @string.length]
462
- end
463
-
464
- def pre_match
465
- @string[0, @starts[0]]
466
- end
467
-
468
- def select &block
469
- @matches.select( &block )
470
- end
471
-
472
- def string
473
- @string.freeze
474
- end
475
-
476
- def to_a
477
- @matches
478
- end
479
-
480
- def to_s
481
- @matches[0]
482
- end
483
-
484
- def values_at *values
485
- result = []
486
- values.each { |v| result << @matches[v] }
487
- result
488
- end
489
- end
490
360
  end
491
-
361
+ class ::MatchData
362
+ alias old_aref :[]
363
+ def [](*idx)
364
+ if idx[0].is_a?(Symbol)
365
+ k = @named_captures && @named_captures[idx[0]]
366
+ k && old_aref(k)
367
+ else
368
+ old_aref(*idx)
369
+ end
370
+ end
371
+ end
@@ -210,5 +210,68 @@ class MatchDataTestCase < Test::Unit::TestCase
210
210
  assert_equal( ')', matches[:end] )
211
211
  assert_equal( nil, matches[:inexistent])
212
212
  end
213
+
214
+ def test_utf8_ignore_case
215
+ reg = Oniguruma::ORegexp.new( '([а-я])+', :options => Oniguruma::OPTION_IGNORECASE, :encoding => Oniguruma::ENCODING_UTF8 )
216
+ matches = reg.match("Text: Ехал Грека Через Реку")
217
+ assert_not_nil( matches )
218
+ assert_equal("Ехал", matches[0])
219
+ reg = Oniguruma::ORegexp.new( 'р(уби.*)', :options => Oniguruma::OPTION_IGNORECASE, :encoding => Oniguruma::ENCODING_UTF8 )
220
+ assert_equal("*убил бы*", reg.gsub("Руби", '*\1л бы*') )
221
+ end
222
+
223
+ def test_utf8_gsub
224
+ reg = Oniguruma::ORegexp.new( '([а-я])([а-я])([а-я]+)', :options => Oniguruma::OPTION_IGNORECASE, :encoding => Oniguruma::ENCODING_UTF8 )
225
+ new_str = reg.gsub("Text: Ехал Грека Через Реку") {|s,m| m[1]*2+m[2]*2+m[3] }
226
+ assert_equal("Text: ЕЕххал ГГррека ЧЧеерез РРееку", new_str)
227
+ end
228
+
229
+ def test_utf8_gsub2
230
+ reg = Oniguruma::ORegexp.new( '[а-я]', :options => Oniguruma::OPTION_IGNORECASE, :encoding => Oniguruma::ENCODING_UTF8 )
231
+ new_str = reg.gsub("Text: Ехал Грека Через Реку") {|s,m| s*2 }
232
+ assert_equal("Text: ЕЕххаалл ГГррееккаа ЧЧеерреезз РРееккуу", new_str)
233
+ end
213
234
 
235
+ def test_sub_compatibility
236
+ $x = "a.gif"
237
+ assert_equal("b.gif", $x.osub('.*\.([^\.]+)$', 'b.\1'))
238
+ assert_equal("\\.gif", $x.osub('.*\.([^\.]+)$', '\\.\1'))
239
+ assert_equal("gif", $x.osub('.*\.([^\.]+)$', '\1'))
240
+ assert_equal("", $x.osub('.*\.([^\.]+)$', '\2'))
241
+ assert_equal("ab", $x.osub('.*\.([^\.]+)$', 'a\2b'))
242
+ assert_equal("<a.gif>", $x.osub('.*\.([^\.]+)$', '<\&>'))
243
+ assert_equal("a.a.", $x.osub('(gif)', '\`') )
244
+ end
245
+
246
+ class ::String
247
+ def ogsub(*args)
248
+ Oniguruma::ORegexp.new(args.shift).gsub(self, *args)
249
+ end
250
+ def ogsub!(*args)
251
+ Oniguruma::ORegexp.new(args.shift).gsub!(self, *args)
252
+ end
253
+ def osub(re, *args)
254
+ Oniguruma::ORegexp.new( re ).sub(self, *args)
255
+ end
256
+ end
257
+
258
+ def test_gsub_compat
259
+ assert_equal("hello".ogsub('[aeiou]', '*') , "h*ll*")
260
+ assert_equal("hello".ogsub('([aeiou])', '<\1>') , "h<e>ll<o>")
261
+ i = 0
262
+ assert_equal("12345" , Oniguruma::ORegexp.new('.').gsub("hello") {|s,m| i+=1; i.to_s})
263
+ assert_equal("214365", Oniguruma::ORegexp.new('(.)(.)').gsub("123456") {|s,m| m[2] + m[1] })
264
+ a = "test"
265
+ a.ogsub!('t', a)
266
+ assert_equal("testestest", a)
267
+ end
268
+
269
+ def test_match_compat
270
+ t = Oniguruma::ORegexp.new('(.)(.)').gsub("123456") {|s,m| "#$2#$1" }
271
+ assert_equal("214365", t )
272
+ t = Oniguruma::ORegexp.new('([aeiou])').gsub("hello") {|s,m| "<#$1>" }
273
+ assert_equal( "h<e>ll<o>", t)
274
+ end
275
+
276
+
214
277
  end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
3
3
  specification_version: 1
4
4
  name: oniguruma
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.9.0
7
- date: 2007-03-22 00:00:00 +01:00
6
+ version: 0.9.1
7
+ date: 2007-03-25 00:00:00 +01:00
8
8
  summary: Bindings for the oniguruma regular expression library
9
9
  require_paths:
10
10
  - lib
@@ -33,6 +33,7 @@ files:
33
33
  - History.txt
34
34
  - Manifest.txt
35
35
  - README.txt
36
+ - Syntax.txt
36
37
  - Rakefile
37
38
  - lib/oniguruma.rb
38
39
  - ext/oregexp.c