oniguruma 0.9.0 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,4 +1,10 @@
1
- == 1.0.0 / 2007-03-19
1
+ == 0.9.1 / 2007-03-25
2
+ * FIX: Buggy resolution of numeric codes for encoding and syntax options (Nikolai Lugovoi)
3
+ * FIX: Buggy implementation of ORegexp#sub and ORegexp#gsub methods. The code is now all C (Nikolai Lugovoi)
4
+ * Added documentation for class ORegexp
5
+ * Added regexp syntax documentation.
6
+
7
+ == 0.9.0 / 2007-03-19
2
8
 
3
9
  * 1 major enhancement
4
10
  * Birthday!
data/Manifest.txt CHANGED
@@ -1,6 +1,7 @@
1
1
  History.txt
2
2
  Manifest.txt
3
3
  README.txt
4
+ Syntax.txt
4
5
  Rakefile
5
6
  lib/oniguruma.rb
6
7
  ext/oregexp.c
data/README.txt CHANGED
@@ -16,6 +16,10 @@ Ruby bindings to the Oniguruma[http://www.geocities.jp/kosako3/oniguruma/] regul
16
16
  puts match[0] <= 'terraforming'
17
17
  puts match[:before] <= 'terr'
18
18
  puts match[:after] <= 'forming'
19
+
20
+ == SYNTAX
21
+
22
+ Consult the Syntax.txt[link:files/Syntax_txt.html] page.
19
23
 
20
24
  == REQUIREMENTS:
21
25
 
@@ -39,8 +43,9 @@ sudo gem install -r oniguruma
39
43
 
40
44
  == CREDITS:
41
45
 
42
- * K.Kosako, for his great library.
43
- * A lot of the documentation has been copied from the orininal Ruby Regex documentation.
46
+ * N. Lugovoi. ORegexp.sub and ORegexp.gsub code, plus other patches.
47
+ * K. Kosako. For his great library.
48
+ * A lot of the documentation has been copied from the original Ruby Regex documentation.
44
49
 
45
50
  == LICENSE:
46
51
 
data/Rakefile CHANGED
@@ -3,7 +3,7 @@ require 'hoe'
3
3
 
4
4
  class Hoe; def extra_deps; @extra_deps.reject { |x| Array(x).first == 'hoe' }; end end
5
5
 
6
- Hoe.new('oniguruma', '0.9.0') do |p|
6
+ Hoe.new('oniguruma', '0.9.1') do |p|
7
7
  p.rubyforge_name = 'oniguruma'
8
8
  p.author = 'Dizan Vasquez'
9
9
  p.email = 'dix_ans@yahoo.com'
data/Syntax.txt ADDED
@@ -0,0 +1,396 @@
1
+ = RUBY REGULAR EXPRESSION SYNTAX
2
+
3
+
4
+ == Syntax Elements
5
+
6
+ [\] escape (enable or disable meta character meaning)
7
+ [|] alternation
8
+ [(...)] group
9
+ [[...]] character class
10
+
11
+
12
+ == Characters
13
+
14
+ [\t] horizontal tab (0x09)
15
+ [\v] vertical tab (0x0B)
16
+ [\n] newline (0x0A)
17
+ [\r] return (0x0D)
18
+ [\b] back space (0x08)
19
+
20
+ \b is effective in character class [...] only
21
+ [\f] form feed (0x0C)
22
+ [\a] bell (0x07)
23
+ [\e] escape (0x1B)
24
+ [\nnn] octal char (encoded byte value)
25
+ [\xHH] hexadecimal char (encoded byte value)
26
+ [\x{7HHHHHHH}] wide hexadecimal char (character code point value)
27
+ [\cx] control char (character code point value)
28
+ [\C-x] control char (character code point value)
29
+ [\M-x] meta (x|0x80) (character code point value)
30
+ [\M-\C-x] meta control char (character code point value)
31
+
32
+
33
+
34
+ == Character types
35
+
36
+ [.] any character (except newline)
37
+ [\w] word character
38
+
39
+ Not Unicode:
40
+ * alphanumeric, "_" and multibyte char.
41
+ Unicode:
42
+ * General_Category -- (Letter|Mark|Number|Connector_Punctuation)
43
+ [\W] non word char
44
+ [\s] whitespace char
45
+
46
+ Not Unicode:
47
+ * \t, \n, \v, \f, \r, \x20
48
+ Unicode:
49
+ * 0009, 000A, 000B, 000C, 000D, 0085(NEL),
50
+ * General_Category:
51
+ * -- Line_Separator
52
+ * -- Paragraph_Separator
53
+ * -- Space_Separator
54
+ [\S] non whitespace char
55
+ [\d] decimal digit char
56
+
57
+ Unicode: General_Category -- Decimal_Number
58
+ [\D] non decimal digit char
59
+ [\h] hexadecimal digit char [0-9a-fA-F]
60
+ [\H] non hexadecimal digit char
61
+
62
+
63
+ == Character Properties
64
+
65
+ \p{property-name}
66
+ \p{^property-name} (negative)
67
+ \P{property-name} (negative)
68
+
69
+ === property-name:
70
+
71
+ Works on all encodings:
72
+ * Alnum, Alpha, Blank, Cntrl, Digit, Graph, Lower,
73
+ Print, Punct, Space, Upper, XDigit, Word, ASCII,
74
+ Works on EUC_JP, Shift_JIS:
75
+ * Hiragana, Katakana
76
+ Works on UTF8, UTF16, UTF32:
77
+ * Any, Assigned, C, Cc, Cf, Cn, Co, Cs, L, Ll, Lm, Lo, Lt, Lu,
78
+ M, Mc, Me, Mn, N, Nd, Nl, No, P, Pc, Pd, Pe, Pf, Pi, Po, Ps,
79
+ S, Sc, Sk, Sm, So, Z, Zl, Zp, Zs,
80
+ Arabic, Armenian, Bengali, Bopomofo, Braille, Buginese,
81
+ Buhid, Canadian_Aboriginal, Cherokee, Common, Coptic,
82
+ Cypriot, Cyrillic, Deseret, Devanagari, Ethiopic, Georgian,
83
+ Glagolitic, Gothic, Greek, Gujarati, Gurmukhi, Han, Hangul,
84
+ Hanunoo, Hebrew, Hiragana, Inherited, Kannada, Katakana,
85
+ Kharoshthi, Khmer, Lao, Latin, Limbu, Linear_B, Malayalam,
86
+ Mongolian, Myanmar, New_Tai_Lue, Ogham, Old_Italic, Old_Persian,
87
+ Oriya, Osmanya, Runic, Shavian, Sinhala, Syloti_Nagri, Syriac,
88
+ Tagalog, Tagbanwa, Tai_Le, Tamil, Telugu, Thaana, Thai, Tibetan,
89
+ Tifinagh, Ugaritic, Yi
90
+
91
+ == Quantifiers
92
+
93
+ === Greedy
94
+
95
+ [?] 1 or 0 times
96
+ [*] 0 or more times
97
+ [+] 1 or more times
98
+ [{n,m}] at least n but not more than m times
99
+ [{n,}] at least n times
100
+ [{,n}] at least 0 but not more than n times ({0,n})
101
+ [{n}] n times
102
+
103
+ === Reluctant
104
+
105
+ [??] 1 or 0 times
106
+ [*?] 0 or more times
107
+ [+?] 1 or more times
108
+ [{n,m}?] at least n but not more than m times
109
+ [{n,}?] at least n times
110
+ [{,n}?] at least 0 but not more than n times (== {0,n}?)
111
+
112
+ === Possessive (greedy and does not backtrack after repeated)
113
+
114
+ [?+] 1 or 0 times
115
+ [*+] 0 or more times
116
+ [++] 1 or more times
117
+
118
+ ({n,m}+, {n,}+, {n}+ are possessive op. in ONIG_SYNTAX_JAVA only)
119
+
120
+
121
+ == Anchors
122
+
123
+ [^] beginning of the line
124
+ [$] end of the line
125
+ [\b] word boundary
126
+ [\B] not word boundary
127
+ [\A] beginning of string
128
+ [\Z] end of string, or before newline at the end
129
+ [\z] end of string
130
+ [\G] matching start position
131
+
132
+
133
+ == Character class
134
+
135
+ [^...] negative class (lowest precedence operator)
136
+ [x-y] range from x to y
137
+ [[...]] set (character class in character class)
138
+ [..&&..] intersection (low precedence at the next of ^)
139
+
140
+ If you want to use '[', '-', ']' as a normal character
141
+ in a character class, you should escape these characters by '\'.
142
+
143
+
144
+ POSIX bracket ([:xxxxx:], negate [:^xxxxx:])
145
+
146
+ === Not Unicode Case:
147
+
148
+ [alnum] alphabet or digit char
149
+ [alpha] alphabet
150
+ [ascii] code value: [0 - 127]
151
+ [blank] \t, \x20
152
+ [cntrl] control
153
+ [digit] 0-9
154
+ [graph] include all of multibyte encoded characters
155
+ [lower] lower case
156
+ [print] include all of multibyte encoded characters
157
+ [punct] punctuation
158
+ [space] \t, \n, \v, \f, \r, \x20
159
+ [upper] upper case
160
+ [xdigit] 0-9, a-f, A-F
161
+ [word] alphanumeric, "_" and multibyte characters
162
+
163
+
164
+ === Unicode Case:
165
+
166
+ [alnum] Letter | Mark | Decimal_Number
167
+ [alpha] Letter | Mark
168
+ [ascii] 0000 - 007F
169
+ [blank] Space_Separator | 0009
170
+ [cntrl] Control | Format | Unassigned | Private_Use | Surrogate
171
+ [digit] Decimal_Number
172
+ [graph] [[:^space:]] && ^Control && ^Unassigned && ^Surrogate
173
+ [lower] Lowercase_Letter
174
+ [print] [[:graph:]] | [[:space:]]
175
+ [punct] Connector_Punctuation | Dash_Punctuation | Close_Punctuation |
176
+ Final_Punctuation | Initial_Punctuation | Other_Punctuation |
177
+ Open_Punctuation
178
+ [space] Space_Separator | Line_Separator | Paragraph_Separator |
179
+ 0009 | 000A | 000B | 000C | 000D | 0085
180
+ [upper] Uppercase_Letter
181
+ [xdigit] 0030 - 0039 | 0041 - 0046 | 0061 - 0066
182
+ (0-9, a-f, A-F)
183
+ [word] Letter | Mark | Decimal_Number | Connector_Punctuation
184
+
185
+
186
+
187
+ == Extended groups
188
+
189
+ [(?#...)] comment
190
+ [(?imx-imx)] option on/off:
191
+ * i: ignore case
192
+ * m: multi-line (dot(.) match newline)
193
+ * x: extended form
194
+ [(?imx-imx:subexp)] option on/off for subexp
195
+ [(?:subexp)] not captured group
196
+ [(subexp)] captured group
197
+ [(?=subexp)] look-ahead
198
+ [(?!subexp)] negative look-ahead
199
+ [(?<=subexp)] look-behind
200
+ [(?<!subexp)] negative look-behind
201
+
202
+ Subexp of look-behind must be fixed character length.
203
+ But different character length is allowed in top level
204
+ alternatives only.
205
+ ex. (?<=a|bc) is OK. (?<=aaa(?:b|cd)) is not allowed.
206
+
207
+ In negative-look-behind, captured group isn't allowed,
208
+ but shy group(?:) is allowed.
209
+ [(?>subexp)] atomic group
210
+ don't backtrack in subexp.
211
+ [(?<name>subexp)] define named group
212
+ (All characters of the name must be a word character.)
213
+
214
+ Not only a name but a number is assigned like a captured
215
+ group.
216
+
217
+ Assigning the same name as two or more subexps is allowed.
218
+ In this case, a subexp call can not be performed although
219
+ the back reference is possible.
220
+
221
+
222
+ == Back reference
223
+
224
+ [\n] back reference by group number (n >= 1)
225
+ [\k<name>] back reference by group name
226
+ In the back reference by the multiplex definition name,
227
+ a subexp with a large number is referred to preferentially.
228
+ (When not matched, a group of the small number is referred to.)
229
+
230
+ * Back reference by group number is forbidden if named group is defined
231
+ in the pattern and ONIG_OPTION_CAPTURE_GROUP is not set.
232
+
233
+
234
+ === Back reference with nest level
235
+
236
+ [\k<name+n>] n: 0, 1, 2, ...
237
+ [\k<name-n>] n: 0, 1, 2, ...
238
+
239
+ Specifies the nest level relative to the back reference position.
240
+
241
+ Examples:
242
+ /\A(?<a>|.|(?:(?<b>.)\g<a>\k<b+0>))\z/.match("reer")
243
+
244
+ r = ORegexp.compile(<<'__REGEXP__'.strip, :options => Oniguruma::EXTENDED)
245
+ (?<element> \g<stag> \g<content>* \g<etag> ){0}
246
+ (?<stag> < \g<name> \s* > ){0}
247
+ (?<name> [a-zA-Z_:]+ ){0}
248
+ (?<content> [^<&]+ (\g<element> | [^<&]+)* ){0}
249
+ (?<etag> </ \k<name+1> >){0}
250
+ \g<element>
251
+ __REGEXP__
252
+
253
+ p r.match('<foo>f<bar>bbb</bar>f</foo>').captures
254
+
255
+
256
+
257
+ === Subexp call ("Tanaka Akira special")
258
+
259
+ [\g<name>] call by group name
260
+ [\g<n>] call by group number (n >= 1)
261
+
262
+ * left-most recursive call is not allowed.
263
+
264
+ Example:
265
+ (?<name>a|\g<name>b) => error
266
+ (?<name>a|b\g<name>c) => OK
267
+ * Call by group number is forbidden if named group is defined in the pattern
268
+ and Oniguruma::OPTION_CAPTURE_GROUP is not set.
269
+ * If the option status of called group is different from calling position
270
+ then the group's option is effective.
271
+
272
+ Example:
273
+ (?-i:\g<name>)(?i:(?<name>a)){0} <i>matches "A"</i>
274
+
275
+
276
+ == Captured group
277
+
278
+ Behavior of the no-named group (...) changes with the following conditions.
279
+ (But named group is not changed.)
280
+
281
+ [case 1] <code>ORegexp.new( '...' )</code> (named group is not used, no option)
282
+
283
+ (...) is treated as a captured group.
284
+ [case 2] <code>ORegexp.new( '...', :options => OPTION_DONT_CAPTURE_GROUP )</code> (named group is not used, 'g' option)
285
+
286
+ (...) is treated as a no-captured group (?:...).
287
+
288
+ [case 3] <code>ORegexp.new( '...(?<name>...)...' )</code> (named group is used, no option)
289
+
290
+ (...) is treated as a no-captured group (?:...).
291
+
292
+ numbered-backref/call is not allowed.
293
+
294
+ [case 4] <code>ORegexp.new( '...(?<name>...)...', :options => OPTION_CAPTURE_GROUP )</code> (named group is used, 'G' option)
295
+
296
+ (...) is treated as a captured group.
297
+
298
+ numbered-backref/call is allowed.
299
+
300
+ where
301
+ * g: OPTION_DONT_CAPTURE_GROUP
302
+ * G: OPTION_CAPTURE_GROUP
303
+
304
+ ('g' and 'G' options are discussed in the ruby-dev ML)
305
+
306
+
307
+ == Syntax dependent options
308
+
309
+ === ONIG_SYNTAX_RUBY
310
+
311
+ [(?m)] dot(.) match newline
312
+
313
+ === ONIG_SYNTAX_PERL and ONIG_SYNTAX_JAVA
314
+
315
+ [(?s)] dot(.) match newline
316
+ [(?m)] ^ match after newline, $ match before newline
317
+
318
+ == Original extensions
319
+
320
+ * hexadecimal digit char type \h, \H
321
+ * named group (?<name>...)
322
+ * named backref \k<name>
323
+ * subexp call \g<name>, \g<group-num>
324
+
325
+
326
+ == Lacking features compared with Perl 5.8.0
327
+
328
+ * \N{name}
329
+ * \l,\u,\L,\U, \X, \C
330
+ * (?{code})
331
+ * (??{code})
332
+ * (?(condition)yes-pat|no-pat)
333
+ * \Q...\E
334
+
335
+ (\Q...\E is effective on ONIG_SYNTAX_PERL and ONIG_SYNTAX_JAVA.)
336
+
337
+
338
+ == Differences with Japanized GNU regex(version 0.12) of Ruby 1.8
339
+
340
+ * add character property (\p{property}, \P{property})
341
+ * add hexadecimal digit char type (\h, \H)
342
+ * add look-behind
343
+
344
+ (?<=fixed-char-length-pattern), (?<!fixed-char-length-pattern)
345
+ * add possessive quantifier. ?+, *+, ++
346
+ * add operations in character class. [], &&
347
+
348
+ ('[' must be escaped to be used as a usual char in a character class.)
349
+ * add named group and subexp call.
350
+ * octal or hexadecimal number sequence can be treated as
351
+ a multibyte code char in character class if multibyte encoding
352
+ is specified.
353
+
354
+ (ex. <code>[\xa1\xa2], [\xa1\xa7-\xa4\xa1]</code>)
355
+ * allow the range of single byte char and multibyte char in character
356
+ class.
357
+
358
+ ex. <code>[a-<<any EUC-JP character>>]</code> in EUC-JP encoding.
359
+ * effect range of isolated option is to next ')'.
360
+ ex. (?:(?i)a|b) is interpreted as (?:(?i:a|b)), not (?:(?i:a)|b).
361
+ * isolated option is not transparent to previous pattern.
362
+ ex. <code>a(?i)*</code> is a syntax error pattern.
363
+ * an incomplete left brace is allowed as a usual string.
364
+ ex. /{/, /({)/, /a{2,3/ etc...
365
+ * negative POSIX bracket [:^xxxx:] is supported.
366
+ * POSIX bracket [:ascii:] is added.
367
+ * repeat of look-ahead is not allowed.
368
+ ex. <code>(?=a)*</code>, <code>(?!b){5}</code>
369
+ * Ignore case option is effective to numbered character.
370
+ ex. <code>/\x61/i =~ "A"</code>
371
+ * In the range quantifier, the minimum may be omitted.
372
+
373
+ <code>/a{,n}/ == /a{0,n}/</code>
374
+
375
+ The simultaneous abbreviation of the number of times of the minimum
376
+ and the maximum is not allowed. (/a{,}/)
377
+ * <code>a{n}?</code> is not a non-greedy operator.
378
+ <code>/a{n}?/ == /(?:a{n})?/</code>
379
+ * invalid back reference is checked and causes an error.
380
+ /\1/, /(a)\2/
381
+ * Zero-length match in infinite repeat stops the repeat,
382
+ then changes of the capture group status are checked as stop condition.
383
+ /(?:()|())*\1\2/ =~ ""
384
+ /(?:\1a|())*/ =~ "a"
385
+
386
+
387
+ == Problems
388
+
389
+ * Invalid encoding byte sequence is not checked in UTF-8.
390
+
391
+ * Invalid first byte is treated as a character.
392
+ /./u =~ "\xa3"
393
+
394
+ * Incomplete byte sequence is not checked.
395
+ /\w+/ =~ "a\xf3\x8ec"
396
+
data/ext/extconf.rb CHANGED
@@ -1,3 +1,4 @@
1
1
  require 'mkmf'
2
2
  have_library("onig")
3
+ $CFLAGS='-Wall'
3
4
  create_makefile( "oregexp" )
data/ext/oregexp.c CHANGED
@@ -24,7 +24,10 @@ static VALUE oregexp_allocate( VALUE klass ) {
24
24
  }
25
25
 
26
26
 
27
- static OnigEncodingType * int2encoding( int index ) {
27
+ static OnigEncodingType * int2encoding( VALUE v_index ) {
28
+ int index;
29
+ if( ! NIL_P(v_index) ) {
30
+ index = FIX2INT(v_index);
28
31
  switch( index ) {
29
32
  case 0: return ONIG_ENCODING_ASCII;
30
33
  case 1: return ONIG_ENCODING_ISO_8859_1;
@@ -60,10 +63,14 @@ static OnigEncodingType * int2encoding( int index ) {
60
63
  case 31: return ONIG_ENCODING_GB18030;
61
64
  case 32: return ONIG_ENCODING_UNDEF;
62
65
  }
66
+ }
63
67
  return ONIG_ENCODING_UNDEF;
64
68
  }
65
69
 
66
- static OnigSyntaxType * int2syntax( int index ) {
70
+ static OnigSyntaxType * int2syntax( VALUE v_index ) {
71
+ int index;
72
+ if( ! NIL_P(v_index) ) {
73
+ index = FIX2INT(v_index);
67
74
  switch( index ) {
68
75
  case 0: return ONIG_SYNTAX_ASIS;
69
76
  case 1: return ONIG_SYNTAX_POSIX_BASIC;
@@ -77,25 +84,32 @@ static OnigSyntaxType * int2syntax( int index ) {
77
84
  case 9: return ONIG_SYNTAX_RUBY;
78
85
  case 10: return ONIG_SYNTAX_DEFAULT;
79
86
  }
87
+ }
80
88
  return ONIG_SYNTAX_DEFAULT;
81
89
  }
82
90
 
91
+ struct callback_packet {
92
+ VALUE hash;
93
+ OnigRegion * region;
94
+ };
95
+
83
96
  static int name_callback(
84
97
  const UChar* name,
85
98
  const UChar* name_end,
86
99
  int ngroup_num,
87
100
  int* group_nums,
88
101
  regex_t* reg,
89
- void* arg
102
+ struct callback_packet* arg
90
103
  ) {
91
104
  int i, gn, ref;
92
- OnigRegion *region = (OnigRegion* )arg;
105
+ OnigRegion *region = arg->region;
106
+ VALUE nameHash = arg->hash;
93
107
 
94
108
  for (i = 0; i < ngroup_num; i++) {
95
109
  gn = group_nums[i];
96
110
  ref = onig_name_to_backref_number(reg, name, name_end, region);
97
111
  if (ref != gn )
98
- rb_raise(rb_eException, "Oniguruma Error: group and backreference names are different");
112
+ return 1;
99
113
  rb_hash_aset( nameHash, ID2SYM(rb_intern(name)), INT2FIX( gn ) );
100
114
  }
101
115
  return 0;
@@ -110,13 +124,16 @@ static VALUE oregexp_initialize( VALUE self, VALUE pattern, VALUE options ) {
110
124
  rb_iv_set( self, "@options", options );
111
125
  UChar* pat_ptr = RSTRING(pattern_str)->ptr;
112
126
  int pat_len = RSTRING(pattern_str)->len;
127
+ if( pat_len == 0 ) {
128
+ rb_raise(rb_eArgError, "Empty pattern makes no sense.");
129
+ }
113
130
 
114
131
  VALUE rOptions = rb_hash_aref( options, ID2SYM( rb_intern( "options" ) ) );
115
132
  VALUE rEncoding = rb_hash_aref( options, ID2SYM( rb_intern( "encoding" ) ) );
116
133
  VALUE rSyntax = rb_hash_aref( options, ID2SYM( rb_intern( "syntax" ) ) );
117
134
  int iOptions = NUM2INT( rOptions );
118
- int iEncoding = int2encoding( rEncoding );
119
- int iSyntax = int2syntax( rSyntax );
135
+ OnigEncodingType * iEncoding = int2encoding( rEncoding );
136
+ OnigSyntaxType * iSyntax = int2syntax( rSyntax );
120
137
 
121
138
 
122
139
  int r;
@@ -130,6 +147,40 @@ static VALUE oregexp_initialize( VALUE self, VALUE pattern, VALUE options ) {
130
147
  return self;
131
148
  }
132
149
 
150
+ struct RMatch {
151
+ struct RBasic basic;
152
+ VALUE str;
153
+ struct re_registers *regs;
154
+ };
155
+
156
+ static VALUE oregexp_make_match_data(ORegexp * oregexp, OnigRegion * region, VALUE string_str) {
157
+ VALUE rb_cMatch = rb_const_get(rb_cObject, rb_intern("MatchData")) ;
158
+ NEWOBJ(match, struct RMatch);
159
+ OBJSETUP(match, rb_cMatch, T_MATCH);
160
+ VALUE kORegexp = rb_const_get( mOniguruma, rb_intern( "ORegexp" ) ) ;
161
+ int i , count = region->num_regs;
162
+ struct callback_packet packet;
163
+
164
+ match->str = rb_str_new4(string_str);
165
+ match->regs = ALLOC(struct re_registers);
166
+ match->regs->allocated = count+1;
167
+ match->regs->num_regs = count;
168
+ match->regs->beg = ALLOC_N(int, (count+1));
169
+ match->regs->end = ALLOC_N(int, (count+1));
170
+
171
+ for ( i = 0; i <= count; i++){
172
+ match->regs->beg[i] = region->beg[i];
173
+ match->regs->end[i] = region->end[i];
174
+ }
175
+ rb_cv_set( kORegexp, "@@last_match", (VALUE)match );
176
+ packet.region = region;
177
+ packet.hash = rb_hash_new();
178
+ if( onig_foreach_name(oregexp->reg, name_callback, &packet) )
179
+ rb_raise(rb_eException, "Oniguruma Error: group and backreference names are different");
180
+ rb_iv_set((VALUE)match, "@named_captures", packet.hash);
181
+ return (VALUE)match;
182
+ }
183
+
133
184
  /*
134
185
  * call-seq:
135
186
  * rxp.match(str) => matchdata or nil
@@ -151,25 +202,7 @@ static VALUE oregexp_match( VALUE self, VALUE string ) {
151
202
  OnigRegion *region = onig_region_new();
152
203
  int r = onig_search(oregexp->reg, str_ptr, str_ptr + str_len, str_ptr, str_ptr + str_len, region, ONIG_OPTION_NONE);
153
204
  if (r >= 0) {
154
-
155
- VALUE begins = rb_ary_new();
156
- VALUE ends = rb_ary_new();
157
- nameHash = rb_hash_new();
158
-
159
- onig_foreach_name(oregexp->reg, name_callback, (void* )region);
160
-
161
-
162
- int i;
163
-
164
- for (i = 0; i < region->num_regs; i++) {
165
- rb_ary_push( begins, INT2FIX( region->beg[i] ) );
166
- rb_ary_push( ends, INT2FIX( region->end[i] ) );
167
- }
168
- VALUE kMatchData = rb_const_get( mOniguruma, rb_intern( "MatchData" ) );
169
- VALUE kORegexp = rb_const_get( mOniguruma, rb_intern( "ORegexp" ) );
170
- VALUE matchData = rb_funcall(kMatchData, rb_intern("new"), 4, string_str, begins, ends, nameHash );
171
- rb_cv_set( kORegexp, "@@last_match", matchData );
172
-
205
+ VALUE matchData = oregexp_make_match_data( oregexp, region, string_str);
173
206
  onig_region_free(region, 1 );
174
207
  return matchData;
175
208
  } else if (r == ONIG_MISMATCH) {
@@ -184,11 +217,267 @@ static VALUE oregexp_match( VALUE self, VALUE string ) {
184
217
 
185
218
  }
186
219
 
220
+ static const UChar BACKSLASH = 0x5c;
221
+
222
+ /* Additional backslash sequences work in substitution strings: \& (last match), \+ (last
223
+ matched group), \` (string prior to match), \' (string after match), and \\ (a literal
224
+ backslash). */
225
+
226
+ /* scan the replacement text, looking for substitutions (\n) and \escapes. */
227
+ static VALUE
228
+ oregexp_get_replacement(pat, src_text, repl_text, region)
229
+ VALUE pat,
230
+ src_text,
231
+ repl_text;
232
+ OnigRegion * region;
233
+ {
234
+ ORegexp *oregexp;
235
+ VALUE ret;
236
+ int32_t replIdx = 0;
237
+ int32_t replacementLength = RSTRING(repl_text)->len;
238
+ UChar *replacementText = RSTRING(repl_text)->ptr;
239
+ UChar *replacementEnd = replacementText + (replacementLength-1);
240
+ long numDigits = 0;
241
+ long groupNum = 0, g_start, g_end;
242
+ OnigCodePoint digitC;
243
+ OnigEncoding enc;
244
+ const UChar * matchText;
245
+ long matchLen;
246
+
247
+ matchText = RSTRING(src_text)->ptr;
248
+ matchLen = RSTRING(src_text)->len;
249
+ Data_Get_Struct( pat, ORegexp, oregexp );
250
+ enc = onig_get_encoding( oregexp->reg );
251
+
252
+ ret = rb_str_buf_new(RSTRING(repl_text)->len);
253
+
254
+ while (replIdx < replacementLength) {
255
+ OnigCodePoint c = ONIGENC_MBC_TO_CODE(enc, replacementText+replIdx, replacementEnd);
256
+ int c_len =ONIGENC_MBC_ENC_LEN(enc, replacementText+replIdx) ;
257
+ replIdx += c_len;
258
+ if ( c != BACKSLASH) {
259
+ /* Common case, no substitution, no escaping, */
260
+ /* just copy the char to the dest buf. */
261
+ rb_str_buf_cat( ret, replacementText+replIdx-c_len, c_len);
262
+ continue;
263
+ }
264
+ if (replIdx >= replacementLength) {
265
+ rb_str_buf_cat(ret, replacementText+(replIdx-c_len), c_len);
266
+ break;
267
+ }
268
+ /* Pick up a capture group number if one follows. */
269
+ numDigits = 0;
270
+ groupNum = 0;
271
+ for (;;) {
272
+ if (replIdx >= replacementLength) {
273
+ break;
274
+ }
275
+ digitC = ONIGENC_MBC_TO_CODE(enc, replacementText+replIdx, replacementEnd);
276
+ c_len = ONIGENC_MBC_ENC_LEN(enc, replacementText+replIdx) ;
277
+ if ( ! ONIGENC_IS_CODE_DIGIT(enc, digitC) ) {
278
+ break;
279
+ }
280
+ replIdx += c_len;
281
+ groupNum=groupNum*10 + (digitC - '0');
282
+ numDigits++;
283
+ if (numDigits >= 2) { /* limit 99 groups */
284
+ break;
285
+ }
286
+ }
287
+ if (numDigits == 0) {
288
+ /* Additional backslash sequences work in substitution strings: \& (last match), \+ (last
289
+ matched group), \` (string prior to match), \' (string after match), and \\ (a literal
290
+ backslash). */
291
+ int p_len = c_len;
292
+ c = ONIGENC_MBC_TO_CODE(enc, replacementText+replIdx, replacementEnd);
293
+ c_len = ONIGENC_MBC_ENC_LEN(enc, replacementText+replIdx) ;
294
+ switch(c) {
295
+ case '&' : // matched substring
296
+ rb_str_buf_cat(ret, matchText+region->beg[0], region->end[0] - region->beg[0]);
297
+ replIdx += c_len;
298
+ break;
299
+ case '`' : // prematch
300
+ rb_str_buf_cat(ret, matchText, region->beg[0]);
301
+ replIdx += c_len;
302
+ break;
303
+ case '\'': // postmatch
304
+ rb_str_buf_cat(ret, matchText+region->end[0], matchLen - region->end[0]);
305
+ replIdx += c_len;
306
+ break;
307
+ case '\\': // literal backslash
308
+ // place single backslash
309
+ rb_str_buf_cat(ret, replacementText+replIdx, c_len);
310
+ replIdx += c_len;
311
+ break;
312
+ case '+': // last matched group
313
+ replIdx += c_len;
314
+ for(groupNum = region->num_regs; groupNum > 0; groupNum --) {
315
+ g_start = region->beg[ groupNum ];
316
+ g_end = region->end[ groupNum ];
317
+ if( g_start != -1 ) {
318
+ rb_str_buf_cat(ret, matchText+g_start, g_end-g_start);
319
+ break;
320
+ }
321
+ }
322
+ break;
323
+
324
+ default:
325
+ rb_str_buf_cat(ret, replacementText+(replIdx-p_len), p_len+c_len);
326
+ replIdx += c_len;
327
+
328
+ }
329
+ } else {
330
+ /* Finally, append the capture group data to the destination. */
331
+ if( groupNum < region->num_regs && region->beg[groupNum] >= 0 && region->end[groupNum]>= region->beg[groupNum] ) {
332
+ rb_str_buf_cat(ret, matchText+region->beg[groupNum], region->end[groupNum]-region->beg[groupNum]);
333
+ }
334
+ }
335
+ }
336
+ return ret;
337
+ }
338
+
339
+ static inline void
340
+ str_mod_check(s, p, len)
341
+ VALUE s;
342
+ char *p;
343
+ long len;
344
+ {
345
+ if (RSTRING(s)->ptr != p || RSTRING(s)->len != len) {
346
+ rb_raise(rb_eRuntimeError, "string modified");
347
+ }
348
+ }
349
+
350
+ static VALUE
351
+ oregexp_gsub(self, argc, argv, bang, once, region)
352
+ VALUE self; // pattern
353
+ int argc; // should be 1 if block given
354
+ VALUE *argv; // either replacement string
355
+ int bang;
356
+ int once;
357
+ OnigRegion *region;
358
+ {
359
+ VALUE repl;
360
+ long beg,
361
+ end,
362
+ prev_end;
363
+ int tainted = 0,
364
+ iter = 0;
365
+
366
+ VALUE buf, curr_repl, block_res;
367
+ ORegexp *oregexp;
368
+
369
+ if (argc == 1 && rb_block_given_p()) {
370
+ iter = 1;
371
+ } else if (argc == 2) {
372
+ repl = argv[1];
373
+ Check_Type(repl, T_STRING);
374
+ if (OBJ_TAINTED(argv[1]))
375
+ tainted = 1;
376
+ } else {
377
+ rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
378
+ }
379
+ Data_Get_Struct( self, ORegexp, oregexp );
380
+
381
+ VALUE string_str = StringValue( argv[0] );
382
+ UChar* str_ptr = RSTRING(string_str)->ptr;
383
+ int str_len = RSTRING(string_str)->len;
384
+
385
+ beg = onig_search(oregexp->reg, str_ptr, str_ptr + str_len, str_ptr, str_ptr + str_len, region, ONIG_OPTION_NONE);
386
+
387
+ if (beg < 0) {
388
+ /* no match */
389
+ if (bang)
390
+ return Qnil;
391
+ return rb_str_dup(string_str);
392
+ }
393
+ end = 0;
394
+ buf = rb_str_buf_new(str_len);
395
+ do {
396
+ prev_end = end;
397
+ beg = region->beg[0];
398
+ end = region->end[0];
399
+ rb_str_buf_cat(buf, str_ptr+prev_end, beg-prev_end);
400
+ if ( iter ) {
401
+ VALUE match_data = oregexp_make_match_data( oregexp, region, string_str );
402
+ rb_backref_set(match_data);
403
+ if( once )
404
+ block_res = rb_yield( match_data );
405
+ else {
406
+ VALUE match_string = rb_str_new( str_ptr+beg, end-beg);
407
+ block_res = rb_yield_values(2, match_string, match_data );
408
+ }
409
+ str_mod_check( string_str, str_ptr, str_len);
410
+ curr_repl = rb_obj_as_string(block_res);
411
+ } else {
412
+ curr_repl = oregexp_get_replacement(self, string_str, repl, region);
413
+ }
414
+ rb_str_append(buf, curr_repl);
415
+ if( once ) break;
416
+ // find next match
417
+ beg=onig_search(oregexp->reg, str_ptr, str_ptr + str_len,
418
+ str_ptr+end, str_ptr + str_len,
419
+ region, ONIG_OPTION_NONE);
420
+ } while ( beg >= 0);
421
+ rb_str_buf_cat( buf, str_ptr+end, str_len - end);
422
+
423
+ if(tainted)
424
+ OBJ_INFECT(buf, repl);
425
+ OBJ_INFECT(buf, string_str);
426
+ if (bang) {
427
+ rb_funcall(string_str, rb_intern("replace"), 1, buf);
428
+ return string_str;
429
+ } else {
430
+ return buf;
431
+ }
432
+ }
433
+
434
+ typedef struct gsub_packet_t {
435
+ VALUE self; // pattern
436
+ int argc; // should be 1 if block given
437
+ VALUE *argv; // either replacement string
438
+ int bang;
439
+ int once;
440
+ OnigRegion *region;
441
+ } gsub_packet;
442
+ static VALUE oregexp_packed_gsub( gsub_packet* args ) {
443
+ return oregexp_gsub(args->self, args->argc, args->argv, args->bang, args->once, args->region);
444
+ }
445
+ void oregexp_cleanup_region(OnigRegion * region){
446
+ onig_region_free(region, 1);
447
+ }
448
+ static VALUE oregexp_safe_gsub(self, argc, argv, bang, once)
449
+ VALUE self; // pattern
450
+ int argc; // should be 1 if block given
451
+ VALUE *argv; // either replacement string
452
+ int bang;
453
+ int once;
454
+ {
455
+ OnigRegion * region = onig_region_new();
456
+ gsub_packet call_args = {self, argc, argv, bang, once, region};
457
+ return rb_ensure( oregexp_packed_gsub, (VALUE)&call_args, oregexp_cleanup_region, (VALUE)region);
458
+ }
459
+ static VALUE oregexp_m_gsub(int argc, VALUE *argv, VALUE self) {
460
+ return oregexp_safe_gsub(self, argc, argv, 0, 0);
461
+ }
462
+ static VALUE oregexp_m_sub(int argc, VALUE *argv, VALUE self) {
463
+ return oregexp_safe_gsub(self, argc, argv, 0, 1);
464
+ }
465
+
466
+ static VALUE oregexp_m_gsub_bang(int argc, VALUE *argv, VALUE self) {
467
+ return oregexp_safe_gsub(self, argc, argv, 1, 0);
468
+ }
469
+ static VALUE oregexp_m_sub_bang(int argc, VALUE *argv, VALUE self) {
470
+ return oregexp_safe_gsub(self, argc, argv, 1, 1);
471
+ }
472
+
187
473
  void Init_oregexp() {
188
474
  mOniguruma = rb_define_module("Oniguruma");
189
475
  VALUE cORegexp = rb_define_class_under(mOniguruma, "ORegexp", rb_cObject);
190
476
  rb_define_alloc_func(cORegexp, oregexp_allocate);
191
477
  rb_define_method( cORegexp, "initialize", oregexp_initialize, 2 );
192
478
  rb_define_method( cORegexp, "match", oregexp_match, 1 );
193
-
479
+ rb_define_method( cORegexp, "gsub", oregexp_m_gsub, -1 );
480
+ rb_define_method( cORegexp, "sub", oregexp_m_sub, -1 );
481
+ rb_define_method( cORegexp, "gsub!", oregexp_m_gsub_bang, -1 );
482
+ rb_define_method( cORegexp, "sub!", oregexp_m_sub_bang, -1 );
194
483
  }
data/lib/oniguruma.rb CHANGED
@@ -254,7 +254,7 @@ module Oniguruma
254
254
  return nil unless string
255
255
  m = match( string )
256
256
  return nil unless m
257
- m.begin
257
+ m.begin(0)
258
258
  end
259
259
 
260
260
  # call-seq:
@@ -289,7 +289,7 @@ module Oniguruma
289
289
  matches << m
290
290
  positions << position
291
291
  tmp_string = m.post_match
292
- position += m.end
292
+ position += m.end(0)
293
293
  #if m.end == m.begin
294
294
  # tmp_string = tmp_string[1..-1]
295
295
  # position += 1
@@ -304,51 +304,6 @@ module Oniguruma
304
304
  nil
305
305
  end
306
306
  end
307
-
308
- def sub string, replacement = nil
309
- matches = match( string )
310
- if matches
311
- replacement = yield matches[0] unless replacement
312
- string.sub( matches[0], replacement )
313
- else
314
- return string
315
- end
316
- end
317
-
318
- def gsub string, replacement = nil
319
- result = string
320
- matches = match_all( string )
321
- string_replace = replacement
322
- if matches
323
- matches.each do |m, p|
324
- replacement = yield( m[0], m ) unless string_replace
325
- result = result.sub( m[0], replacement )
326
- end
327
- end
328
- result
329
- end
330
-
331
- def sub! string, replacement = nil
332
- matches = match( string )
333
- if matches
334
- replacement = yield matches[0] unless replacement
335
- string.sub!( matches[0], replacement )
336
- else
337
- return string
338
- end
339
- end
340
-
341
- def gsub! string, replacement = nil
342
- matches = match_all( string )
343
- string_replace = replacement
344
- if matches
345
- matches.each do |m, p|
346
- replacement = yield( m[0], m ) unless string_replace
347
- string.sub!( m[0], replacement )
348
- end
349
- end
350
- string
351
- end
352
307
  end
353
308
 
354
309
  class MultiMatchData
@@ -371,11 +326,11 @@ module Oniguruma
371
326
  end
372
327
 
373
328
  def begin index
374
- @matches[index].begin + @positions[index]
329
+ @matches[index].begin(0) + @positions[index]
375
330
  end
376
331
 
377
332
  def end index
378
- @matches[index].end + @positions[index]
333
+ @matches[index].end(0) + @positions[index]
379
334
  end
380
335
 
381
336
  def length
@@ -402,90 +357,15 @@ module Oniguruma
402
357
  end
403
358
  end
404
359
 
405
- class MatchData
406
- def initialize( string, starts, ends, names )
407
- @string = string
408
- @starts = starts
409
- @ends = ends
410
- @matches = []
411
- @starts.size.times do |i|
412
- @matches << @string[@starts[i]...@ends[i]]
413
- end
414
- @match_count = @matches.size
415
- @start_pos = 0
416
- @names = names
417
- end
418
-
419
- def [] ( value1, value2 = nil )
420
- unless value2
421
- if index = to_index( value1 )
422
- @matches[index]
423
- else
424
- nil
425
- end
426
- else
427
- @matches[value1, value2]
428
- end
429
- end
430
-
431
- def to_index name
432
- if name.is_a? Symbol
433
- @names[name]
434
- else
435
- name
436
- end
437
- end
438
-
439
- def begin index = 0
440
- @starts[to_index( index )]
441
- end
442
-
443
- def end index = 0
444
- @ends[to_index( index )]
445
- end
446
-
447
- def captures
448
- @matches[1..-1]
449
- end
450
-
451
- def length
452
- @match_count
453
- end
454
- alias size length
455
-
456
- def offset index = 0
457
- [@starts[to_index( index )], @ends[to_index( index )]]
458
- end
459
-
460
- def post_match
461
- @string[@ends[0], @string.length]
462
- end
463
-
464
- def pre_match
465
- @string[0, @starts[0]]
466
- end
467
-
468
- def select &block
469
- @matches.select( &block )
470
- end
471
-
472
- def string
473
- @string.freeze
474
- end
475
-
476
- def to_a
477
- @matches
478
- end
479
-
480
- def to_s
481
- @matches[0]
482
- end
483
-
484
- def values_at *values
485
- result = []
486
- values.each { |v| result << @matches[v] }
487
- result
488
- end
489
- end
490
360
  end
491
-
361
+ class ::MatchData
362
+ alias old_aref :[]
363
+ def [](*idx)
364
+ if idx[0].is_a?(Symbol)
365
+ k = @named_captures && @named_captures[idx[0]]
366
+ k && old_aref(k)
367
+ else
368
+ old_aref(*idx)
369
+ end
370
+ end
371
+ end
@@ -210,5 +210,68 @@ class MatchDataTestCase < Test::Unit::TestCase
210
210
  assert_equal( ')', matches[:end] )
211
211
  assert_equal( nil, matches[:inexistent])
212
212
  end
213
+
214
+ def test_utf8_ignore_case
215
+ reg = Oniguruma::ORegexp.new( '([а-я])+', :options => Oniguruma::OPTION_IGNORECASE, :encoding => Oniguruma::ENCODING_UTF8 )
216
+ matches = reg.match("Text: Ехал Грека Через Реку")
217
+ assert_not_nil( matches )
218
+ assert_equal("Ехал", matches[0])
219
+ reg = Oniguruma::ORegexp.new( 'р(уби.*)', :options => Oniguruma::OPTION_IGNORECASE, :encoding => Oniguruma::ENCODING_UTF8 )
220
+ assert_equal("*убил бы*", reg.gsub("Руби", '*\1л бы*') )
221
+ end
222
+
223
+ def test_utf8_gsub
224
+ reg = Oniguruma::ORegexp.new( '([а-я])([а-я])([а-я]+)', :options => Oniguruma::OPTION_IGNORECASE, :encoding => Oniguruma::ENCODING_UTF8 )
225
+ new_str = reg.gsub("Text: Ехал Грека Через Реку") {|s,m| m[1]*2+m[2]*2+m[3] }
226
+ assert_equal("Text: ЕЕххал ГГррека ЧЧеерез РРееку", new_str)
227
+ end
228
+
229
+ def test_utf8_gsub2
230
+ reg = Oniguruma::ORegexp.new( '[а-я]', :options => Oniguruma::OPTION_IGNORECASE, :encoding => Oniguruma::ENCODING_UTF8 )
231
+ new_str = reg.gsub("Text: Ехал Грека Через Реку") {|s,m| s*2 }
232
+ assert_equal("Text: ЕЕххаалл ГГррееккаа ЧЧеерреезз РРееккуу", new_str)
233
+ end
213
234
 
235
+ def test_sub_compatibility
236
+ $x = "a.gif"
237
+ assert_equal("b.gif", $x.osub('.*\.([^\.]+)$', 'b.\1'))
238
+ assert_equal("\\.gif", $x.osub('.*\.([^\.]+)$', '\\.\1'))
239
+ assert_equal("gif", $x.osub('.*\.([^\.]+)$', '\1'))
240
+ assert_equal("", $x.osub('.*\.([^\.]+)$', '\2'))
241
+ assert_equal("ab", $x.osub('.*\.([^\.]+)$', 'a\2b'))
242
+ assert_equal("<a.gif>", $x.osub('.*\.([^\.]+)$', '<\&>'))
243
+ assert_equal("a.a.", $x.osub('(gif)', '\`') )
244
+ end
245
+
246
+ class ::String
247
+ def ogsub(*args)
248
+ Oniguruma::ORegexp.new(args.shift).gsub(self, *args)
249
+ end
250
+ def ogsub!(*args)
251
+ Oniguruma::ORegexp.new(args.shift).gsub!(self, *args)
252
+ end
253
+ def osub(re, *args)
254
+ Oniguruma::ORegexp.new( re ).sub(self, *args)
255
+ end
256
+ end
257
+
258
+ def test_gsub_compat
259
+ assert_equal("hello".ogsub('[aeiou]', '*') , "h*ll*")
260
+ assert_equal("hello".ogsub('([aeiou])', '<\1>') , "h<e>ll<o>")
261
+ i = 0
262
+ assert_equal("12345" , Oniguruma::ORegexp.new('.').gsub("hello") {|s,m| i+=1; i.to_s})
263
+ assert_equal("214365", Oniguruma::ORegexp.new('(.)(.)').gsub("123456") {|s,m| m[2] + m[1] })
264
+ a = "test"
265
+ a.ogsub!('t', a)
266
+ assert_equal("testestest", a)
267
+ end
268
+
269
+ def test_match_compat
270
+ t = Oniguruma::ORegexp.new('(.)(.)').gsub("123456") {|s,m| "#$2#$1" }
271
+ assert_equal("214365", t )
272
+ t = Oniguruma::ORegexp.new('([aeiou])').gsub("hello") {|s,m| "<#$1>" }
273
+ assert_equal( "h<e>ll<o>", t)
274
+ end
275
+
276
+
214
277
  end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
3
3
  specification_version: 1
4
4
  name: oniguruma
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.9.0
7
- date: 2007-03-22 00:00:00 +01:00
6
+ version: 0.9.1
7
+ date: 2007-03-25 00:00:00 +01:00
8
8
  summary: Bindings for the oniguruma regular expression library
9
9
  require_paths:
10
10
  - lib
@@ -33,6 +33,7 @@ files:
33
33
  - History.txt
34
34
  - Manifest.txt
35
35
  - README.txt
36
+ - Syntax.txt
36
37
  - Rakefile
37
38
  - lib/oniguruma.rb
38
39
  - ext/oregexp.c