oniguruma 0.9.0 → 0.9.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +7 -1
- data/Manifest.txt +1 -0
- data/README.txt +7 -2
- data/Rakefile +1 -1
- data/Syntax.txt +396 -0
- data/ext/extconf.rb +1 -0
- data/ext/oregexp.c +316 -27
- data/lib/oniguruma.rb +15 -135
- data/test/test_oniguruma.rb +63 -0
- metadata +3 -2
data/History.txt
CHANGED
@@ -1,4 +1,10 @@
|
|
1
|
-
==
|
1
|
+
== 0.9.1 / 2007-03-25
|
2
|
+
* FIX: Buggy resolution of numeric codes for encoding and syntax options (Nikolai Lugovoi)
|
3
|
+
* FIX: Buggy implementation of ORegexp#sub and ORegexp#gsub methods. Now code is all C (Nikolai Lugovoi)
|
4
|
+
* Added documentation for class ORegexp
|
5
|
+
* Added regexp syntax documentation.
|
6
|
+
|
7
|
+
== 0.9.0 / 2007-03-19
|
2
8
|
|
3
9
|
* 1 major enhancement
|
4
10
|
* Birthday!
|
data/Manifest.txt
CHANGED
data/README.txt
CHANGED
@@ -16,6 +16,10 @@ Ruby bindings to the Oniguruma[http://www.geocities.jp/kosako3/oniguruma/] regul
|
|
16
16
|
puts match[0] <= 'terraforming'
|
17
17
|
puts match[:before] <= 'terr'
|
18
18
|
puts match[:after] <= 'forming'
|
19
|
+
|
20
|
+
== SYNTAX
|
21
|
+
|
22
|
+
Consult the Syntax.txt[link:files/Syntax_txt.html] page.
|
19
23
|
|
20
24
|
== REQUIREMENTS:
|
21
25
|
|
@@ -39,8 +43,9 @@ sudo gem install -r oniguruma
|
|
39
43
|
|
40
44
|
== CREDITS:
|
41
45
|
|
42
|
-
*
|
43
|
-
*
|
46
|
+
* N. Lugovoi. ORegexp.sub and ORegexp.gsub code, plus other patches.
|
47
|
+
* K. Kosako. For his great library.
|
48
|
+
* A lot of the documentation has been copied from the original Ruby Regex documentation.
|
44
49
|
|
45
50
|
== LICENSE:
|
46
51
|
|
data/Rakefile
CHANGED
@@ -3,7 +3,7 @@ require 'hoe'
|
|
3
3
|
|
4
4
|
class Hoe; def extra_deps; @extra_deps.reject { |x| Array(x).first == 'hoe' }; end end
|
5
5
|
|
6
|
-
Hoe.new('oniguruma', '0.9.
|
6
|
+
Hoe.new('oniguruma', '0.9.1') do |p|
|
7
7
|
p.rubyforge_name = 'oniguruma'
|
8
8
|
p.author = 'Dizan Vasquez'
|
9
9
|
p.email = 'dix_ans@yahoo.com'
|
data/Syntax.txt
ADDED
@@ -0,0 +1,396 @@
|
|
1
|
+
= RUBY REGULAR EXPRESSION SYNTAX
|
2
|
+
|
3
|
+
|
4
|
+
== Syntax Elements
|
5
|
+
|
6
|
+
[\] escape (enable or disable meta character meaning)
|
7
|
+
[|] alternation
|
8
|
+
[(...)] group
|
9
|
+
[[...]] character class
|
10
|
+
|
11
|
+
|
12
|
+
== Characters
|
13
|
+
|
14
|
+
[\t] horizontal tab (0x09)
|
15
|
+
[\v] vertical tab (0x0B)
|
16
|
+
[\n] newline (0x0A)
|
17
|
+
[\r] return (0x0D)
|
18
|
+
[\b] back space (0x08)
|
19
|
+
|
20
|
+
\b is effective in character class [...] only
|
21
|
+
[\f] form feed (0x0C)
|
22
|
+
[\a] bell (0x07)
|
23
|
+
[\e] escape (0x1B)
|
24
|
+
[\nnn] octal char (encoded byte value)
|
25
|
+
[\xHH] hexadecimal char (encoded byte value)
|
26
|
+
[\x{7HHHHHHH}] wide hexadecimal char (character code point value)
|
27
|
+
[\cx] control char (character code point value)
|
28
|
+
[\C-x] control char (character code point value)
|
29
|
+
[\M-x] meta (x|0x80) (character code point value)
|
30
|
+
[\M-\C-x] meta control char (character code point value)
|
31
|
+
|
32
|
+
|
33
|
+
|
34
|
+
== Character types
|
35
|
+
|
36
|
+
[.] any character (except newline)
|
37
|
+
[\w] word character
|
38
|
+
|
39
|
+
Not Unicode:
|
40
|
+
* alphanumeric, "_" and multibyte char.
|
41
|
+
Unicode:
|
42
|
+
* General_Category -- (Letter|Mark|Number|Connector_Punctuation)
|
43
|
+
[\W] non word char
|
44
|
+
[\s] whitespace char
|
45
|
+
|
46
|
+
Not Unicode:
|
47
|
+
* \t, \n, \v, \f, \r, \x20
|
48
|
+
Unicode:
|
49
|
+
* 0009, 000A, 000B, 000C, 000D, 0085(NEL),
|
50
|
+
* General_Category:
|
51
|
+
* -- Line_Separator
|
52
|
+
* -- Paragraph_Separator
|
53
|
+
* -- Space_Separator
|
54
|
+
[\S] non whitespace char
|
55
|
+
[\d] decimal digit char
|
56
|
+
|
57
|
+
Unicode: General_Category -- Decimal_Number
|
58
|
+
[\D] non decimal digit char
|
59
|
+
[\h] hexadecimal digit char [0-9a-fA-F]
|
60
|
+
[\H] non hexadecimal digit char
|
61
|
+
|
62
|
+
|
63
|
+
== Character Properties
|
64
|
+
|
65
|
+
\p{property-name}
|
66
|
+
\p{^property-name} (negative)
|
67
|
+
\P{property-name} (negative)
|
68
|
+
|
69
|
+
=== property-name:
|
70
|
+
|
71
|
+
Works on all encodings:
|
72
|
+
* Alnum, Alpha, Blank, Cntrl, Digit, Graph, Lower,
|
73
|
+
Print, Punct, Space, Upper, XDigit, Word, ASCII,
|
74
|
+
Works on EUC_JP, Shift_JIS:
|
75
|
+
* Hiragana, Katakana
|
76
|
+
Works on UTF8, UTF16, UTF32:
|
77
|
+
* Any, Assigned, C, Cc, Cf, Cn, Co, Cs, L, Ll, Lm, Lo, Lt, Lu,
|
78
|
+
M, Mc, Me, Mn, N, Nd, Nl, No, P, Pc, Pd, Pe, Pf, Pi, Po, Ps,
|
79
|
+
S, Sc, Sk, Sm, So, Z, Zl, Zp, Zs,
|
80
|
+
Arabic, Armenian, Bengali, Bopomofo, Braille, Buginese,
|
81
|
+
Buhid, Canadian_Aboriginal, Cherokee, Common, Coptic,
|
82
|
+
Cypriot, Cyrillic, Deseret, Devanagari, Ethiopic, Georgian,
|
83
|
+
Glagolitic, Gothic, Greek, Gujarati, Gurmukhi, Han, Hangul,
|
84
|
+
Hanunoo, Hebrew, Hiragana, Inherited, Kannada, Katakana,
|
85
|
+
Kharoshthi, Khmer, Lao, Latin, Limbu, Linear_B, Malayalam,
|
86
|
+
Mongolian, Myanmar, New_Tai_Lue, Ogham, Old_Italic, Old_Persian,
|
87
|
+
Oriya, Osmanya, Runic, Shavian, Sinhala, Syloti_Nagri, Syriac,
|
88
|
+
Tagalog, Tagbanwa, Tai_Le, Tamil, Telugu, Thaana, Thai, Tibetan,
|
89
|
+
Tifinagh, Ugaritic, Yi
|
90
|
+
|
91
|
+
== Quantifiers
|
92
|
+
|
93
|
+
=== Greedy
|
94
|
+
|
95
|
+
[?] 1 or 0 times
|
96
|
+
[*] 0 or more times
|
97
|
+
[+] 1 or more times
|
98
|
+
[{n,m}] at least n but not more than m times
|
99
|
+
[{n,}] at least n times
|
100
|
+
[{,n}] at least 0 but not more than n times ({0,n})
|
101
|
+
[{n}] n times
|
102
|
+
|
103
|
+
=== Reluctant
|
104
|
+
|
105
|
+
[??] 1 or 0 times
|
106
|
+
[*?] 0 or more times
|
107
|
+
[+?] 1 or more times
|
108
|
+
[{n,m}?] at least n but not more than m times
|
109
|
+
[{n,}?] at least n times
|
110
|
+
[{,n}?] at least 0 but not more than n times (== {0,n}?)
|
111
|
+
|
112
|
+
=== Possessive (greedy and does not backtrack after repeated)
|
113
|
+
|
114
|
+
[?+] 1 or 0 times
|
115
|
+
[*+] 0 or more times
|
116
|
+
[++] 1 or more times
|
117
|
+
|
118
|
+
({n,m}+, {n,}+, {n}+ are possessive op. in ONIG_SYNTAX_JAVA only)
|
119
|
+
|
120
|
+
|
121
|
+
== Anchors
|
122
|
+
|
123
|
+
[^] beginning of the line
|
124
|
+
[$] end of the line
|
125
|
+
[\b] word boundary
|
126
|
+
[\B] not word boundary
|
127
|
+
[\A] beginning of string
|
128
|
+
[\Z] end of string, or before newline at the end
|
129
|
+
[\z] end of string
|
130
|
+
[\G] matching start position
|
131
|
+
|
132
|
+
|
133
|
+
== Character class
|
134
|
+
|
135
|
+
[^...] negative class (lowest precedence operator)
|
136
|
+
[x-y] range from x to y
|
137
|
+
[[...]] set (character class in character class)
|
138
|
+
[..&&..] intersection (low precedence, just above ^)
|
139
|
+
|
140
|
+
If you want to use '[', '-', ']' as a normal character
|
141
|
+
in a character class, you should escape these characters by '\'.
|
142
|
+
|
143
|
+
|
144
|
+
POSIX bracket ([:xxxxx:], negate [:^xxxxx:])
|
145
|
+
|
146
|
+
=== Not Unicode Case:
|
147
|
+
|
148
|
+
[alnum] alphabet or digit char
|
149
|
+
[alpha] alphabet
|
150
|
+
[ascii] code value: [0 - 127]
|
151
|
+
[blank] \t, \x20
|
152
|
+
[cntrl] control
|
153
|
+
[digit] 0-9
|
154
|
+
[graph] include all of multibyte encoded characters
|
155
|
+
[lower] lower case
|
156
|
+
[print] include all of multibyte encoded characters
|
157
|
+
[punct] punctuation
|
158
|
+
[space] \t, \n, \v, \f, \r, \x20
|
159
|
+
[upper] upper case
|
160
|
+
[xdigit] 0-9, a-f, A-F
|
161
|
+
[word] alphanumeric, "_" and multibyte characters
|
162
|
+
|
163
|
+
|
164
|
+
=== Unicode Case:
|
165
|
+
|
166
|
+
[alnum] Letter | Mark | Decimal_Number
|
167
|
+
[alpha] Letter | Mark
|
168
|
+
[ascii] 0000 - 007F
|
169
|
+
[blank] Space_Separator | 0009
|
170
|
+
[cntrl] Control | Format | Unassigned | Private_Use | Surrogate
|
171
|
+
[digit] Decimal_Number
|
172
|
+
[graph] [[:^space:]] && ^Control && ^Unassigned && ^Surrogate
|
173
|
+
[lower] Lowercase_Letter
|
174
|
+
[print] [[:graph:]] | [[:space:]]
|
175
|
+
[punct] Connector_Punctuation | Dash_Punctuation | Close_Punctuation |
|
176
|
+
Final_Punctuation | Initial_Punctuation | Other_Punctuation |
|
177
|
+
Open_Punctuation
|
178
|
+
[space] Space_Separator | Line_Separator | Paragraph_Separator |
|
179
|
+
0009 | 000A | 000B | 000C | 000D | 0085
|
180
|
+
[upper] Uppercase_Letter
|
181
|
+
[xdigit] 0030 - 0039 | 0041 - 0046 | 0061 - 0066
|
182
|
+
(0-9, a-f, A-F)
|
183
|
+
[word] Letter | Mark | Decimal_Number | Connector_Punctuation
|
184
|
+
|
185
|
+
|
186
|
+
|
187
|
+
== Extended groups
|
188
|
+
|
189
|
+
[(?#...)] comment
|
190
|
+
[(?imx-imx)] option on/off:
|
191
|
+
* i: ignore case
|
192
|
+
* m: multi-line (dot(.) match newline)
|
193
|
+
* x: extended form
|
194
|
+
[(?imx-imx:subexp)] option on/off for subexp
|
195
|
+
[(?:subexp)] not captured group
|
196
|
+
[(subexp)] captured group
|
197
|
+
[(?=subexp)] look-ahead
|
198
|
+
[(?!subexp)] negative look-ahead
|
199
|
+
[(?<=subexp)] look-behind
|
200
|
+
[(?<!subexp)] negative look-behind
|
201
|
+
|
202
|
+
Subexp of look-behind must be fixed character length.
|
203
|
+
But different character length is allowed in top level
|
204
|
+
alternatives only.
|
205
|
+
ex. (?<=a|bc) is OK. (?<=aaa(?:b|cd)) is not allowed.
|
206
|
+
|
207
|
+
In negative-look-behind, captured group isn't allowed,
|
208
|
+
but shy group(?:) is allowed.
|
209
|
+
[(?>subexp)] atomic group
|
210
|
+
don't backtrack in subexp.
|
211
|
+
[(?<name>subexp)] define named group
|
212
|
+
(All characters of the name must be a word character.)
|
213
|
+
|
214
|
+
Not only a name but a number is assigned like a captured
|
215
|
+
group.
|
216
|
+
|
217
|
+
Assigning the same name to two or more subexps is allowed.
|
218
|
+
In this case, a subexp call can not be performed although
|
219
|
+
the back reference is possible.
|
220
|
+
|
221
|
+
|
222
|
+
== Back reference
|
223
|
+
|
224
|
+
[\n] back reference by group number (n >= 1)
|
225
|
+
[\k<name>] back reference by group name
|
226
|
+
In the back reference by the multiplex definition name,
|
227
|
+
a subexp with a large number is referred to preferentially.
|
228
|
+
(When not matched, a group of the small number is referred to.)
|
229
|
+
|
230
|
+
* Back reference by group number is forbidden if named group is defined
|
231
|
+
in the pattern and ONIG_OPTION_CAPTURE_GROUP is not set.
|
232
|
+
|
233
|
+
|
234
|
+
=== Back reference with nest level
|
235
|
+
|
236
|
+
[\k<name+n>] n: 0, 1, 2, ...
|
237
|
+
[\k<name-n>] n: 0, 1, 2, ...
|
238
|
+
|
239
|
+
Designates the nest level relative to the back reference position.
|
240
|
+
|
241
|
+
Examples:
|
242
|
+
/\A(?<a>|.|(?:(?<b>.)\g<a>\k<b+0>))\z/.match("reer")
|
243
|
+
|
244
|
+
r = ORegexp.compile(<<'__REGEXP__'.strip, :options => Oniguruma::EXTENDED)
|
245
|
+
(?<element> \g<stag> \g<content>* \g<etag> ){0}
|
246
|
+
(?<stag> < \g<name> \s* > ){0}
|
247
|
+
(?<name> [a-zA-Z_:]+ ){0}
|
248
|
+
(?<content> [^<&]+ (\g<element> | [^<&]+)* ){0}
|
249
|
+
(?<etag> </ \k<name+1> >){0}
|
250
|
+
\g<element>
|
251
|
+
__REGEXP__
|
252
|
+
|
253
|
+
p r.match('<foo>f<bar>bbb</bar>f</foo>').captures
|
254
|
+
|
255
|
+
|
256
|
+
|
257
|
+
=== Subexp call ("Tanaka Akira special")
|
258
|
+
|
259
|
+
[\g<name>] call by group name
|
260
|
+
[\g<n>] call by group number (n >= 1)
|
261
|
+
|
262
|
+
* left-most recursive call is not allowed.
|
263
|
+
|
264
|
+
Example:
|
265
|
+
(?<name>a|\g<name>b) => error
|
266
|
+
(?<name>a|b\g<name>c) => OK
|
267
|
+
* Call by group number is forbidden if named group is defined in the pattern
|
268
|
+
and Oniguruma::OPTION_CAPTURE_GROUP is not set.
|
269
|
+
* If the option status of called group is different from calling position
|
270
|
+
then the group's option is effective.
|
271
|
+
|
272
|
+
Example:
|
273
|
+
(?-i:\g<name>)(?i:(?<name>a)){0} <i>matches "A"</i>
|
274
|
+
|
275
|
+
|
276
|
+
== Captured group
|
277
|
+
|
278
|
+
Behavior of the no-named group (...) changes with the following conditions.
|
279
|
+
(But named group is not changed.)
|
280
|
+
|
281
|
+
[case 1] <code>ORegexp.new( '...' )</code> (named group is not used, no option)
|
282
|
+
|
283
|
+
... is treated as a captured group.
|
284
|
+
[case 2] <code>ORegexp.new( '...', :options => OPTION_DONT_CAPTURE_GROUP )</code> (named group is not used, 'g' option)
|
285
|
+
|
286
|
+
... is treated as a no-captured group (?:...).
|
287
|
+
|
288
|
+
[case 3] <code>ORegexp.new( '...(?<name>...)...' )</code> (named group is used, no option)
|
289
|
+
|
290
|
+
(...) is treated as a no-captured group (?:...)
|
291
|
+
|
292
|
+
numbered-backref/call is not allowed.
|
293
|
+
|
294
|
+
[case 4] <code>ORegexp.new( '...(?<name>...)...', :options => OPTION_CAPTURE_GROUP )</code> (named group is used, 'G' option)
|
295
|
+
|
296
|
+
(...) is treated as a captured group.
|
297
|
+
|
298
|
+
numbered-backref/call is allowed.
|
299
|
+
|
300
|
+
where
|
301
|
+
* g: OPTION_DONT_CAPTURE_GROUP
|
302
|
+
* G: OPTION_CAPTURE_GROUP
|
303
|
+
|
304
|
+
('g' and 'G' options are discussed on the ruby-dev ML)
|
305
|
+
|
306
|
+
|
307
|
+
== Syntax dependent options
|
308
|
+
|
309
|
+
=== ONIG_SYNTAX_RUBY
|
310
|
+
|
311
|
+
[(?m)] dot(.) match newline
|
312
|
+
|
313
|
+
=== ONIG_SYNTAX_PERL and ONIG_SYNTAX_JAVA
|
314
|
+
|
315
|
+
[(?s)] dot(.) match newline
|
316
|
+
[(?m)] ^ match after newline, $ match before newline
|
317
|
+
|
318
|
+
== Original extensions
|
319
|
+
|
320
|
+
* hexadecimal digit char type \h, \H
|
321
|
+
* named group (?<name>...)
|
322
|
+
* named backref \k<name>
|
323
|
+
* subexp call \g<name>, \g<group-num>
|
324
|
+
|
325
|
+
|
326
|
+
== Lacking features compared with Perl 5.8.0
|
327
|
+
|
328
|
+
* \N{name}
|
329
|
+
* \l,\u,\L,\U, \X, \C
|
330
|
+
* (?{code})
|
331
|
+
* (??{code})
|
332
|
+
* (?(condition)yes-pat|no-pat)
|
333
|
+
* \Q...\E
|
334
|
+
|
335
|
+
This is effective on ONIG_SYNTAX_PERL and ONIG_SYNTAX_JAVA.
|
336
|
+
|
337
|
+
|
338
|
+
== Differences with Japanized GNU regex(version 0.12) of Ruby 1.8
|
339
|
+
|
340
|
+
* add character property (\p{property}, \P{property})
|
341
|
+
* add hexadecimal digit char type (\h, \H)
|
342
|
+
* add look-behind
|
343
|
+
|
344
|
+
(?<=fixed-char-length-pattern), (?<!fixed-char-length-pattern)
|
345
|
+
* add possessive quantifier. ?+, *+, ++
|
346
|
+
* add operations in character class. [], &&
|
347
|
+
|
348
|
+
('[' must be escaped to be used as a normal char in a character class.)
|
349
|
+
* add named group and subexp call.
|
350
|
+
* octal or hexadecimal number sequence can be treated as
|
351
|
+
a multibyte code char in character class if multibyte encoding
|
352
|
+
is specified.
|
353
|
+
|
354
|
+
(ex. <code>[\xa1\xa2], [\xa1\xa7-\xa4\xa1]</code>)
|
355
|
+
* allow the range of single byte char and multibyte char in character
|
356
|
+
class.
|
357
|
+
|
358
|
+
ex. <code>[a-<<any EUC-JP character>>]</code> in EUC-JP encoding.
|
359
|
+
* effect range of isolated option is to next ')'.
|
360
|
+
ex. (?:(?i)a|b) is interpreted as (?:(?i:a|b)), not (?:(?i:a)|b).
|
361
|
+
* isolated option is not transparent to previous pattern.
|
362
|
+
ex. <code>a(?i)*</code> is a syntax error pattern.
|
363
|
+
* allow an incomplete left brace as a usual string.
|
364
|
+
ex. /{/, /({)/, /a{2,3/ etc...
|
365
|
+
* negative POSIX bracket [:^xxxx:] is supported.
|
366
|
+
* POSIX bracket [:ascii:] is added.
|
367
|
+
* repeat of look-ahead is not allowed.
|
368
|
+
ex. <code>(?=a)*</code>, <code>(?!b){5}</code>
|
369
|
+
* Ignore case option is effective to numbered character.
|
370
|
+
ex. <code>/\x61/i =~ "A"</code>
|
371
|
+
* In the range quantifier, the minimum count may be omitted.
|
372
|
+
|
373
|
+
<code>/a{,n}/ == /a{0,n}/</code>
|
374
|
+
|
375
|
+
The simultaneous omission of the number of times of the minimum
|
376
|
+
and the maximum is not allowed. (/a{,}/)
|
377
|
+
* <code>a{n}?</code> is not a non-greedy operator.
|
378
|
+
<code>/a{n}?/ == /(?:a{n})?/</code>
|
379
|
+
* invalid back reference is checked and causes an error.
|
380
|
+
/\1/, /(a)\2/
|
381
|
+
* Zero-length match in infinite repeat stops the repeat,
|
382
|
+
then changes of the capture group status are checked as stop condition.
|
383
|
+
/(?:()|())*\1\2/ =~ ""
|
384
|
+
/(?:\1a|())*/ =~ "a"
|
385
|
+
|
386
|
+
|
387
|
+
== Problems
|
388
|
+
|
389
|
+
* Invalid encoding byte sequence is not checked in UTF-8.
|
390
|
+
|
391
|
+
* Invalid first byte is treated as a character.
|
392
|
+
/./u =~ "\xa3"
|
393
|
+
|
394
|
+
* Incomplete byte sequence is not checked.
|
395
|
+
/\w+/ =~ "a\xf3\x8ec"
|
396
|
+
|
data/ext/extconf.rb
CHANGED
data/ext/oregexp.c
CHANGED
@@ -24,7 +24,10 @@ static VALUE oregexp_allocate( VALUE klass ) {
|
|
24
24
|
}
|
25
25
|
|
26
26
|
|
27
|
-
static OnigEncodingType * int2encoding(
|
27
|
+
static OnigEncodingType * int2encoding( VALUE v_index ) {
|
28
|
+
int index;
|
29
|
+
if( ! NIL_P(v_index) ) {
|
30
|
+
index = FIX2INT(v_index);
|
28
31
|
switch( index ) {
|
29
32
|
case 0: return ONIG_ENCODING_ASCII;
|
30
33
|
case 1: return ONIG_ENCODING_ISO_8859_1;
|
@@ -60,10 +63,14 @@ static OnigEncodingType * int2encoding( int index ) {
|
|
60
63
|
case 31: return ONIG_ENCODING_GB18030;
|
61
64
|
case 32: return ONIG_ENCODING_UNDEF;
|
62
65
|
}
|
66
|
+
}
|
63
67
|
return ONIG_ENCODING_UNDEF;
|
64
68
|
}
|
65
69
|
|
66
|
-
static OnigSyntaxType * int2syntax(
|
70
|
+
static OnigSyntaxType * int2syntax( VALUE v_index ) {
|
71
|
+
int index;
|
72
|
+
if( ! NIL_P(v_index) ) {
|
73
|
+
index = FIX2INT(v_index);
|
67
74
|
switch( index ) {
|
68
75
|
case 0: return ONIG_SYNTAX_ASIS;
|
69
76
|
case 1: return ONIG_SYNTAX_POSIX_BASIC;
|
@@ -77,25 +84,32 @@ static OnigSyntaxType * int2syntax( int index ) {
|
|
77
84
|
case 9: return ONIG_SYNTAX_RUBY;
|
78
85
|
case 10: return ONIG_SYNTAX_DEFAULT;
|
79
86
|
}
|
87
|
+
}
|
80
88
|
return ONIG_SYNTAX_DEFAULT;
|
81
89
|
}
|
82
90
|
|
91
|
+
struct callback_packet {
|
92
|
+
VALUE hash;
|
93
|
+
OnigRegion * region;
|
94
|
+
};
|
95
|
+
|
83
96
|
static int name_callback(
|
84
97
|
const UChar* name,
|
85
98
|
const UChar* name_end,
|
86
99
|
int ngroup_num,
|
87
100
|
int* group_nums,
|
88
101
|
regex_t* reg,
|
89
|
-
|
102
|
+
struct callback_packet* arg
|
90
103
|
) {
|
91
104
|
int i, gn, ref;
|
92
|
-
OnigRegion *region =
|
105
|
+
OnigRegion *region = arg->region;
|
106
|
+
VALUE nameHash = arg->hash;
|
93
107
|
|
94
108
|
for (i = 0; i < ngroup_num; i++) {
|
95
109
|
gn = group_nums[i];
|
96
110
|
ref = onig_name_to_backref_number(reg, name, name_end, region);
|
97
111
|
if (ref != gn )
|
98
|
-
|
112
|
+
return 1;
|
99
113
|
rb_hash_aset( nameHash, ID2SYM(rb_intern(name)), INT2FIX( gn ) );
|
100
114
|
}
|
101
115
|
return 0;
|
@@ -110,13 +124,16 @@ static VALUE oregexp_initialize( VALUE self, VALUE pattern, VALUE options ) {
|
|
110
124
|
rb_iv_set( self, "@options", options );
|
111
125
|
UChar* pat_ptr = RSTRING(pattern_str)->ptr;
|
112
126
|
int pat_len = RSTRING(pattern_str)->len;
|
127
|
+
if( pat_len == 0 ) {
|
128
|
+
rb_raise(rb_eArgError, "Empty pattern makes no sense.");
|
129
|
+
}
|
113
130
|
|
114
131
|
VALUE rOptions = rb_hash_aref( options, ID2SYM( rb_intern( "options" ) ) );
|
115
132
|
VALUE rEncoding = rb_hash_aref( options, ID2SYM( rb_intern( "encoding" ) ) );
|
116
133
|
VALUE rSyntax = rb_hash_aref( options, ID2SYM( rb_intern( "syntax" ) ) );
|
117
134
|
int iOptions = NUM2INT( rOptions );
|
118
|
-
|
119
|
-
|
135
|
+
OnigEncodingType * iEncoding = int2encoding( rEncoding );
|
136
|
+
OnigSyntaxType * iSyntax = int2syntax( rSyntax );
|
120
137
|
|
121
138
|
|
122
139
|
int r;
|
@@ -130,6 +147,40 @@ static VALUE oregexp_initialize( VALUE self, VALUE pattern, VALUE options ) {
|
|
130
147
|
return self;
|
131
148
|
}
|
132
149
|
|
150
|
+
struct RMatch {
|
151
|
+
struct RBasic basic;
|
152
|
+
VALUE str;
|
153
|
+
struct re_registers *regs;
|
154
|
+
};
|
155
|
+
|
156
|
+
static VALUE oregexp_make_match_data(ORegexp * oregexp, OnigRegion * region, VALUE string_str) {
|
157
|
+
VALUE rb_cMatch = rb_const_get(rb_cObject, rb_intern("MatchData")) ;
|
158
|
+
NEWOBJ(match, struct RMatch);
|
159
|
+
OBJSETUP(match, rb_cMatch, T_MATCH);
|
160
|
+
VALUE kORegexp = rb_const_get( mOniguruma, rb_intern( "ORegexp" ) ) ;
|
161
|
+
int i , count = region->num_regs;
|
162
|
+
struct callback_packet packet;
|
163
|
+
|
164
|
+
match->str = rb_str_new4(string_str);
|
165
|
+
match->regs = ALLOC(struct re_registers);
|
166
|
+
match->regs->allocated = count+1;
|
167
|
+
match->regs->num_regs = count;
|
168
|
+
match->regs->beg = ALLOC_N(int, (count+1));
|
169
|
+
match->regs->end = ALLOC_N(int, (count+1));
|
170
|
+
|
171
|
+
for ( i = 0; i <= count; i++){
|
172
|
+
match->regs->beg[i] = region->beg[i];
|
173
|
+
match->regs->end[i] = region->end[i];
|
174
|
+
}
|
175
|
+
rb_cv_set( kORegexp, "@@last_match", (VALUE)match );
|
176
|
+
packet.region = region;
|
177
|
+
packet.hash = rb_hash_new();
|
178
|
+
if( onig_foreach_name(oregexp->reg, name_callback, &packet) )
|
179
|
+
rb_raise(rb_eException, "Oniguruma Error: group and backreference names are different");
|
180
|
+
rb_iv_set((VALUE)match, "@named_captures", packet.hash);
|
181
|
+
return (VALUE)match;
|
182
|
+
}
|
183
|
+
|
133
184
|
/*
|
134
185
|
* call-seq:
|
135
186
|
* rxp.match(str) => matchdata or nil
|
@@ -151,25 +202,7 @@ static VALUE oregexp_match( VALUE self, VALUE string ) {
|
|
151
202
|
OnigRegion *region = onig_region_new();
|
152
203
|
int r = onig_search(oregexp->reg, str_ptr, str_ptr + str_len, str_ptr, str_ptr + str_len, region, ONIG_OPTION_NONE);
|
153
204
|
if (r >= 0) {
|
154
|
-
|
155
|
-
VALUE begins = rb_ary_new();
|
156
|
-
VALUE ends = rb_ary_new();
|
157
|
-
nameHash = rb_hash_new();
|
158
|
-
|
159
|
-
onig_foreach_name(oregexp->reg, name_callback, (void* )region);
|
160
|
-
|
161
|
-
|
162
|
-
int i;
|
163
|
-
|
164
|
-
for (i = 0; i < region->num_regs; i++) {
|
165
|
-
rb_ary_push( begins, INT2FIX( region->beg[i] ) );
|
166
|
-
rb_ary_push( ends, INT2FIX( region->end[i] ) );
|
167
|
-
}
|
168
|
-
VALUE kMatchData = rb_const_get( mOniguruma, rb_intern( "MatchData" ) );
|
169
|
-
VALUE kORegexp = rb_const_get( mOniguruma, rb_intern( "ORegexp" ) );
|
170
|
-
VALUE matchData = rb_funcall(kMatchData, rb_intern("new"), 4, string_str, begins, ends, nameHash );
|
171
|
-
rb_cv_set( kORegexp, "@@last_match", matchData );
|
172
|
-
|
205
|
+
VALUE matchData = oregexp_make_match_data( oregexp, region, string_str);
|
173
206
|
onig_region_free(region, 1 );
|
174
207
|
return matchData;
|
175
208
|
} else if (r == ONIG_MISMATCH) {
|
@@ -184,11 +217,267 @@ static VALUE oregexp_match( VALUE self, VALUE string ) {
|
|
184
217
|
|
185
218
|
}
|
186
219
|
|
220
|
+
static const UChar BACKSLASH = 0x5c;
|
221
|
+
|
222
|
+
/* Additional backslash sequences work in substitution strings: \& (last match), \+ (last
|
223
|
+
matched group), \` (string prior to match), \' (string after match), and \\ (a literal
|
224
|
+
backslash). */
|
225
|
+
|
226
|
+
/* scan the replacement text, looking for substitutions (\n) and \escapes. */
|
227
|
+
static VALUE
|
228
|
+
oregexp_get_replacement(pat, src_text, repl_text, region)
|
229
|
+
VALUE pat,
|
230
|
+
src_text,
|
231
|
+
repl_text;
|
232
|
+
OnigRegion * region;
|
233
|
+
{
|
234
|
+
ORegexp *oregexp;
|
235
|
+
VALUE ret;
|
236
|
+
int32_t replIdx = 0;
|
237
|
+
int32_t replacementLength = RSTRING(repl_text)->len;
|
238
|
+
UChar *replacementText = RSTRING(repl_text)->ptr;
|
239
|
+
UChar *replacementEnd = replacementText + (replacementLength-1);
|
240
|
+
long numDigits = 0;
|
241
|
+
long groupNum = 0, g_start, g_end;
|
242
|
+
OnigCodePoint digitC;
|
243
|
+
OnigEncoding enc;
|
244
|
+
const UChar * matchText;
|
245
|
+
long matchLen;
|
246
|
+
|
247
|
+
matchText = RSTRING(src_text)->ptr;
|
248
|
+
matchLen = RSTRING(src_text)->len;
|
249
|
+
Data_Get_Struct( pat, ORegexp, oregexp );
|
250
|
+
enc = onig_get_encoding( oregexp->reg );
|
251
|
+
|
252
|
+
ret = rb_str_buf_new(RSTRING(repl_text)->len);
|
253
|
+
|
254
|
+
while (replIdx < replacementLength) {
|
255
|
+
OnigCodePoint c = ONIGENC_MBC_TO_CODE(enc, replacementText+replIdx, replacementEnd);
|
256
|
+
int c_len =ONIGENC_MBC_ENC_LEN(enc, replacementText+replIdx) ;
|
257
|
+
replIdx += c_len;
|
258
|
+
if ( c != BACKSLASH) {
|
259
|
+
/* Common case, no substitution, no escaping, */
|
260
|
+
/* just copy the char to the dest buf. */
|
261
|
+
rb_str_buf_cat( ret, replacementText+replIdx-c_len, c_len);
|
262
|
+
continue;
|
263
|
+
}
|
264
|
+
if (replIdx >= replacementLength) {
|
265
|
+
rb_str_buf_cat(ret, replacementText+(replIdx-c_len), c_len);
|
266
|
+
break;
|
267
|
+
}
|
268
|
+
/* Pick up a capture group number if one follows. */
|
269
|
+
numDigits = 0;
|
270
|
+
groupNum = 0;
|
271
|
+
for (;;) {
|
272
|
+
if (replIdx >= replacementLength) {
|
273
|
+
break;
|
274
|
+
}
|
275
|
+
digitC = ONIGENC_MBC_TO_CODE(enc, replacementText+replIdx, replacementEnd);
|
276
|
+
c_len = ONIGENC_MBC_ENC_LEN(enc, replacementText+replIdx) ;
|
277
|
+
if ( ! ONIGENC_IS_CODE_DIGIT(enc, digitC) ) {
|
278
|
+
break;
|
279
|
+
}
|
280
|
+
replIdx += c_len;
|
281
|
+
groupNum=groupNum*10 + (digitC - '0');
|
282
|
+
numDigits++;
|
283
|
+
if (numDigits >= 2) { /* limit 99 groups */
|
284
|
+
break;
|
285
|
+
}
|
286
|
+
}
|
287
|
+
if (numDigits == 0) {
|
288
|
+
/* Additional backslash sequences work in substitution strings: \& (last match), \+ (last
|
289
|
+
matched group), \` (string prior to match), \' (string after match), and \\ (a literal
|
290
|
+
backslash). */
|
291
|
+
int p_len = c_len;
|
292
|
+
c = ONIGENC_MBC_TO_CODE(enc, replacementText+replIdx, replacementEnd);
|
293
|
+
c_len = ONIGENC_MBC_ENC_LEN(enc, replacementText+replIdx) ;
|
294
|
+
switch(c) {
|
295
|
+
case '&' : // matched substring
|
296
|
+
rb_str_buf_cat(ret, matchText+region->beg[0], region->end[0] - region->beg[0]);
|
297
|
+
replIdx += c_len;
|
298
|
+
break;
|
299
|
+
case '`' : // prematch
|
300
|
+
rb_str_buf_cat(ret, matchText, region->beg[0]);
|
301
|
+
replIdx += c_len;
|
302
|
+
break;
|
303
|
+
case '\'': // postmatch
|
304
|
+
rb_str_buf_cat(ret, matchText+region->end[0], matchLen - region->end[0]);
|
305
|
+
replIdx += c_len;
|
306
|
+
break;
|
307
|
+
case '\\': // literal backslash
|
308
|
+
// place single backslash
|
309
|
+
rb_str_buf_cat(ret, replacementText+replIdx, c_len);
|
310
|
+
replIdx += c_len;
|
311
|
+
break;
|
312
|
+
case '+': // last matched group
|
313
|
+
replIdx += c_len;
|
314
|
+
for(groupNum = region->num_regs; groupNum > 0; groupNum --) {
|
315
|
+
g_start = region->beg[ groupNum ];
|
316
|
+
g_end = region->end[ groupNum ];
|
317
|
+
if( g_start != -1 ) {
|
318
|
+
rb_str_buf_cat(ret, matchText+g_start, g_end-g_start);
|
319
|
+
break;
|
320
|
+
}
|
321
|
+
}
|
322
|
+
break;
|
323
|
+
|
324
|
+
default:
|
325
|
+
rb_str_buf_cat(ret, replacementText+(replIdx-p_len), p_len+c_len);
|
326
|
+
replIdx += c_len;
|
327
|
+
|
328
|
+
}
|
329
|
+
} else {
|
330
|
+
/* Finally, append the capture group data to the destination. */
|
331
|
+
if( groupNum < region->num_regs && region->beg[groupNum] >= 0 && region->end[groupNum]>= region->beg[groupNum] ) {
|
332
|
+
rb_str_buf_cat(ret, matchText+region->beg[groupNum], region->end[groupNum]-region->beg[groupNum]);
|
333
|
+
}
|
334
|
+
}
|
335
|
+
}
|
336
|
+
return ret;
|
337
|
+
}
|
338
|
+
|
339
|
+
static inline void
|
340
|
+
str_mod_check(s, p, len)
|
341
|
+
VALUE s;
|
342
|
+
char *p;
|
343
|
+
long len;
|
344
|
+
{
|
345
|
+
if (RSTRING(s)->ptr != p || RSTRING(s)->len != len) {
|
346
|
+
rb_raise(rb_eRuntimeError, "string modified");
|
347
|
+
}
|
348
|
+
}
|
349
|
+
|
350
|
+
static VALUE
|
351
|
+
oregexp_gsub(self, argc, argv, bang, once, region)
|
352
|
+
VALUE self; // pattern
|
353
|
+
int argc; // should be 1 if block given
|
354
|
+
VALUE *argv; // either replacement string
|
355
|
+
int bang;
|
356
|
+
int once;
|
357
|
+
OnigRegion *region;
|
358
|
+
{
|
359
|
+
VALUE repl;
|
360
|
+
long beg,
|
361
|
+
end,
|
362
|
+
prev_end;
|
363
|
+
int tainted = 0,
|
364
|
+
iter = 0;
|
365
|
+
|
366
|
+
VALUE buf, curr_repl, block_res;
|
367
|
+
ORegexp *oregexp;
|
368
|
+
|
369
|
+
if (argc == 1 && rb_block_given_p()) {
|
370
|
+
iter = 1;
|
371
|
+
} else if (argc == 2) {
|
372
|
+
repl = argv[1];
|
373
|
+
Check_Type(repl, T_STRING);
|
374
|
+
if (OBJ_TAINTED(argv[1]))
|
375
|
+
tainted = 1;
|
376
|
+
} else {
|
377
|
+
rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
|
378
|
+
}
|
379
|
+
Data_Get_Struct( self, ORegexp, oregexp );
|
380
|
+
|
381
|
+
VALUE string_str = StringValue( argv[0] );
|
382
|
+
UChar* str_ptr = RSTRING(string_str)->ptr;
|
383
|
+
int str_len = RSTRING(string_str)->len;
|
384
|
+
|
385
|
+
beg = onig_search(oregexp->reg, str_ptr, str_ptr + str_len, str_ptr, str_ptr + str_len, region, ONIG_OPTION_NONE);
|
386
|
+
|
387
|
+
if (beg < 0) {
|
388
|
+
/* no match */
|
389
|
+
if (bang)
|
390
|
+
return Qnil;
|
391
|
+
return rb_str_dup(string_str);
|
392
|
+
}
|
393
|
+
end = 0;
|
394
|
+
buf = rb_str_buf_new(str_len);
|
395
|
+
do {
|
396
|
+
prev_end = end;
|
397
|
+
beg = region->beg[0];
|
398
|
+
end = region->end[0];
|
399
|
+
rb_str_buf_cat(buf, str_ptr+prev_end, beg-prev_end);
|
400
|
+
if ( iter ) {
|
401
|
+
VALUE match_data = oregexp_make_match_data( oregexp, region, string_str );
|
402
|
+
rb_backref_set(match_data);
|
403
|
+
if( once )
|
404
|
+
block_res = rb_yield( match_data );
|
405
|
+
else {
|
406
|
+
VALUE match_string = rb_str_new( str_ptr+beg, end-beg);
|
407
|
+
block_res = rb_yield_values(2, match_string, match_data );
|
408
|
+
}
|
409
|
+
str_mod_check( string_str, str_ptr, str_len);
|
410
|
+
curr_repl = rb_obj_as_string(block_res);
|
411
|
+
} else {
|
412
|
+
curr_repl = oregexp_get_replacement(self, string_str, repl, region);
|
413
|
+
}
|
414
|
+
rb_str_append(buf, curr_repl);
|
415
|
+
if( once ) break;
|
416
|
+
// find next match
|
417
|
+
beg=onig_search(oregexp->reg, str_ptr, str_ptr + str_len,
|
418
|
+
str_ptr+end, str_ptr + str_len,
|
419
|
+
region, ONIG_OPTION_NONE);
|
420
|
+
} while ( beg >= 0);
|
421
|
+
rb_str_buf_cat( buf, str_ptr+end, str_len - end);
|
422
|
+
|
423
|
+
if(tainted)
|
424
|
+
OBJ_INFECT(buf, repl);
|
425
|
+
OBJ_INFECT(buf, string_str);
|
426
|
+
if (bang) {
|
427
|
+
rb_funcall(string_str, rb_intern("replace"), 1, buf);
|
428
|
+
return string_str;
|
429
|
+
} else {
|
430
|
+
return buf;
|
431
|
+
}
|
432
|
+
}
|
433
|
+
|
434
|
+
typedef struct gsub_packet_t {
|
435
|
+
VALUE self; // pattern
|
436
|
+
int argc; // should be 1 if block given
|
437
|
+
VALUE *argv; // either replacement string
|
438
|
+
int bang;
|
439
|
+
int once;
|
440
|
+
OnigRegion *region;
|
441
|
+
} gsub_packet;
|
442
|
+
static VALUE oregexp_packed_gsub( gsub_packet* args ) {
|
443
|
+
return oregexp_gsub(args->self, args->argc, args->argv, args->bang, args->once, args->region);
|
444
|
+
}
|
445
|
+
void oregexp_cleanup_region(OnigRegion * region){
|
446
|
+
onig_region_free(region, 1);
|
447
|
+
}
|
448
|
+
static VALUE oregexp_safe_gsub(self, argc, argv, bang, once)
|
449
|
+
VALUE self; // pattern
|
450
|
+
int argc; // should be 1 if block given
|
451
|
+
VALUE *argv; // either replacement string
|
452
|
+
int bang;
|
453
|
+
int once;
|
454
|
+
{
|
455
|
+
OnigRegion * region = onig_region_new();
|
456
|
+
gsub_packet call_args = {self, argc, argv, bang, once, region};
|
457
|
+
return rb_ensure( oregexp_packed_gsub, (VALUE)&call_args, oregexp_cleanup_region, (VALUE)region);
|
458
|
+
}
|
459
|
+
static VALUE oregexp_m_gsub(int argc, VALUE *argv, VALUE self) {
|
460
|
+
return oregexp_safe_gsub(self, argc, argv, 0, 0);
|
461
|
+
}
|
462
|
+
static VALUE oregexp_m_sub(int argc, VALUE *argv, VALUE self) {
|
463
|
+
return oregexp_safe_gsub(self, argc, argv, 0, 1);
|
464
|
+
}
|
465
|
+
|
466
|
+
static VALUE oregexp_m_gsub_bang(int argc, VALUE *argv, VALUE self) {
|
467
|
+
return oregexp_safe_gsub(self, argc, argv, 1, 0);
|
468
|
+
}
|
469
|
+
static VALUE oregexp_m_sub_bang(int argc, VALUE *argv, VALUE self) {
|
470
|
+
return oregexp_safe_gsub(self, argc, argv, 1, 1);
|
471
|
+
}
|
472
|
+
|
187
473
|
void Init_oregexp() {
|
188
474
|
mOniguruma = rb_define_module("Oniguruma");
|
189
475
|
VALUE cORegexp = rb_define_class_under(mOniguruma, "ORegexp", rb_cObject);
|
190
476
|
rb_define_alloc_func(cORegexp, oregexp_allocate);
|
191
477
|
rb_define_method( cORegexp, "initialize", oregexp_initialize, 2 );
|
192
478
|
rb_define_method( cORegexp, "match", oregexp_match, 1 );
|
193
|
-
|
479
|
+
rb_define_method( cORegexp, "gsub", oregexp_m_gsub, -1 );
|
480
|
+
rb_define_method( cORegexp, "sub", oregexp_m_sub, -1 );
|
481
|
+
rb_define_method( cORegexp, "gsub!", oregexp_m_gsub_bang, -1 );
|
482
|
+
rb_define_method( cORegexp, "sub!", oregexp_m_sub_bang, -1 );
|
194
483
|
}
|
data/lib/oniguruma.rb
CHANGED
@@ -254,7 +254,7 @@ module Oniguruma
|
|
254
254
|
return nil unless string
|
255
255
|
m = match( string )
|
256
256
|
return nil unless m
|
257
|
-
m.begin
|
257
|
+
m.begin(0)
|
258
258
|
end
|
259
259
|
|
260
260
|
# call-seq:
|
@@ -289,7 +289,7 @@ module Oniguruma
|
|
289
289
|
matches << m
|
290
290
|
positions << position
|
291
291
|
tmp_string = m.post_match
|
292
|
-
position += m.end
|
292
|
+
position += m.end(0)
|
293
293
|
#if m.end == m.begin
|
294
294
|
# tmp_string = tmp_string[1..-1]
|
295
295
|
# position += 1
|
@@ -304,51 +304,6 @@ module Oniguruma
|
|
304
304
|
nil
|
305
305
|
end
|
306
306
|
end
|
307
|
-
|
308
|
-
def sub string, replacement = nil
|
309
|
-
matches = match( string )
|
310
|
-
if matches
|
311
|
-
replacement = yield matches[0] unless replacement
|
312
|
-
string.sub( matches[0], replacement )
|
313
|
-
else
|
314
|
-
return string
|
315
|
-
end
|
316
|
-
end
|
317
|
-
|
318
|
-
def gsub string, replacement = nil
|
319
|
-
result = string
|
320
|
-
matches = match_all( string )
|
321
|
-
string_replace = replacement
|
322
|
-
if matches
|
323
|
-
matches.each do |m, p|
|
324
|
-
replacement = yield( m[0], m ) unless string_replace
|
325
|
-
result = result.sub( m[0], replacement )
|
326
|
-
end
|
327
|
-
end
|
328
|
-
result
|
329
|
-
end
|
330
|
-
|
331
|
-
def sub! string, replacement = nil
|
332
|
-
matches = match( string )
|
333
|
-
if matches
|
334
|
-
replacement = yield matches[0] unless replacement
|
335
|
-
string.sub!( matches[0], replacement )
|
336
|
-
else
|
337
|
-
return string
|
338
|
-
end
|
339
|
-
end
|
340
|
-
|
341
|
-
def gsub! string, replacement = nil
|
342
|
-
matches = match_all( string )
|
343
|
-
string_replace = replacement
|
344
|
-
if matches
|
345
|
-
matches.each do |m, p|
|
346
|
-
replacement = yield( m[0], m ) unless string_replace
|
347
|
-
string.sub!( m[0], replacement )
|
348
|
-
end
|
349
|
-
end
|
350
|
-
string
|
351
|
-
end
|
352
307
|
end
|
353
308
|
|
354
309
|
class MultiMatchData
|
@@ -371,11 +326,11 @@ module Oniguruma
|
|
371
326
|
end
|
372
327
|
|
373
328
|
def begin index
|
374
|
-
@matches[index].begin + @positions[index]
|
329
|
+
@matches[index].begin(0) + @positions[index]
|
375
330
|
end
|
376
331
|
|
377
332
|
def end index
|
378
|
-
@matches[index].end + @positions[index]
|
333
|
+
@matches[index].end(0) + @positions[index]
|
379
334
|
end
|
380
335
|
|
381
336
|
def length
|
@@ -402,90 +357,15 @@ module Oniguruma
|
|
402
357
|
end
|
403
358
|
end
|
404
359
|
|
405
|
-
class MatchData
|
406
|
-
def initialize( string, starts, ends, names )
|
407
|
-
@string = string
|
408
|
-
@starts = starts
|
409
|
-
@ends = ends
|
410
|
-
@matches = []
|
411
|
-
@starts.size.times do |i|
|
412
|
-
@matches << @string[@starts[i]...@ends[i]]
|
413
|
-
end
|
414
|
-
@match_count = @matches.size
|
415
|
-
@start_pos = 0
|
416
|
-
@names = names
|
417
|
-
end
|
418
|
-
|
419
|
-
def [] ( value1, value2 = nil )
|
420
|
-
unless value2
|
421
|
-
if index = to_index( value1 )
|
422
|
-
@matches[index]
|
423
|
-
else
|
424
|
-
nil
|
425
|
-
end
|
426
|
-
else
|
427
|
-
@matches[value1, value2]
|
428
|
-
end
|
429
|
-
end
|
430
|
-
|
431
|
-
def to_index name
|
432
|
-
if name.is_a? Symbol
|
433
|
-
@names[name]
|
434
|
-
else
|
435
|
-
name
|
436
|
-
end
|
437
|
-
end
|
438
|
-
|
439
|
-
def begin index = 0
|
440
|
-
@starts[to_index( index )]
|
441
|
-
end
|
442
|
-
|
443
|
-
def end index = 0
|
444
|
-
@ends[to_index( index )]
|
445
|
-
end
|
446
|
-
|
447
|
-
def captures
|
448
|
-
@matches[1..-1]
|
449
|
-
end
|
450
|
-
|
451
|
-
def length
|
452
|
-
@match_count
|
453
|
-
end
|
454
|
-
alias size length
|
455
|
-
|
456
|
-
def offset index = 0
|
457
|
-
[@starts[to_index( index )], @ends[to_index( index )]]
|
458
|
-
end
|
459
|
-
|
460
|
-
def post_match
|
461
|
-
@string[@ends[0], @string.length]
|
462
|
-
end
|
463
|
-
|
464
|
-
def pre_match
|
465
|
-
@string[0, @starts[0]]
|
466
|
-
end
|
467
|
-
|
468
|
-
def select &block
|
469
|
-
@matches.select( &block )
|
470
|
-
end
|
471
|
-
|
472
|
-
def string
|
473
|
-
@string.freeze
|
474
|
-
end
|
475
|
-
|
476
|
-
def to_a
|
477
|
-
@matches
|
478
|
-
end
|
479
|
-
|
480
|
-
def to_s
|
481
|
-
@matches[0]
|
482
|
-
end
|
483
|
-
|
484
|
-
def values_at *values
|
485
|
-
result = []
|
486
|
-
values.each { |v| result << @matches[v] }
|
487
|
-
result
|
488
|
-
end
|
489
|
-
end
|
490
360
|
end
|
491
|
-
|
361
|
+
class ::MatchData
|
362
|
+
alias old_aref :[]
|
363
|
+
def [](*idx)
|
364
|
+
if idx[0].is_a?(Symbol)
|
365
|
+
k = @named_captures && @named_captures[idx[0]]
|
366
|
+
k && old_aref(k)
|
367
|
+
else
|
368
|
+
old_aref(*idx)
|
369
|
+
end
|
370
|
+
end
|
371
|
+
end
|
data/test/test_oniguruma.rb
CHANGED
@@ -210,5 +210,68 @@ class MatchDataTestCase < Test::Unit::TestCase
|
|
210
210
|
assert_equal( ')', matches[:end] )
|
211
211
|
assert_equal( nil, matches[:inexistent])
|
212
212
|
end
|
213
|
+
|
214
|
+
def test_utf8_ignore_case
|
215
|
+
reg = Oniguruma::ORegexp.new( '([а-я])+', :options => Oniguruma::OPTION_IGNORECASE, :encoding => Oniguruma::ENCODING_UTF8 )
|
216
|
+
matches = reg.match("Text: Ехал Грека Через Реку")
|
217
|
+
assert_not_nil( matches )
|
218
|
+
assert_equal("Ехал", matches[0])
|
219
|
+
reg = Oniguruma::ORegexp.new( 'р(уби.*)', :options => Oniguruma::OPTION_IGNORECASE, :encoding => Oniguruma::ENCODING_UTF8 )
|
220
|
+
assert_equal("*убил бы*", reg.gsub("Руби", '*\1л бы*') )
|
221
|
+
end
|
222
|
+
|
223
|
+
def test_utf8_gsub
|
224
|
+
reg = Oniguruma::ORegexp.new( '([а-я])([а-я])([а-я]+)', :options => Oniguruma::OPTION_IGNORECASE, :encoding => Oniguruma::ENCODING_UTF8 )
|
225
|
+
new_str = reg.gsub("Text: Ехал Грека Через Реку") {|s,m| m[1]*2+m[2]*2+m[3] }
|
226
|
+
assert_equal("Text: ЕЕххал ГГррека ЧЧеерез РРееку", new_str)
|
227
|
+
end
|
228
|
+
|
229
|
+
def test_utf8_gsub2
|
230
|
+
reg = Oniguruma::ORegexp.new( '[а-я]', :options => Oniguruma::OPTION_IGNORECASE, :encoding => Oniguruma::ENCODING_UTF8 )
|
231
|
+
new_str = reg.gsub("Text: Ехал Грека Через Реку") {|s,m| s*2 }
|
232
|
+
assert_equal("Text: ЕЕххаалл ГГррееккаа ЧЧеерреезз РРееккуу", new_str)
|
233
|
+
end
|
213
234
|
|
235
|
+
def test_sub_compatibility
|
236
|
+
$x = "a.gif"
|
237
|
+
assert_equal("b.gif", $x.osub('.*\.([^\.]+)$', 'b.\1'))
|
238
|
+
assert_equal("\\.gif", $x.osub('.*\.([^\.]+)$', '\\.\1'))
|
239
|
+
assert_equal("gif", $x.osub('.*\.([^\.]+)$', '\1'))
|
240
|
+
assert_equal("", $x.osub('.*\.([^\.]+)$', '\2'))
|
241
|
+
assert_equal("ab", $x.osub('.*\.([^\.]+)$', 'a\2b'))
|
242
|
+
assert_equal("<a.gif>", $x.osub('.*\.([^\.]+)$', '<\&>'))
|
243
|
+
assert_equal("a.a.", $x.osub('(gif)', '\`') )
|
244
|
+
end
|
245
|
+
|
246
|
+
class ::String
|
247
|
+
def ogsub(*args)
|
248
|
+
Oniguruma::ORegexp.new(args.shift).gsub(self, *args)
|
249
|
+
end
|
250
|
+
def ogsub!(*args)
|
251
|
+
Oniguruma::ORegexp.new(args.shift).gsub!(self, *args)
|
252
|
+
end
|
253
|
+
def osub(re, *args)
|
254
|
+
Oniguruma::ORegexp.new( re ).sub(self, *args)
|
255
|
+
end
|
256
|
+
end
|
257
|
+
|
258
|
+
def test_gsub_compat
|
259
|
+
assert_equal("hello".ogsub('[aeiou]', '*') , "h*ll*")
|
260
|
+
assert_equal("hello".ogsub('([aeiou])', '<\1>') , "h<e>ll<o>")
|
261
|
+
i = 0
|
262
|
+
assert_equal("12345" , Oniguruma::ORegexp.new('.').gsub("hello") {|s,m| i+=1; i.to_s})
|
263
|
+
assert_equal("214365", Oniguruma::ORegexp.new('(.)(.)').gsub("123456") {|s,m| m[2] + m[1] })
|
264
|
+
a = "test"
|
265
|
+
a.ogsub!('t', a)
|
266
|
+
assert_equal("testestest", a)
|
267
|
+
end
|
268
|
+
|
269
|
+
def test_match_compat
|
270
|
+
t = Oniguruma::ORegexp.new('(.)(.)').gsub("123456") {|s,m| "#$2#$1" }
|
271
|
+
assert_equal("214365", t )
|
272
|
+
t = Oniguruma::ORegexp.new('([aeiou])').gsub("hello") {|s,m| "<#$1>" }
|
273
|
+
assert_equal( "h<e>ll<o>", t)
|
274
|
+
end
|
275
|
+
|
276
|
+
|
214
277
|
end
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
|
|
3
3
|
specification_version: 1
|
4
4
|
name: oniguruma
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.9.
|
7
|
-
date: 2007-03-
|
6
|
+
version: 0.9.1
|
7
|
+
date: 2007-03-25 00:00:00 +01:00
|
8
8
|
summary: Bindings for the oniguruma regular expression library
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -33,6 +33,7 @@ files:
|
|
33
33
|
- History.txt
|
34
34
|
- Manifest.txt
|
35
35
|
- README.txt
|
36
|
+
- Syntax.txt
|
36
37
|
- Rakefile
|
37
38
|
- lib/oniguruma.rb
|
38
39
|
- ext/oregexp.c
|