oniguruma 1.0.1-mswin32

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,479 @@
1
+ require 'oregexp'
2
+
3
+ module Oniguruma
4
# Bit flags for the :options entry accepted by ORegexp.new. Every flag is the
# previous one shifted left by one bit, so flags can be OR-ed together.
OPTION_NONE               = 0
OPTION_IGNORECASE         = 1
OPTION_EXTEND             = (OPTION_IGNORECASE << 1)
OPTION_MULTILINE          = (OPTION_EXTEND << 1)
OPTION_SINGLELINE         = (OPTION_MULTILINE << 1)
OPTION_FIND_LONGEST       = (OPTION_SINGLELINE << 1)
OPTION_FIND_NOT_EMPTY     = (OPTION_FIND_LONGEST << 1)
OPTION_NEGATE_SINGLELINE  = (OPTION_FIND_NOT_EMPTY << 1)
OPTION_DONT_CAPTURE_GROUP = (OPTION_NEGATE_SINGLELINE << 1)
OPTION_CAPTURE_GROUP      = (OPTION_DONT_CAPTURE_GROUP << 1)
OPTION_NOTBOL             = (OPTION_CAPTURE_GROUP << 1)
OPTION_NOTEOL             = (OPTION_NOTBOL << 1)
OPTION_POSIX_REGION       = (OPTION_NOTEOL << 1)
OPTION_MAXBIT             = OPTION_POSIX_REGION
OPTION_DEFAULT            = OPTION_NONE

# One-letter shortcuts used by the option-string form of ORegexp.new.
#
# 'E' used to be listed twice (first for OPTION_FIND_NOT_EMPTY, then for
# OPTION_NOTEOL). A Hash literal keeps only the last duplicate, so 'E' has
# always resolved to OPTION_NOTEOL. The shadowed entry is dropped so the table
# states what actually happens and Ruby no longer warns about a duplicated
# key. As a consequence OPTION_FIND_NOT_EMPTY has no shortcut letter.
OPTIONS_SHORTCUTS = {
  'i' => OPTION_IGNORECASE,
  'x' => OPTION_EXTEND,
  'm' => OPTION_MULTILINE,
  's' => OPTION_SINGLELINE,
  'l' => OPTION_FIND_LONGEST,
  'S' => OPTION_NEGATE_SINGLELINE,
  'G' => OPTION_DONT_CAPTURE_GROUP,
  'g' => OPTION_CAPTURE_GROUP,
  'B' => OPTION_NOTBOL,
  'E' => OPTION_NOTEOL
}
33
+
34
# Identifiers for the :syntax entry accepted by ORegexp.new. The string form
# of the constructor resolves a name such as 'java' to SYNTAX_JAVA.
# NOTE(review): the numeric values are presumably interpreted by the oregexp
# extension -- confirm before renumbering.
SYNTAX_ASIS           = 0
SYNTAX_POSIX_BASIC    = 1
SYNTAX_POSIX_EXTENDED = 2
SYNTAX_EMACS          = 3
SYNTAX_GREP           = 4
SYNTAX_GNU_REGEX      = 5
SYNTAX_JAVA           = 6
SYNTAX_PERL           = 7
SYNTAX_PERL_NG        = 8
SYNTAX_RUBY           = 9
SYNTAX_DEFAULT        = 10

# Identifiers for the :encoding entry accepted by ORegexp.new. The string form
# of the constructor resolves a name such as 'utf8' to ENCODING_UTF8.
# NOTE(review): the numeric values are presumably interpreted by the oregexp
# extension -- confirm before renumbering.
ENCODING_ASCII       = 0
ENCODING_ISO_8859_1  = 1
ENCODING_ISO_8859_2  = 2
ENCODING_ISO_8859_3  = 3
ENCODING_ISO_8859_4  = 4
ENCODING_ISO_8859_5  = 5
ENCODING_ISO_8859_6  = 6
ENCODING_ISO_8859_7  = 7
ENCODING_ISO_8859_8  = 8
ENCODING_ISO_8859_9  = 9
ENCODING_ISO_8859_10 = 10
ENCODING_ISO_8859_11 = 11
ENCODING_ISO_8859_12 = 12
ENCODING_ISO_8859_13 = 13
ENCODING_ISO_8859_14 = 14
ENCODING_ISO_8859_15 = 15
ENCODING_ISO_8859_16 = 16
ENCODING_UTF8        = 17
ENCODING_UTF16_BE    = 18
ENCODING_UTF16_LE    = 19
ENCODING_UTF32_BE    = 20
ENCODING_UTF32_LE    = 21
ENCODING_EUC_JP      = 22
ENCODING_EUC_TW      = 23
ENCODING_EUC_KR      = 24
ENCODING_EUC_CN      = 25
ENCODING_SJIS        = 26
ENCODING_KOI8        = 27
ENCODING_KOI8_R      = 28
ENCODING_CP1251      = 29
ENCODING_BIG5        = 30
ENCODING_GB18030     = 31
ENCODING_UNDEF       = 32
79
+
80
+
81
+ class ORegexp
82
+
83
+ class << self
84
+ # :stopdoc:
85
+ alias compile new
86
+ # :startdoc:
87
+
88
# call-seq:
#    ORegexp.escape(str)   => a_str
#    ORegexp.quote(str)    => a_str
#
# Escapes any characters that would have special meaning in a regular
# expression. This is a thin wrapper: the arguments are handed unchanged to
# <code>Regexp.escape</code> and its result is returned.
#
#    ORegexp.escape('\\*?{}.')   #=> \\\\\*\?\{\}\.
#
def escape(*args)
  Regexp.escape(*args)
end
103
+ # :stopdoc:
104
+ alias quote escape
105
+ # :startdoc:
106
+
107
# call-seq:
#    ORegexp.last_match           => matchdata
#    ORegexp.last_match(fixnum)   => str
#
# The first form returns the <code>MatchData</code> object generated by the
# last successful pattern match. The second form returns the nth field in this
# <code>MatchData</code> object. Both forms return <code>nil</code> when no
# match has been recorded, mirroring <code>Regexp.last_match</code>.
#
#    ORegexp.new( 'c(.)t' ) =~ 'cat'       #=> 0
#    ORegexp.last_match                    #=> #<MatchData:0x401b3d30>
#    ORegexp.last_match(0)                 #=> "cat"
#    ORegexp.last_match(1)                 #=> "a"
#    ORegexp.last_match(2)                 #=> nil
#
# NOTE(review): @@last_match is never assigned in this file; presumably the
# oregexp extension sets it after a match -- confirm.
def last_match(index = nil)
  # Reading an unset class variable raises NameError, so check first.
  match = defined?(@@last_match) ? @@last_match : nil
  return match unless index

  # A nil match (nothing matched yet) has no fields to index into.
  match && match[index]
end
128
+ end
129
+
130
+ # :stopdoc:
131
+ alias old_initialize initialize
132
+ # :startdoc:
133
+
134
# call-seq:
#     ORegexp.new( pattern, options_hash )
#     ORegexp.new( pattern, option_str, encoding_str=nil, syntax_str=nil)
#
# Constructs a new regular expression from <i>pattern</i>, which is a
# <code>String</code>. The second parameter may be a <code>Hash</code>
# of the form:
#
# <code>{ :options => option_value, :encoding => encoding_value, :syntax => syntax_value }</code>
#
# Where <code>option_value</code> is a bitwise <code>OR</code> of
# <code>Oniguruma::OPTION_XXX</code> constants; <code>encoding_value</code>
# is one of <code>Oniguruma::ENCODING_XXX</code> constants; and
# <code>syntax_value</code> is one of <code>Oniguruma::SYNTAX_XXX</code>
# constants. Entries that are left out default to
# <code>OPTION_DEFAULT</code>, <code>ENCODING_ASCII</code> and
# <code>SYNTAX_DEFAULT</code>.
#
#     r1 = ORegexp.new('^a-z+:\\s+\w+')                                            #=> /^a-z+:\s+\w+/
#     r2 = ORegexp.new('cat', :options => OPTION_IGNORECASE )                      #=> /cat/i
#     r3 = ORegexp.new('dog', :options => OPTION_EXTEND )                          #=> /dog/x
#
#     #Accept java syntax on SJIS encoding:
#     r4 = ORegexp.new('ape', :syntax  => SYNTAX_JAVA, :encoding => ENCODING_SJIS) #=> /ape/
#
# Second form uses string shortcuts to set options and encoding:
#     r = ORegexp.new('cat', 'i', 'utf8', 'java')
#
# In the second form each character of <i>option_str</i> is looked up in
# <code>OPTIONS_SHORTCUTS</code>; unknown letters are ignored. An encoding or
# syntax name is upcased and used only when a matching
# <code>Oniguruma::ENCODING_XXX</code> / <code>Oniguruma::SYNTAX_XXX</code>
# constant exists, otherwise the default stays in place.
def initialize(pattern, *args)
  settings = { :options => OPTION_DEFAULT, :encoding => ENCODING_ASCII, :syntax => SYNTAX_DEFAULT }

  if args[0].is_a?(String)
    option_str, encoding_str, syntax_str = *args

    # OR together the flag of every recognised shortcut letter.
    flags = 0
    option_str.each_byte { |byte| flags |= (OPTIONS_SHORTCUTS[byte.chr] || 0) }
    settings[:options] = flags

    if encoding_str
      encoding_const = "ENCODING_#{encoding_str.upcase}"
      settings[:encoding] = Oniguruma.const_get(encoding_const) if Oniguruma.const_defined?(encoding_const)
    end

    if syntax_str
      syntax_const = "SYNTAX_#{syntax_str.upcase}"
      settings[:syntax] = Oniguruma.const_get(syntax_const) if Oniguruma.const_defined?(syntax_const)
    end
  else
    # Hash form (or no second argument at all): caller values override defaults.
    settings = settings.merge(args[0] || {})
  end

  # The merged settings are frozen before being handed to the aliased original constructor.
  old_initialize(pattern, settings.freeze)
end
179
+
180
# call-seq:
#    rxp == other_rxp      => true or false
#    rxp.eql?(other_rxp)   => true or false
#
# Equality---Two regexps are equal if their patterns are identical, they have
# the same character set code, and their <code>#casefold?</code> values are the
# same. An operand that does not respond to <code>source</code>,
# <code>kcode</code> and <code>casefold?</code> is never equal (instead of
# raising <code>NoMethodError</code>).
def ==(regexp)
  comparable = regexp.respond_to?(:source) &&
               regexp.respond_to?(:kcode) &&
               regexp.respond_to?(:casefold?)
  return false unless comparable

  @pattern == regexp.source && kcode == regexp.kcode && casefold? == regexp.casefold?
end
alias eql? ==

# call-seq:
#    rxp.hash   => fixnum
#
# Hash code built from the same three values that <code>==</code> compares,
# so that regexps which are <code>eql?</code> also land on the same Hash key.
def hash
  [@pattern, kcode, casefold?].hash
end
192
+
193
# call-seq:
#    rxp.casefold?   => true or false
#
# Returns the value of the case-insensitive flag, i.e. whether the
# <code>OPTION_IGNORECASE</code> bit is set in this regexp's options.
def casefold?
  (@options[:options] & OPTION_IGNORECASE) != 0
end
201
+
202
# call-seq:
#    rxp.kcode        => int
#
# Returns the character set code for the regexp, i.e. the value stored under
# <code>:encoding</code> in its options (one of the
# <code>Oniguruma::ENCODING_XXX</code> constants when built through
# <code>ORegexp.new</code>).
def kcode
  @options[:encoding]
end
209
+
210
# call-seq:
#    rxp.options   => fixnum
#
# Returns the set of bits corresponding to the options used when creating this
# ORegexp (see <code>ORegexp::new</code> for details), i.e. the value stored
# under <code>:options</code>.
#
#    Oniguruma::OPTION_IGNORECASE                                              #=> 1
#    Oniguruma::OPTION_EXTEND                                                  #=> 2
#    Oniguruma::OPTION_MULTILINE                                               #=> 4
#
#    ORegexp.new( 'cat', :options => Oniguruma::OPTION_EXTEND ).options        #=> 2
def options
  @options[:options]
end
228
+
229
# call-seq:
#    rxp.to_s   => str
#
# Returns a string containing the regular expression and its options (using the
# <code>(?xxx:yyy)</code> notation). This string can be fed back in to
# <code>ORegexp::new</code> to get a regular expression with the same semantics
# as the original. (However, <code>ORegexp#==</code> may not return true when
# comparing the two, as the source of the regular expression itself may
# differ, as the example shows). <code>ORegexp#inspect</code> produces a
# generally more readable version of <i>rxp</i>.
#
# Only the ignorecase, multiline and extend flags are represented. The
# pattern is embedded verbatim: escaping it would turn every metacharacter
# into a literal and change what the resulting string matches.
#
#    r1 = ORegexp.new( 'ab+c', :options => OPTION_IGNORECASE | OPTION_EXTEND ) #=> /ab+c/ix
#    s1 = r1.to_s                                                              #=> "(?ix-m:ab+c)"
#    r2 = ORegexp.new(s1)                                                      #=> /(?ix-m:ab+c)/
#    r1 == r2                                                                  #=> false
#    r1.source                                                                 #=> "ab+c"
#    r2.source                                                                 #=> "(?ix-m:ab+c)"
def to_s
  bits = @options[:options]
  letters = [['i', OPTION_IGNORECASE], ['m', OPTION_MULTILINE], ['x', OPTION_EXTEND]]

  # Split the three letters into "switched on" and "switched off" groups.
  on, off = letters.partition { |_letter, flag| (bits & flag) > 0 }
  enabled  = on.map { |letter, _flag| letter }.join
  disabled = off.map { |letter, _flag| letter }.join

  # The '-' separator only appears when at least one flag is off.
  disabled = "-#{disabled}" unless disabled.empty?

  "(?#{enabled}#{disabled}:#{@pattern})"
end
261
+
262
+
263
# call-seq:
#    rxp.inspect   => string
#
# Returns a readable version of <i>rxp</i>: the pattern between slashes,
# followed by the letters of whichever of the ignorecase (i), multiline (m)
# and extend (x) flags are set. Only forward slashes that are not already
# escaped get a backslash; the rest of the pattern is shown as written.
#
#    ORegexp.new( 'cat', :options => OPTION_MULTILINE | OPTION_IGNORECASE ).inspect  => /cat/im
#    ORegexp.new( 'a/b' ).inspect                                                    => /a\/b/
def inspect
  bits = @options[:options]
  letters = [['i', OPTION_IGNORECASE], ['m', OPTION_MULTILINE], ['x', OPTION_EXTEND]]
  flags = letters.select { |_letter, flag| (bits & flag) > 0 }.map { |letter, _flag| letter }.join

  # Walk the pattern token by token: a backslash pair is copied as is (so an
  # already escaped "\/" is not escaped twice), a bare "/" becomes "\/".
  body = @pattern.gsub(%r{\\.|/}m) { |token| token == '/' ? '\\/' : token }

  "/#{body}/#{flags}"
end
278
+
279
# call-seq:
#    rxp.source   => str
#
# Returns the original string of the pattern. The stored string is frozen
# in place and that same object is returned.
#
#    ORegexp.new( 'ab+c', 'ix' ).source   #=> "ab+c"
def source
  @pattern.freeze
end
288
+
289
+ alias match_all scan
290
+
291
+ end
292
+
293
+ end
294
+
295
# Convenience wrappers on String: each builds an Oniguruma::ORegexp from its
# first argument and calls the matching substitution method with this string
# as the subject. Remaining arguments and any block are passed through.
class ::String
  # Calls <code>Oniguruma::ORegexp#gsub</code> on this string.
  def ogsub(*args, &block)
    pattern, *rest = args
    Oniguruma::ORegexp.new(pattern).gsub(self, *rest, &block)
  end

  # Calls <code>Oniguruma::ORegexp#gsub!</code> on this string.
  def ogsub!(*args, &block)
    pattern, *rest = args
    Oniguruma::ORegexp.new(pattern).gsub!(self, *rest, &block)
  end

  # Calls <code>Oniguruma::ORegexp#sub</code> on this string.
  def osub(re, *args, &block)
    Oniguruma::ORegexp.new(re).sub(self, *args, &block)
  end

  # Calls <code>Oniguruma::ORegexp#sub!</code> on this string.
  def osub!(re, *args, &block)
    # Must be the bang variant; this used to call #sub.
    Oniguruma::ORegexp.new(re).sub!(self, *args, &block)
  end
end
316
+
317
# Extends the core MatchData so that groups can also be addressed by Symbol
# (named groups) and so that #begin, #end and #offset default to group 0.
#
# NOTE(review): @named_captures is never assigned in this file; presumably the
# oregexp extension sets it on the matches it creates -- confirm. For a match
# without it, every Symbol lookup below returns nil. On Rubies whose MatchData
# natively accepts Symbols these overrides take precedence -- confirm that is
# intended.
class ::MatchData
  # call-seq:
  #    mtch.to_index(symbol)      => int or nil
  #
  # Returns the group index for the corresponding named group, or
  # <code>nil</code> if the group does not exist.
  #
  #    m = ORegexp.new( '(?<begin>^.*?)(?<middle>\d)(?<end>.*)' ).match("THX1138")
  #    m.to_index(:begin)    #=> 1
  #    m.to_index(:unknown)  #=> nil
  def to_index(symbol)
    @named_captures && @named_captures[symbol]
  end

  # Keep the built-in implementations reachable under old_* names. The guards
  # matter when this file is loaded twice: re-aliasing would make old_aref
  # (etc.) point at the wrappers below, which would then call themselves
  # forever.
  alias_method :old_aref,   :[]     unless method_defined?(:old_aref)
  alias_method :old_begin,  :begin  unless method_defined?(:old_begin)
  alias_method :old_end,    :end    unless method_defined?(:old_end)
  alias_method :old_offset, :offset unless method_defined?(:old_offset)

  # call-seq:
  #    mtch[i]               => obj
  #    mtch[start, length]   => array
  #    mtch[range]           => array
  #    mtch[symbol]          => obj
  #
  # <code>MatchData</code> acts as an array, and may be
  # accessed using the normal array indexing techniques.  <i>mtch</i>[0] is
  # equivalent to the special variable <code>$&</code>, and returns the entire
  # matched string.  <i>mtch</i>[1], <i>mtch</i>[2], and so on return the values
  # of the matched backreferences (portions of the pattern between parentheses).
  #
  #    m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
  #    m[0]       #=> "HX1138"
  #    m[1, 2]    #=> ["H", "X"]
  #    m[1..3]    #=> ["H", "X", "113"]
  #    m[-3, 2]   #=> ["X", "113"]
  #
  # If a symbol is used as index, the corresponding named group is returned,
  # or <code>nil</code> if such a group does not exist.
  #
  #    m = ORegexp.new( '(?<begin>^.*?)(?<middle>\d)(?<end>.*)' ).match("THX1138")
  #    m[:begin]  #=> "THX"
  #    m[:middle] #=> "1"
  #    m[:end]    #=> "138"
  def [](*idx)
    return old_aref(*idx) unless idx[0].is_a?(Symbol)

    group = to_index(idx[0])
    group && old_aref(group)
  end

  # call-seq:
  #    mtch.begin(n)        => integer
  #    mtch.begin           => integer
  #    mtch.begin(symbol)   => integer
  #
  # Returns the offset of the start of the <em>n</em>th element of the match
  # array in the string.
  #
  #    m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
  #    m.begin(0)   #=> 1
  #    m.begin(2)   #=> 2
  #
  # If no arguments are given, the index of the
  # first matching character is returned.
  #
  #    m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
  #    m.begin      #=> 1
  #
  # If the argument is a symbol, then the beginning of the
  # corresponding named group is returned, or <code>nil</code>
  # if the group does not exist.
  #
  #    m = ORegexp.new( '(?<begin>^.*?)(?<middle>\d)(?<end>.*)' ).match("THX1138")
  #    m.begin(:middle) #=> 3
  def begin(*idx)
    return old_begin(0) if idx.empty?
    return old_begin(*idx) unless idx[0].is_a?(Symbol)

    group = to_index(idx[0])
    group && old_begin(group)
  end

  # call-seq:
  #    mtch.end(n)        => integer
  #    mtch.end           => integer
  #    mtch.end(symbol)   => integer
  #
  # Returns the offset of the character immediately following the end of the
  # <em>n</em>th element of the match array in the string.
  #
  #    m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
  #    m.end(0)   #=> 7
  #    m.end(2)   #=> 3
  #
  # If no arguments are given, the offset of the character immediately
  # following the whole match is returned.
  #
  #    m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
  #    m.end      #=> 7
  #
  # If the argument is a symbol, then the end of the
  # corresponding named group is returned, or <code>nil</code>
  # if the group does not exist.
  #
  #    m = ORegexp.new( '(?<begin>^.*?)(?<middle>\d)(?<end>.*)' ).match("THX1138")
  #    m.end(:middle) #=> 4
  def end(*idx)
    return old_end(0) if idx.empty?
    return old_end(*idx) unless idx[0].is_a?(Symbol)

    group = to_index(idx[0])
    group && old_end(group)
  end

  # call-seq:
  #    mtch.offset(n)        => array
  #    mtch.offset           => array
  #    mtch.offset(symbol)   => array
  #
  # Returns a two-element array containing the beginning and ending offsets of
  # the <em>n</em>th match.
  #
  #    m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
  #    m.offset(0)   #=> [1, 7]
  #    m.offset(4)   #=> [6, 7]
  #
  # If no arguments are given, the offsets of the entire
  # sequence are returned.
  #
  #    m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
  #    m.offset      #=> [1, 7]
  #
  # If the argument is a symbol, then the offsets of the
  # corresponding named group are returned, or <code>nil</code>
  # if the group does not exist.
  #
  #    m = ORegexp.new( '(?<begin>^.*?)(?<middle>\d)(?<end>.*)' ).match("THX1138")
  #    m.offset(:middle) #=> [3, 4]
  def offset(*idx)
    return old_offset(0) if idx.empty?
    return old_offset(*idx) unless idx[0].is_a?(Symbol)

    group = to_index(idx[0])
    group && old_offset(group)
  end
end