jasherai-oniguruma 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/oniguruma.rb ADDED
@@ -0,0 +1,479 @@
1
+ require 'oregexp'
2
+
3
+ module Oniguruma
4
+ OPTION_NONE = 0
5
+ OPTION_IGNORECASE = 1
6
+ OPTION_EXTEND = (OPTION_IGNORECASE << 1)
7
+ OPTION_MULTILINE = (OPTION_EXTEND << 1)
8
+ OPTION_SINGLELINE = (OPTION_MULTILINE << 1)
9
+ OPTION_FIND_LONGEST = (OPTION_SINGLELINE << 1)
10
+ OPTION_FIND_NOT_EMPTY = (OPTION_FIND_LONGEST << 1)
11
+ OPTION_NEGATE_SINGLELINE = (OPTION_FIND_NOT_EMPTY << 1)
12
+ OPTION_DONT_CAPTURE_GROUP = (OPTION_NEGATE_SINGLELINE << 1)
13
+ OPTION_CAPTURE_GROUP = (OPTION_DONT_CAPTURE_GROUP << 1)
14
+ OPTION_NOTBOL = (OPTION_CAPTURE_GROUP << 1)
15
+ OPTION_NOTEOL = (OPTION_NOTBOL << 1)
16
+ OPTION_POSIX_REGION = (OPTION_NOTEOL << 1)
17
+ OPTION_MAXBIT = OPTION_POSIX_REGION
18
+ OPTION_DEFAULT = OPTION_NONE
19
+
20
+ OPTIONS_SHORTCUTS = { # maps one-letter option shortcuts (as used in the option string given to ORegexp.new) to OPTION_* bit flags
21
+ 'i' => OPTION_IGNORECASE,
22
+ 'x' => OPTION_EXTEND,
23
+ 'm' => OPTION_MULTILINE,
24
+ 's' => OPTION_SINGLELINE,
25
+ 'l' => OPTION_FIND_LONGEST,
26
+ 'E' => OPTION_FIND_NOT_EMPTY, # NOTE(review): the key 'E' is repeated below; in a hash literal the later entry wins, so this mapping is never reachable -- confirm which letter was intended
27
+ 'S' => OPTION_NEGATE_SINGLELINE,
28
+ 'G' => OPTION_DONT_CAPTURE_GROUP,
29
+ 'g' => OPTION_CAPTURE_GROUP,
30
+ 'B' => OPTION_NOTBOL,
31
+ 'E' => OPTION_NOTEOL, # NOTE(review): duplicates the 'E' key above and overrides it
32
+ }
33
+
34
+ SYNTAX_ASIS = 0
35
+ SYNTAX_POSIX_BASIC = 1
36
+ SYNTAX_POSIX_EXTENDED = 2
37
+ SYNTAX_EMACS = 3
38
+ SYNTAX_GREP = 4
39
+ SYNTAX_GNU_REGEX = 5
40
+ SYNTAX_JAVA = 6
41
+ SYNTAX_PERL = 7
42
+ SYNTAX_PERL_NG = 8
43
+ SYNTAX_RUBY = 9
44
+ SYNTAX_DEFAULT = 10
45
+
46
+ ENCODING_ASCII = 0
47
+ ENCODING_ISO_8859_1 = 1
48
+ ENCODING_ISO_8859_2 = 2
49
+ ENCODING_ISO_8859_3 = 3
50
+ ENCODING_ISO_8859_4 = 4
51
+ ENCODING_ISO_8859_5 = 5
52
+ ENCODING_ISO_8859_6 = 6
53
+ ENCODING_ISO_8859_7 = 7
54
+ ENCODING_ISO_8859_8 = 8
55
+ ENCODING_ISO_8859_9 = 9
56
+ ENCODING_ISO_8859_10 = 10
57
+ ENCODING_ISO_8859_11 = 11
58
+ ENCODING_ISO_8859_12 = 12
59
+ ENCODING_ISO_8859_13 = 13
60
+ ENCODING_ISO_8859_14 = 14
61
+ ENCODING_ISO_8859_15 = 15
62
+ ENCODING_ISO_8859_16 = 16
63
+ ENCODING_UTF8 = 17
64
+ ENCODING_UTF16_BE = 18
65
+ ENCODING_UTF16_LE = 19
66
+ ENCODING_UTF32_BE = 20
67
+ ENCODING_UTF32_LE = 21
68
+ ENCODING_EUC_JP = 22
69
+ ENCODING_EUC_TW = 23
70
+ ENCODING_EUC_KR = 24
71
+ ENCODING_EUC_CN = 25
72
+ ENCODING_SJIS = 26
73
+ ENCODING_KOI8 = 27
74
+ ENCODING_KOI8_R = 28
75
+ ENCODING_CP1251 = 29
76
+ ENCODING_BIG5 = 30
77
+ ENCODING_GB18030 = 31
78
+ ENCODING_UNDEF = 32
79
+
80
+
81
+ class ORegexp
82
+
83
+ class << self
84
+ # :stopdoc:
85
+ alias compile new
86
+ # :startdoc:
87
+
88
+ # call-seq:
89
+ # ORegexp.escape(str) => a_str
90
+ # ORegexp.quote(str) => a_str
91
+ #
92
+ # Escapes any characters that would have special meaning in a regular
93
+ # expression. Returns a new escaped string, or self if no characters are
94
+ # escaped. For any string,
95
+ # <code>Regexp.escape(<i>str</i>)=~<i>str</i></code> will be true.
96
+ #
97
+ # ORegexp.escape('\\*?{}.') #=> \\\\\*\?\{\}\.
98
+ #
99
+
100
+ def escape( *args )
101
+ Regexp.escape( *args )
102
+ end
103
+ # :stopdoc:
104
+ alias quote escape
105
+ # :startdoc:
106
+
107
+ # call-seq:
108
+ # ORegexp.last_match => matchdata
109
+ # ORegexp.last_match(fixnum) => str
110
+ #
111
+ # The first form returns the <code>MatchData</code> object generated by the
112
+ # last successful pattern match. The second form returns the nth field in this
113
+ # <code>MatchData</code> object.
114
+ #
115
+ # ORegexp.new( 'c(.)t' ) =~ 'cat' #=> 0
116
+ # ORegexp.last_match #=> #<MatchData:0x401b3d30>
117
+ # ORegexp.last_match(0) #=> "cat"
118
+ # ORegexp.last_match(1) #=> "a"
119
+ # ORegexp.last_match(2) #=> nil
120
+
121
+ def last_match( index = nil)
122
+ if index
123
+ @@last_match[index]
124
+ else
125
+ @@last_match
126
+ end
127
+ end
128
+ end
129
+
130
+ # :stopdoc:
131
+ alias old_initialize initialize
132
+ # :startdoc:
133
+
134
+ # call-seq:
135
+ # ORegexp.new( pattern, options_hash )
136
+ # ORegexp.new( pattern, option_str, encoding_str=nil, syntax_str=nil)
137
+ #
138
+ # Constructs a new regular expression from <i>pattern</i>, which is a
139
+ # <code>String</code>. The second parameter <i></i> may be a <code>Hash</code>
140
+ # of the form:
141
+ #
142
+ # <code>{ :options => option_value, :encoding => encoding_value, :syntax => syntax_value }</code>
143
+ #
144
+ # Where <code>option_value</code> is a bitwise <code>OR</code> of
145
+ # <code>Oniguruma::OPTION_XXX</code> constants; <code>encoding_value</code>
146
+ # is one of <code>Oniguruma::ENCODING_XXX</code> constants; and
147
+ # <code>syntax_value</code> is one of <code>Oniguruma::SYNTAX_XXX</code>
148
+ # constants.
149
+ #
150
+ # r1 = ORegexp.new('^a-z+:\\s+\w+') #=> /^a-z+:\s+\w+/
151
+ # r2 = ORegexp.new('cat', :options => OPTION_IGNORECASE ) #=> /cat/i
152
+ # r3 = ORegexp.new('dog', :options => OPTION_EXTEND ) #=> /dog/x
153
+ #
154
+ # #Accept java syntax on SJIS encoding:
155
+ # r4 = ORegexp.new('ape', :syntax => SYNTAX_JAVA, :encoding => ENCODING_SJIS) #=> /ape/
156
+ #
157
+ # Second form uses string shortcuts to set options and encoding:
158
+ # r = ORegexp.new('cat', 'i', 'utf8', 'java')
159
+
160
+ def initialize( pattern, *args )
161
+ defaults = { :options => OPTION_DEFAULT, :encoding => ENCODING_ASCII, :syntax => SYNTAX_DEFAULT}
162
+ if args[0].is_a?(String)
163
+ options = {}
164
+ option_str, encoding_str, syntax_str = *args
165
+ opt = 0
166
+ option_str.each_byte {|x| opt |= (OPTIONS_SHORTCUTS[x.chr] || 0) }
167
+ options[:options] = opt
168
+ if encoding_str && Oniguruma::const_defined?("ENCODING_#{encoding_str.upcase}")
169
+ options[:encoding] = Oniguruma::const_get("ENCODING_#{encoding_str.upcase}")
170
+ end
171
+ if syntax_str && Oniguruma::const_defined?("SYNTAX_#{syntax_str.upcase}")
172
+ options[:syntax] = Oniguruma::const_get("SYNTAX_#{syntax_str.upcase}")
173
+ end
174
+ else
175
+ options = args[0] || {}
176
+ end
177
+ old_initialize( pattern, defaults.merge( options ).freeze )
178
+ end
179
+
180
+ # call-seq:
181
+ # rxp == other_rxp => true or false
182
+ # rxp.eql?(other_rxp) => true or false
183
+ #
184
+ # Equality---Two regexps are equal if their patterns are identical, they have
185
+ # the same character set code, and their <code>#casefold?</code> values are the
186
+ # same.
187
+
188
+ def == regexp
189
+ @pattern == regexp.source && kcode == regexp.kcode && casefold? == regexp.casefold?
190
+ end
191
+ alias eql? ==
192
+
193
+ # call-seq:
194
+ # rxp.casefold? => true or false
195
+ #
196
+ # Returns the value of the case-insensitive flag.
197
+
198
+ def casefold?
199
+ (@options[:options] & OPTION_IGNORECASE) > 0
200
+ end
201
+
202
+ # call-seq:
203
+ # rxp.kcode => int
204
+ #
205
+ # Returns the character set code for the regexp.
206
+ def kcode
207
+ @options[:encoding]
208
+ end
209
+
210
+ # call-seq:
211
+ # rxp.options => fixnum
212
+ #
213
+ # Returns the set of bits corresponding to the options used when creating this
214
+ # ORegexp (see <code>ORegexp::new</code> for details). Note that additional bits
215
+ # may be set in the returned options: these are used internally by the regular
216
+ # expression code. These extra bits are ignored if the options are passed to
217
+ # <code>ORegexp::new</code>.
218
+ #
219
+ # Oniguruma::OPTION_IGNORECASE #=> 1
220
+ # Oniguruma::OPTION_EXTEND #=> 2
221
+ # Oniguruma::OPTION_MULTILINE #=> 4
222
+ #
223
+ # Regexp.new(r.source, :options => Oniguruma::OPTION_EXTEND ) #=> 2
224
+
225
+ def options
226
+ @options[:options]
227
+ end
228
+
229
+ # call-seq:
230
+ # rxp.to_s => str
231
+ #
232
+ # Returns a string containing the regular expression and its options (using the
233
+ # <code>(?xxx)yyy</code> notation). This string can be fed back in to
234
+ # <code>Regexp::new</code> to a regular expression with the same semantics as
235
+ # the original. (However, <code>Regexp#==</code> may not return true when
236
+ # comparing the two, as the source of the regular expression itself may
237
+ # differ, as the example shows). <code>Regexp#inspect</code> produces a
238
+ # generally more readable version of <i>rxp</i>.
239
+ #
240
+ # r1 = ORegexp.new( 'ab+c', :options => OPTION_IGNORECASE | OPTION_EXTEND ) #=> /ab+c/ix
241
+ # s1 = r1.to_s #=> "(?ix-m)ab+c"
241
+ # r2 = ORegexp.new(s1) #=> /(?ix-m)ab+c/
242
+ # r1 == r2 #=> false
243
+ # r1.source #=> "ab+c"
244
+ # r2.source #=> "(?ix-m)ab+c"
246
+
247
+ def to_s
248
+ opt_str = "(?"
249
+ opt_str += "i" if (@options[:options] & OPTION_IGNORECASE) > 0
250
+ opt_str += "m" if (@options[:options] & OPTION_MULTILINE) > 0
251
+ opt_str += "x" if (@options[:options] & OPTION_EXTEND) > 0
252
+ unless opt_str == "(?imx"
253
+ opt_str += "-"
254
+ opt_str += "i" if (@options[:options] & OPTION_IGNORECASE) == 0
255
+ opt_str += "m" if (@options[:options] & OPTION_MULTILINE) == 0
256
+ opt_str += "x" if (@options[:options] & OPTION_EXTEND) == 0
257
+ end
258
+ opt_str += ")"
259
+ opt_str + @pattern
260
+ end
261
+
262
+
263
+ # call-seq:
264
+ # rxp.inspect => string
265
+ #
266
+ # Returns a readable version of <i>rxp</i>
267
+ #
268
+ # ORegexp.new( 'cat', :options => OPTION_MULTILINE | OPTION_IGNORECASE ).inspect => /cat/im
269
+ # ORegexp.new( 'cat', :options => OPTION_MULTILINE | OPTION_IGNORECASE ).to_s => (?im-x)cat
270
+
271
+ def inspect
272
+ opt_str = ""
273
+ opt_str += "i" if (@options[:options] & OPTION_IGNORECASE) > 0
274
+ opt_str += "m" if (@options[:options] & OPTION_MULTILINE) > 0
275
+ opt_str += "x" if (@options[:options] & OPTION_EXTEND) > 0
276
+ "/" + @pattern + "/" + opt_str
277
+ end
278
+
279
+ # call-seq:
280
+ # rxp.source => str
281
+ #
282
+ # Returns the original string of the pattern.
283
+ #
284
+ # ORegexp.new( 'ab+c', 'ix' ).source #=> "ab+c"
285
+ def source
286
+ @pattern.freeze
287
+ end
288
+
289
+ alias match_all scan
290
+
291
+ end
292
+
293
+ end
294
+
295
+ class ::String
296
+ # Calls <code>Oniguruma::ORegexp#gsub</code> on this string.
297
+ def ogsub(*args)
298
+ Oniguruma::ORegexp.new(args.shift).gsub(self, *args)
299
+ end
300
+
301
+ # Calls <code>Oniguruma::ORegexp#gsub!</code> on this string.
302
+ def ogsub!(*args)
303
+ Oniguruma::ORegexp.new(args.shift).gsub!(self, *args)
304
+ end
305
+
306
+ # Calls <code>Oniguruma::ORegexp#sub</code> on this string.
307
+ def osub(re, *args)
308
+ Oniguruma::ORegexp.new( re ).sub(self, *args)
309
+ end
310
+
311
+ # Calls <code>Oniguruma::ORegexp#sub!</code> on this string.
312
+ def osub!(re, *args)
313
+ Oniguruma::ORegexp.new( re ).sub(self, *args)
314
+ end
315
+ end
316
+
317
+ class ::MatchData
318
+ # call-seq:
319
+ # to_index(symbol) => int or nil
320
+ #
321
+ # Returns the group index for the corresponding named group, or
322
+ # <code>nil</code> if the group does not exist.
323
+ #
324
+ # m = ORegexp.new( '(?<begin>^.*?)(?<middle>\d)(?<end>.*)' ).match("THX1138")
325
+ # m.to_index(:begin) #=> 1
326
+ # m.to_index(:unknown) #=> nil
327
+ def to_index symbol
328
+ @named_captures && @named_captures[symbol]
329
+ end
330
+
331
+ alias old_aref :[]
332
+
333
+ # call-seq:
334
+ # mtch[i] => obj
335
+ # mtch[start, length] => array
336
+ # mtch[range] => array
337
+ # mtch[symbol] => obj
338
+ #
339
+ # <code>MatchData</code> acts as an array, and may be
340
+ # accessed using the normal array indexing techniques. <i>mtch</i>[0] is
341
+ # equivalent to the special variable <code>$&</code>, and returns the entire
342
+ # matched string. <i>mtch</i>[1], <i>mtch</i>[2], and so on return the values
343
+ # of the matched backreferences (portions of the pattern between parentheses).
344
+ #
345
+ # m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
346
+ # m[0] #=> "HX1138"
347
+ # m[1, 2] #=> ["H", "X"]
348
+ # m[1..3] #=> ["H", "X", "113"]
349
+ # m[-3, 2] #=> ["X", "113"]
350
+ #
351
+ # If a symbol is used as index, the corresponding named group is returned,
352
+ # or <code>nil</code> if such a group does not exist.
353
+ #
354
+ # m = ORegexp.new( '(?<begin>^.*?)(?<middle>\d)(?<end>.*)' ).match("THX1138")
355
+ # m[:begin] #=> "THX"
356
+ # m[:middle] #=> "1"
357
+ # m[:end] #=> "138"
358
+
359
+ def [](*idx)
360
+ if idx[0].is_a?(Symbol)
361
+ k = to_index( idx[0] )
362
+ k && old_aref(k)
363
+ else
364
+ old_aref(*idx)
365
+ end
366
+ end
367
+
368
+ alias old_begin :begin
369
+
370
+ # call-seq:
371
+ # mtch.begin(n) => integer
372
+ # mtch.begin => integer
373
+ # mtch.begin(symbol) => integer
374
+ #
375
+ # Returns the offset of the start of the <em>n</em>th element of the match
376
+ # array in the string.
377
+ #
378
+ # m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
379
+ # m.begin(0) #=> 1
380
+ # m.begin(2) #=> 2
381
+ #
382
+ # If no arguments are given, the index of the
383
+ # first matching character is returned.
384
+ #
385
+ # m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
386
+ # m.begin #=> 1
387
+ #
388
+ # If the argument is a symbol, then the beginning of the
389
+ # corresponding named group is returned, or <code>nil</code>
390
+ # if the group does not exist.
391
+ #
392
+ # m = ORegexp.new( '(?<begin>^.*?)(?<middle>\d)(?<end>.*)' ).match("THX1138")
393
+ # m.begin(:middle) #=> 3
394
+
395
+ def begin(*idx)
396
+ if idx[0].is_a?(Symbol)
397
+ k = to_index( idx[0] )
398
+ k && old_begin(k)
399
+ elsif idx.empty?
400
+ old_begin( 0 )
401
+ else
402
+ old_begin(*idx)
403
+ end
404
+ end
405
+
406
+ alias old_end :end
407
+
408
+ # call-seq:
409
+ # mtch.end(n) => integer
410
+ #
411
+ # Returns the offset of the character immediately following the end of the
412
+ # <em>n</em>th element of the match array in the string.
413
+ #
414
+ # m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
415
+ # m.end(0) #=> 7
416
+ # m.end(2) #=> 3
417
+ #
418
+ # If no arguments are given, the index of the
419
+ # last matching character is returned.
420
+ #
421
+ # m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
422
+ # m.end #=> 7
423
+ #
424
+ # If the argument is a symbol, then the end of the
425
+ # corresponding named group is returned, or <code>nil</code>
426
+ # if the group does not exist.
427
+ #
428
+ # m = ORegexp.new( '(?<begin>^.*?)(?<middle>\d)(?<end>.*)' ).match("THX1138")
429
+ # m.end(:middle) #=> 4
430
+
431
+ def end(*idx)
432
+ if idx[0].is_a?(Symbol)
433
+ k = to_index( idx[0] )
434
+ k && old_end(k)
435
+ elsif idx.empty?
436
+ old_end( 0 )
437
+ else
438
+ old_end(*idx)
439
+ end
440
+ end
441
+
442
+ alias old_offset :offset
443
+
444
+ # call-seq:
445
+ # mtch.offset(n) => array
446
+ # mtch.offset => array
447
+ # mtch.offset(symbol) => array
448
+ #
449
+ # Returns a two-element array containing the beginning and ending offsets of
450
+ # the <em>n</em>th match.
451
+ #
452
+ # m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
453
+ # m.offset(0) #=> [1, 7]
454
+ # m.offset(4) #=> [6, 7]
455
+ #
456
+ # If no arguments are given, the offsets of the entire
457
+ # sequence are returned.
458
+ #
459
+ # m = ORegexp.new( '(.)(.)(\d+)(\d)' ).match("THX1138.")
460
+ # m.offset #=> [1, 7]
461
+ #
462
+ # If the argument is a symbol, then the offsets of the
463
+ # corresponding named group are returned, or <code>nil</code>
464
+ # if the group does not exist.
465
+ #
466
+ # m = ORegexp.new( '(?<begin>^.*?)(?<middle>\d)(?<end>.*)' ).match("THX1138")
467
+ # m.offset(:middle) #=> [3, 4]
468
+
469
+ def offset(*idx)
470
+ if idx[0].is_a?(Symbol)
471
+ k = to_index( idx[0] )
472
+ k && old_offset(k)
473
+ elsif idx.empty?
474
+ old_offset( 0 )
475
+ else
476
+ old_offset(*idx)
477
+ end
478
+ end
479
+ end