regexp_parser 1.7.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +80 -1
- data/README.md +24 -12
- data/lib/regexp_parser/expression.rb +10 -19
- data/lib/regexp_parser/expression/classes/group.rb +17 -2
- data/lib/regexp_parser/expression/classes/root.rb +4 -16
- data/lib/regexp_parser/expression/quantifier.rb +9 -0
- data/lib/regexp_parser/expression/sequence.rb +0 -10
- data/lib/regexp_parser/lexer.rb +6 -6
- data/lib/regexp_parser/parser.rb +45 -12
- data/lib/regexp_parser/scanner.rb +1305 -1193
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +194 -171
- data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
- data/lib/regexp_parser/version.rb +1 -1
- data/regexp_parser.gemspec +1 -1
- data/spec/expression/base_spec.rb +10 -0
- data/spec/expression/to_s_spec.rb +16 -0
- data/spec/lexer/delimiters_spec.rb +68 -0
- data/spec/lexer/literals_spec.rb +24 -49
- data/spec/parser/escapes_spec.rb +1 -1
- data/spec/parser/options_spec.rb +28 -0
- data/spec/parser/quantifiers_spec.rb +16 -0
- data/spec/parser/set/ranges_spec.rb +3 -3
- data/spec/scanner/delimiters_spec.rb +52 -0
- data/spec/scanner/errors_spec.rb +0 -1
- data/spec/scanner/escapes_spec.rb +10 -0
- data/spec/scanner/free_space_spec.rb +32 -0
- data/spec/scanner/literals_spec.rb +28 -38
- data/spec/scanner/options_spec.rb +36 -0
- data/spec/scanner/quantifiers_spec.rb +18 -13
- data/spec/scanner/sets_spec.rb +8 -2
- metadata +65 -61
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
@@ -10,17 +10,17 @@
|
|
10
10
|
# --------------------------------------------------------------------------
|
11
11
|
char_type := |*
|
12
12
|
char_type_char {
|
13
|
-
case text =
|
14
|
-
when '\d'; emit(:type, :digit, text
|
15
|
-
when '\D'; emit(:type, :nondigit, text
|
16
|
-
when '\h'; emit(:type, :hex, text
|
17
|
-
when '\H'; emit(:type, :nonhex, text
|
18
|
-
when '\s'; emit(:type, :space, text
|
19
|
-
when '\S'; emit(:type, :nonspace, text
|
20
|
-
when '\w'; emit(:type, :word, text
|
21
|
-
when '\W'; emit(:type, :nonword, text
|
22
|
-
when '\R'; emit(:type, :linebreak, text
|
23
|
-
when '\X'; emit(:type, :xgrapheme, text
|
13
|
+
case text = copy(data, ts-1, te)
|
14
|
+
when '\d'; emit(:type, :digit, text)
|
15
|
+
when '\D'; emit(:type, :nondigit, text)
|
16
|
+
when '\h'; emit(:type, :hex, text)
|
17
|
+
when '\H'; emit(:type, :nonhex, text)
|
18
|
+
when '\s'; emit(:type, :space, text)
|
19
|
+
when '\S'; emit(:type, :nonspace, text)
|
20
|
+
when '\w'; emit(:type, :word, text)
|
21
|
+
when '\W'; emit(:type, :nonword, text)
|
22
|
+
when '\R'; emit(:type, :linebreak, text)
|
23
|
+
when '\X'; emit(:type, :xgrapheme, text)
|
24
24
|
end
|
25
25
|
fret;
|
26
26
|
};
|
@@ -14,7 +14,7 @@
|
|
14
14
|
unicode_property := |*
|
15
15
|
|
16
16
|
property_sequence < eof(premature_property_end) {
|
17
|
-
text =
|
17
|
+
text = copy(data, ts-1, te)
|
18
18
|
type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
|
19
19
|
|
20
20
|
name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
|
@@ -22,7 +22,7 @@
|
|
22
22
|
token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
|
23
23
|
raise UnknownUnicodePropertyError.new(name) unless token
|
24
24
|
|
25
|
-
self.emit(type, token.to_sym, text
|
25
|
+
self.emit(type, token.to_sym, text)
|
26
26
|
|
27
27
|
fret;
|
28
28
|
};
|
@@ -21,7 +21,7 @@
|
|
21
21
|
set_close = ']';
|
22
22
|
brackets = set_open | set_close;
|
23
23
|
|
24
|
-
comment = ('#' . [^\n]* . '\n');
|
24
|
+
comment = ('#' . [^\n]* . '\n'?);
|
25
25
|
|
26
26
|
class_name_posix = 'alnum' | 'alpha' | 'blank' |
|
27
27
|
'cntrl' | 'digit' | 'graph' |
|
@@ -62,13 +62,17 @@
|
|
62
62
|
quantifier_possessive = '?+' | '*+' | '++';
|
63
63
|
quantifier_mode = '?' | '+';
|
64
64
|
|
65
|
-
|
66
|
-
|
65
|
+
quantity_exact = (digit+);
|
66
|
+
quantity_minimum = (digit+) . ',';
|
67
|
+
quantity_maximum = ',' . (digit+);
|
68
|
+
quantity_range = (digit+) . ',' . (digit+);
|
69
|
+
quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
|
70
|
+
quantity_maximum | quantity_range ) . range_close .
|
71
|
+
quantifier_mode?;
|
67
72
|
|
68
73
|
quantifiers = quantifier_greedy | quantifier_reluctant |
|
69
74
|
quantifier_possessive | quantifier_interval;
|
70
75
|
|
71
|
-
|
72
76
|
conditional = '(?(';
|
73
77
|
|
74
78
|
group_comment = '?#' . [^)]* . group_close;
|
@@ -114,7 +118,9 @@
|
|
114
118
|
curlies | parantheses | brackets |
|
115
119
|
line_anchor | quantifier_greedy;
|
116
120
|
|
117
|
-
|
121
|
+
literal_delimiters = ']' | '}';
|
122
|
+
|
123
|
+
ascii_print = ((0x20..0x7e) - meta_char - '#');
|
118
124
|
ascii_nonprint = (0x01..0x1f | 0x7f);
|
119
125
|
|
120
126
|
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
@@ -122,20 +128,20 @@
|
|
122
128
|
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
123
129
|
|
124
130
|
non_literal_escape = char_type_char | anchor_char | escaped_ascii |
|
125
|
-
|
131
|
+
keep_mark | [xucCM];
|
126
132
|
|
127
133
|
non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
|
128
134
|
multi_codepoint_char_type | [0-9cCM];
|
129
135
|
|
130
136
|
# EOF error, used where it can be detected
|
131
137
|
action premature_end_error {
|
132
|
-
text =
|
138
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
133
139
|
raise PrematureEndError.new( text )
|
134
140
|
}
|
135
141
|
|
136
142
|
# Invalid sequence error, used from sequences, like escapes and sets
|
137
143
|
action invalid_sequence_error {
|
138
|
-
text =
|
144
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
139
145
|
validation_error(:sequence, 'sequence', text)
|
140
146
|
}
|
141
147
|
|
@@ -150,7 +156,7 @@
|
|
150
156
|
# --------------------------------------------------------------------------
|
151
157
|
character_set := |*
|
152
158
|
set_close > (set_meta, 2) @set_closed {
|
153
|
-
emit(:set, :close,
|
159
|
+
emit(:set, :close, copy(data, ts, te))
|
154
160
|
if in_set?
|
155
161
|
fret;
|
156
162
|
else
|
@@ -159,8 +165,8 @@
|
|
159
165
|
};
|
160
166
|
|
161
167
|
'-]' @set_closed { # special case, emits two tokens
|
162
|
-
emit(:literal, :literal, copy(data, ts
|
163
|
-
emit(:set, :close, copy(data, ts+1
|
168
|
+
emit(:literal, :literal, copy(data, ts, te-1))
|
169
|
+
emit(:set, :close, copy(data, ts+1, te))
|
164
170
|
if in_set?
|
165
171
|
fret;
|
166
172
|
else
|
@@ -169,33 +175,33 @@
|
|
169
175
|
};
|
170
176
|
|
171
177
|
'-&&' { # special case, emits two tokens
|
172
|
-
emit(:literal, :literal, '-'
|
173
|
-
emit(:set, :intersection, '&&'
|
178
|
+
emit(:literal, :literal, '-')
|
179
|
+
emit(:set, :intersection, '&&')
|
174
180
|
};
|
175
181
|
|
176
182
|
'^' {
|
177
|
-
text =
|
183
|
+
text = copy(data, ts, te)
|
178
184
|
if tokens.last[1] == :open
|
179
|
-
emit(:set, :negate, text
|
185
|
+
emit(:set, :negate, text)
|
180
186
|
else
|
181
|
-
emit(:literal, :literal, text
|
187
|
+
emit(:literal, :literal, text)
|
182
188
|
end
|
183
189
|
};
|
184
190
|
|
185
191
|
'-' {
|
186
|
-
text =
|
192
|
+
text = copy(data, ts, te)
|
187
193
|
# ranges cant start with a subset or intersection/negation/range operator
|
188
194
|
if tokens.last[0] == :set
|
189
|
-
emit(:literal, :literal, text
|
195
|
+
emit(:literal, :literal, text)
|
190
196
|
else
|
191
|
-
emit(:set, :range, text
|
197
|
+
emit(:set, :range, text)
|
192
198
|
end
|
193
199
|
};
|
194
200
|
|
195
201
|
# Unlike ranges, intersections can start or end at set boundaries, whereupon
|
196
202
|
# they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
|
197
203
|
'&&' {
|
198
|
-
emit(:set, :intersection,
|
204
|
+
emit(:set, :intersection, copy(data, ts, te))
|
199
205
|
};
|
200
206
|
|
201
207
|
backslash {
|
@@ -203,12 +209,12 @@
|
|
203
209
|
};
|
204
210
|
|
205
211
|
set_open >(open_bracket, 1) >set_opened {
|
206
|
-
emit(:set, :open,
|
212
|
+
emit(:set, :open, copy(data, ts, te))
|
207
213
|
fcall character_set;
|
208
214
|
};
|
209
215
|
|
210
216
|
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
211
|
-
text =
|
217
|
+
text = copy(data, ts, te)
|
212
218
|
|
213
219
|
type = :posixclass
|
214
220
|
class_name = text[2..-3]
|
@@ -217,19 +223,19 @@
|
|
217
223
|
type = :nonposixclass
|
218
224
|
end
|
219
225
|
|
220
|
-
emit(type, class_name.to_sym, text
|
226
|
+
emit(type, class_name.to_sym, text)
|
221
227
|
};
|
222
228
|
|
223
229
|
collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
224
|
-
emit(:set, :collation,
|
230
|
+
emit(:set, :collation, copy(data, ts, te))
|
225
231
|
};
|
226
232
|
|
227
233
|
character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
228
|
-
emit(:set, :equivalent,
|
234
|
+
emit(:set, :equivalent, copy(data, ts, te))
|
229
235
|
};
|
230
236
|
|
231
237
|
meta_char > (set_meta, 1) {
|
232
|
-
emit(:literal, :literal,
|
238
|
+
emit(:literal, :literal, copy(data, ts, te))
|
233
239
|
};
|
234
240
|
|
235
241
|
any |
|
@@ -237,9 +243,8 @@
|
|
237
243
|
utf8_2_byte |
|
238
244
|
utf8_3_byte |
|
239
245
|
utf8_4_byte {
|
240
|
-
|
241
|
-
|
242
|
-
emit(:literal, :literal, char, *rest)
|
246
|
+
text = copy(data, ts, te)
|
247
|
+
emit(:literal, :literal, text)
|
243
248
|
};
|
244
249
|
*|;
|
245
250
|
|
@@ -247,7 +252,7 @@
|
|
247
252
|
# --------------------------------------------------------------------------
|
248
253
|
set_escape_sequence := |*
|
249
254
|
non_set_escape > (escaped_set_alpha, 2) {
|
250
|
-
emit(:escape, :literal,
|
255
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
251
256
|
fret;
|
252
257
|
};
|
253
258
|
|
@@ -263,33 +268,33 @@
|
|
263
268
|
# --------------------------------------------------------------------------
|
264
269
|
escape_sequence := |*
|
265
270
|
[1-9] {
|
266
|
-
text =
|
267
|
-
emit(:backref, :number, text
|
271
|
+
text = copy(data, ts-1, te)
|
272
|
+
emit(:backref, :number, text)
|
268
273
|
fret;
|
269
274
|
};
|
270
275
|
|
271
276
|
octal_sequence {
|
272
|
-
emit(:escape, :octal,
|
277
|
+
emit(:escape, :octal, copy(data, ts-1, te))
|
273
278
|
fret;
|
274
279
|
};
|
275
280
|
|
276
281
|
meta_char {
|
277
|
-
case text =
|
278
|
-
when '\.'; emit(:escape, :dot, text
|
279
|
-
when '\|'; emit(:escape, :alternation, text
|
280
|
-
when '\^'; emit(:escape, :bol, text
|
281
|
-
when '\$'; emit(:escape, :eol, text
|
282
|
-
when '\?'; emit(:escape, :zero_or_one, text
|
283
|
-
when '\*'; emit(:escape, :zero_or_more, text
|
284
|
-
when '\+'; emit(:escape, :one_or_more, text
|
285
|
-
when '\('; emit(:escape, :group_open, text
|
286
|
-
when '\)'; emit(:escape, :group_close, text
|
287
|
-
when '\{'; emit(:escape, :interval_open, text
|
288
|
-
when '\}'; emit(:escape, :interval_close, text
|
289
|
-
when '\['; emit(:escape, :set_open, text
|
290
|
-
when '\]'; emit(:escape, :set_close, text
|
282
|
+
case text = copy(data, ts-1, te)
|
283
|
+
when '\.'; emit(:escape, :dot, text)
|
284
|
+
when '\|'; emit(:escape, :alternation, text)
|
285
|
+
when '\^'; emit(:escape, :bol, text)
|
286
|
+
when '\$'; emit(:escape, :eol, text)
|
287
|
+
when '\?'; emit(:escape, :zero_or_one, text)
|
288
|
+
when '\*'; emit(:escape, :zero_or_more, text)
|
289
|
+
when '\+'; emit(:escape, :one_or_more, text)
|
290
|
+
when '\('; emit(:escape, :group_open, text)
|
291
|
+
when '\)'; emit(:escape, :group_close, text)
|
292
|
+
when '\{'; emit(:escape, :interval_open, text)
|
293
|
+
when '\}'; emit(:escape, :interval_close, text)
|
294
|
+
when '\['; emit(:escape, :set_open, text)
|
295
|
+
when '\]'; emit(:escape, :set_close, text)
|
291
296
|
when "\\\\";
|
292
|
-
emit(:escape, :backslash, text
|
297
|
+
emit(:escape, :backslash, text)
|
293
298
|
end
|
294
299
|
fret;
|
295
300
|
};
|
@@ -297,31 +302,31 @@
|
|
297
302
|
escaped_ascii > (escaped_alpha, 7) {
|
298
303
|
# \b is emitted as backspace only when inside a character set, otherwise
|
299
304
|
# it is a word boundary anchor. A syntax might "normalize" it if needed.
|
300
|
-
case text =
|
301
|
-
when '\a'; emit(:escape, :bell, text
|
302
|
-
when '\b'; emit(:escape, :backspace, text
|
303
|
-
when '\e'; emit(:escape, :escape, text
|
304
|
-
when '\f'; emit(:escape, :form_feed, text
|
305
|
-
when '\n'; emit(:escape, :newline, text
|
306
|
-
when '\r'; emit(:escape, :carriage, text
|
307
|
-
when '\t'; emit(:escape, :tab, text
|
308
|
-
when '\v'; emit(:escape, :vertical_tab, text
|
305
|
+
case text = copy(data, ts-1, te)
|
306
|
+
when '\a'; emit(:escape, :bell, text)
|
307
|
+
when '\b'; emit(:escape, :backspace, text)
|
308
|
+
when '\e'; emit(:escape, :escape, text)
|
309
|
+
when '\f'; emit(:escape, :form_feed, text)
|
310
|
+
when '\n'; emit(:escape, :newline, text)
|
311
|
+
when '\r'; emit(:escape, :carriage, text)
|
312
|
+
when '\t'; emit(:escape, :tab, text)
|
313
|
+
when '\v'; emit(:escape, :vertical_tab, text)
|
309
314
|
end
|
310
315
|
fret;
|
311
316
|
};
|
312
317
|
|
313
318
|
codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
|
314
|
-
text =
|
319
|
+
text = copy(data, ts-1, te)
|
315
320
|
if text[2].chr == '{'
|
316
|
-
emit(:escape, :codepoint_list, text
|
321
|
+
emit(:escape, :codepoint_list, text)
|
317
322
|
else
|
318
|
-
emit(:escape, :codepoint, text
|
323
|
+
emit(:escape, :codepoint, text)
|
319
324
|
end
|
320
325
|
fret;
|
321
326
|
};
|
322
327
|
|
323
328
|
hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
|
324
|
-
emit(:escape, :hex,
|
329
|
+
emit(:escape, :hex, copy(data, ts-1, te))
|
325
330
|
fret;
|
326
331
|
};
|
327
332
|
|
@@ -351,8 +356,11 @@
|
|
351
356
|
fcall unicode_property;
|
352
357
|
};
|
353
358
|
|
354
|
-
(any -- non_literal_escape)
|
355
|
-
|
359
|
+
(any -- non_literal_escape) |
|
360
|
+
utf8_2_byte |
|
361
|
+
utf8_3_byte |
|
362
|
+
utf8_4_byte > (escaped_alpha, 1) {
|
363
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
356
364
|
fret;
|
357
365
|
};
|
358
366
|
*|;
|
@@ -362,9 +370,9 @@
|
|
362
370
|
# --------------------------------------------------------------------------
|
363
371
|
conditional_expression := |*
|
364
372
|
group_lookup . ')' {
|
365
|
-
text =
|
366
|
-
emit(:conditional, :condition, text
|
367
|
-
emit(:conditional, :condition_close, ')'
|
373
|
+
text = copy(data, ts, te-1)
|
374
|
+
emit(:conditional, :condition, text)
|
375
|
+
emit(:conditional, :condition_close, ')')
|
368
376
|
};
|
369
377
|
|
370
378
|
any {
|
@@ -381,46 +389,50 @@
|
|
381
389
|
# Meta characters
|
382
390
|
# ------------------------------------------------------------------------
|
383
391
|
dot {
|
384
|
-
emit(:meta, :dot,
|
392
|
+
emit(:meta, :dot, copy(data, ts, te))
|
385
393
|
};
|
386
394
|
|
387
395
|
alternation {
|
388
396
|
if conditional_stack.last == group_depth
|
389
|
-
emit(:conditional, :separator,
|
397
|
+
emit(:conditional, :separator, copy(data, ts, te))
|
390
398
|
else
|
391
|
-
emit(:meta, :alternation,
|
399
|
+
emit(:meta, :alternation, copy(data, ts, te))
|
392
400
|
end
|
393
401
|
};
|
394
402
|
|
395
403
|
# Anchors
|
396
404
|
# ------------------------------------------------------------------------
|
397
405
|
beginning_of_line {
|
398
|
-
emit(:anchor, :bol,
|
406
|
+
emit(:anchor, :bol, copy(data, ts, te))
|
399
407
|
};
|
400
408
|
|
401
409
|
end_of_line {
|
402
|
-
emit(:anchor, :eol,
|
410
|
+
emit(:anchor, :eol, copy(data, ts, te))
|
403
411
|
};
|
404
412
|
|
405
413
|
backslash . keep_mark > (backslashed, 4) {
|
406
|
-
emit(:keep, :mark,
|
414
|
+
emit(:keep, :mark, copy(data, ts, te))
|
407
415
|
};
|
408
416
|
|
409
417
|
backslash . anchor_char > (backslashed, 3) {
|
410
|
-
case text =
|
411
|
-
when '\\A'; emit(:anchor, :bos, text
|
412
|
-
when '\\z'; emit(:anchor, :eos, text
|
413
|
-
when '\\Z'; emit(:anchor, :eos_ob_eol, text
|
414
|
-
when '\\b'; emit(:anchor, :word_boundary, text
|
415
|
-
when '\\B'; emit(:anchor, :nonword_boundary, text
|
416
|
-
when '\\G'; emit(:anchor, :match_start, text
|
418
|
+
case text = copy(data, ts, te)
|
419
|
+
when '\\A'; emit(:anchor, :bos, text)
|
420
|
+
when '\\z'; emit(:anchor, :eos, text)
|
421
|
+
when '\\Z'; emit(:anchor, :eos_ob_eol, text)
|
422
|
+
when '\\b'; emit(:anchor, :word_boundary, text)
|
423
|
+
when '\\B'; emit(:anchor, :nonword_boundary, text)
|
424
|
+
when '\\G'; emit(:anchor, :match_start, text)
|
417
425
|
end
|
418
426
|
};
|
419
427
|
|
428
|
+
literal_delimiters {
|
429
|
+
append_literal(data, ts, te)
|
430
|
+
};
|
431
|
+
|
420
432
|
# Character sets
|
421
433
|
# ------------------------------------------------------------------------
|
422
434
|
set_open >set_opened {
|
423
|
-
emit(:set, :open,
|
435
|
+
emit(:set, :open, copy(data, ts, te))
|
424
436
|
fcall character_set;
|
425
437
|
};
|
426
438
|
|
@@ -429,12 +441,12 @@
|
|
429
441
|
# (?(condition)Y|N) conditional expression
|
430
442
|
# ------------------------------------------------------------------------
|
431
443
|
conditional {
|
432
|
-
text =
|
444
|
+
text = copy(data, ts, te)
|
433
445
|
|
434
446
|
conditional_stack << group_depth
|
435
447
|
|
436
|
-
emit(:conditional, :open, text[0..-2]
|
437
|
-
emit(:conditional, :condition_open, '('
|
448
|
+
emit(:conditional, :open, text[0..-2])
|
449
|
+
emit(:conditional, :condition_open, '(')
|
438
450
|
fcall conditional_expression;
|
439
451
|
};
|
440
452
|
|
@@ -445,7 +457,7 @@
|
|
445
457
|
# correct closing count.
|
446
458
|
# ------------------------------------------------------------------------
|
447
459
|
group_open . group_comment $group_closed {
|
448
|
-
emit(:group, :comment,
|
460
|
+
emit(:group, :comment, copy(data, ts, te))
|
449
461
|
};
|
450
462
|
|
451
463
|
# Expression options:
|
@@ -460,11 +472,11 @@
|
|
460
472
|
# (?imxdau-imx:subexp) option on/off for subexp
|
461
473
|
# ------------------------------------------------------------------------
|
462
474
|
group_open . group_options >group_opened {
|
463
|
-
text =
|
475
|
+
text = copy(data, ts, te)
|
464
476
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
465
477
|
raise InvalidGroupOption.new($1 || "-#{$2}", text)
|
466
478
|
end
|
467
|
-
emit_options(text
|
479
|
+
emit_options(text)
|
468
480
|
};
|
469
481
|
|
470
482
|
# Assertions
|
@@ -474,11 +486,11 @@
|
|
474
486
|
# (?<!subexp) negative look-behind
|
475
487
|
# ------------------------------------------------------------------------
|
476
488
|
group_open . assertion_type >group_opened {
|
477
|
-
case text =
|
478
|
-
when '(?='; emit(:assertion, :lookahead, text
|
479
|
-
when '(?!'; emit(:assertion, :nlookahead, text
|
480
|
-
when '(?<='; emit(:assertion, :lookbehind, text
|
481
|
-
when '(?<!'; emit(:assertion, :nlookbehind, text
|
489
|
+
case text = copy(data, ts, te)
|
490
|
+
when '(?='; emit(:assertion, :lookahead, text)
|
491
|
+
when '(?!'; emit(:assertion, :nlookahead, text)
|
492
|
+
when '(?<='; emit(:assertion, :lookbehind, text)
|
493
|
+
when '(?<!'; emit(:assertion, :nlookbehind, text)
|
482
494
|
end
|
483
495
|
};
|
484
496
|
|
@@ -491,32 +503,32 @@
|
|
491
503
|
# (subexp) captured group
|
492
504
|
# ------------------------------------------------------------------------
|
493
505
|
group_open . group_type >group_opened {
|
494
|
-
case text =
|
495
|
-
when '(?:'; emit(:group, :passive, text
|
496
|
-
when '(?>'; emit(:group, :atomic, text
|
497
|
-
when '(?~'; emit(:group, :absence, text
|
506
|
+
case text = copy(data, ts, te)
|
507
|
+
when '(?:'; emit(:group, :passive, text)
|
508
|
+
when '(?>'; emit(:group, :atomic, text)
|
509
|
+
when '(?~'; emit(:group, :absence, text)
|
498
510
|
|
499
511
|
when /^\(\?(?:<>|'')/
|
500
512
|
validation_error(:group, 'named group', 'name is empty')
|
501
513
|
|
502
514
|
when /^\(\?<\w*>/
|
503
|
-
emit(:group, :named_ab, text
|
515
|
+
emit(:group, :named_ab, text)
|
504
516
|
|
505
517
|
when /^\(\?'\w*'/
|
506
|
-
emit(:group, :named_sq, text
|
518
|
+
emit(:group, :named_sq, text)
|
507
519
|
|
508
520
|
end
|
509
521
|
};
|
510
522
|
|
511
523
|
group_open @group_opened {
|
512
|
-
text =
|
513
|
-
emit(:group, :capture, text
|
524
|
+
text = copy(data, ts, te)
|
525
|
+
emit(:group, :capture, text)
|
514
526
|
};
|
515
527
|
|
516
528
|
group_close @group_closed {
|
517
529
|
if conditional_stack.last == group_depth + 1
|
518
530
|
conditional_stack.pop
|
519
|
-
emit(:conditional, :close,
|
531
|
+
emit(:conditional, :close, copy(data, ts, te))
|
520
532
|
else
|
521
533
|
if spacing_stack.length > 1 &&
|
522
534
|
spacing_stack.last[:depth] == group_depth + 1
|
@@ -524,7 +536,7 @@
|
|
524
536
|
self.free_spacing = spacing_stack.last[:free_spacing]
|
525
537
|
end
|
526
538
|
|
527
|
-
emit(:group, :close,
|
539
|
+
emit(:group, :close, copy(data, ts, te))
|
528
540
|
end
|
529
541
|
};
|
530
542
|
|
@@ -532,63 +544,63 @@
|
|
532
544
|
# Group backreference, named and numbered
|
533
545
|
# ------------------------------------------------------------------------
|
534
546
|
backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
|
535
|
-
case text =
|
547
|
+
case text = copy(data, ts, te)
|
536
548
|
when /^\\([gk])(<>|'')/ # angle brackets
|
537
549
|
validation_error(:backref, 'ref/call', 'ref ID is empty')
|
538
550
|
|
539
551
|
when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
|
540
552
|
if $1 == 'k'
|
541
|
-
emit(:backref, :name_ref_ab, text
|
553
|
+
emit(:backref, :name_ref_ab, text)
|
542
554
|
else
|
543
|
-
emit(:backref, :name_call_ab, text
|
555
|
+
emit(:backref, :name_call_ab, text)
|
544
556
|
end
|
545
557
|
|
546
558
|
when /^\\([gk])'[^\d+-]\w*'/ #single quotes
|
547
559
|
if $1 == 'k'
|
548
|
-
emit(:backref, :name_ref_sq, text
|
560
|
+
emit(:backref, :name_ref_sq, text)
|
549
561
|
else
|
550
|
-
emit(:backref, :name_call_sq, text
|
562
|
+
emit(:backref, :name_call_sq, text)
|
551
563
|
end
|
552
564
|
|
553
565
|
when /^\\([gk])<\d+>/ # angle-brackets
|
554
566
|
if $1 == 'k'
|
555
|
-
emit(:backref, :number_ref_ab, text
|
567
|
+
emit(:backref, :number_ref_ab, text)
|
556
568
|
else
|
557
|
-
emit(:backref, :number_call_ab, text
|
569
|
+
emit(:backref, :number_call_ab, text)
|
558
570
|
end
|
559
571
|
|
560
572
|
when /^\\([gk])'\d+'/ # single quotes
|
561
573
|
if $1 == 'k'
|
562
|
-
emit(:backref, :number_ref_sq, text
|
574
|
+
emit(:backref, :number_ref_sq, text)
|
563
575
|
else
|
564
|
-
emit(:backref, :number_call_sq, text
|
576
|
+
emit(:backref, :number_call_sq, text)
|
565
577
|
end
|
566
578
|
|
567
579
|
when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
|
568
580
|
if $1 == 'k'
|
569
|
-
emit(:backref, :number_rel_ref_ab, text
|
581
|
+
emit(:backref, :number_rel_ref_ab, text)
|
570
582
|
else
|
571
|
-
emit(:backref, :number_rel_call_ab, text
|
583
|
+
emit(:backref, :number_rel_call_ab, text)
|
572
584
|
end
|
573
585
|
|
574
586
|
when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
|
575
587
|
if $1 == 'k'
|
576
|
-
emit(:backref, :number_rel_ref_sq, text
|
588
|
+
emit(:backref, :number_rel_ref_sq, text)
|
577
589
|
else
|
578
|
-
emit(:backref, :number_rel_call_sq, text
|
590
|
+
emit(:backref, :number_rel_call_sq, text)
|
579
591
|
end
|
580
592
|
|
581
593
|
when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
|
582
|
-
emit(:backref, :name_recursion_ref_ab, text
|
594
|
+
emit(:backref, :name_recursion_ref_ab, text)
|
583
595
|
|
584
596
|
when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
|
585
|
-
emit(:backref, :name_recursion_ref_sq, text
|
597
|
+
emit(:backref, :name_recursion_ref_sq, text)
|
586
598
|
|
587
599
|
when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
|
588
|
-
emit(:backref, :number_recursion_ref_ab, text
|
600
|
+
emit(:backref, :number_recursion_ref_ab, text)
|
589
601
|
|
590
602
|
when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
|
591
|
-
emit(:backref, :number_recursion_ref_sq, text
|
603
|
+
emit(:backref, :number_recursion_ref_sq, text)
|
592
604
|
|
593
605
|
end
|
594
606
|
};
|
@@ -597,31 +609,36 @@
|
|
597
609
|
# Quantifiers
|
598
610
|
# ------------------------------------------------------------------------
|
599
611
|
zero_or_one {
|
600
|
-
case text =
|
601
|
-
when '?' ; emit(:quantifier, :zero_or_one, text
|
602
|
-
when '??'; emit(:quantifier, :zero_or_one_reluctant, text
|
603
|
-
when '?+'; emit(:quantifier, :zero_or_one_possessive, text
|
612
|
+
case text = copy(data, ts, te)
|
613
|
+
when '?' ; emit(:quantifier, :zero_or_one, text)
|
614
|
+
when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
|
615
|
+
when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
|
604
616
|
end
|
605
617
|
};
|
606
618
|
|
607
619
|
zero_or_more {
|
608
|
-
case text =
|
609
|
-
when '*' ; emit(:quantifier, :zero_or_more, text
|
610
|
-
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text
|
611
|
-
when '*+'; emit(:quantifier, :zero_or_more_possessive, text
|
620
|
+
case text = copy(data, ts, te)
|
621
|
+
when '*' ; emit(:quantifier, :zero_or_more, text)
|
622
|
+
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
|
623
|
+
when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
|
612
624
|
end
|
613
625
|
};
|
614
626
|
|
615
627
|
one_or_more {
|
616
|
-
case text =
|
617
|
-
when '+' ; emit(:quantifier, :one_or_more, text
|
618
|
-
when '+?'; emit(:quantifier, :one_or_more_reluctant, text
|
619
|
-
when '++'; emit(:quantifier, :one_or_more_possessive, text
|
628
|
+
case text = copy(data, ts, te)
|
629
|
+
when '+' ; emit(:quantifier, :one_or_more, text)
|
630
|
+
when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
|
631
|
+
when '++'; emit(:quantifier, :one_or_more_possessive, text)
|
620
632
|
end
|
621
633
|
};
|
622
634
|
|
623
|
-
quantifier_interval
|
624
|
-
emit(:quantifier, :interval,
|
635
|
+
quantifier_interval {
|
636
|
+
emit(:quantifier, :interval, copy(data, ts, te))
|
637
|
+
};
|
638
|
+
|
639
|
+
# Catch unmatched curly braces as literals
|
640
|
+
range_open {
|
641
|
+
append_literal(data, ts, te)
|
625
642
|
};
|
626
643
|
|
627
644
|
# Escaped sequences
|
@@ -632,15 +649,17 @@
|
|
632
649
|
|
633
650
|
comment {
|
634
651
|
if free_spacing
|
635
|
-
emit(:free_space, :comment,
|
652
|
+
emit(:free_space, :comment, copy(data, ts, te))
|
636
653
|
else
|
637
|
-
|
654
|
+
# consume only the pound sign (#) and backtrack to do regular scanning
|
655
|
+
append_literal(data, ts, ts + 1)
|
656
|
+
fexec ts + 1;
|
638
657
|
end
|
639
658
|
};
|
640
659
|
|
641
660
|
space+ {
|
642
661
|
if free_spacing
|
643
|
-
emit(:free_space, :whitespace,
|
662
|
+
emit(:free_space, :whitespace, copy(data, ts, te))
|
644
663
|
else
|
645
664
|
append_literal(data, ts, te)
|
646
665
|
end
|
@@ -722,21 +741,16 @@ class Regexp::Scanner
|
|
722
741
|
#
|
723
742
|
# This method may raise errors if a syntax error is encountered.
|
724
743
|
# --------------------------------------------------------------------------
|
725
|
-
def self.scan(input_object, &block)
|
726
|
-
new.scan(input_object, &block)
|
744
|
+
def self.scan(input_object, options: nil, &block)
|
745
|
+
new.scan(input_object, options: options, &block)
|
727
746
|
end
|
728
747
|
|
729
|
-
def scan(input_object, &block)
|
748
|
+
def scan(input_object, options: nil, &block)
|
730
749
|
self.literal = nil
|
731
750
|
stack = []
|
732
751
|
|
733
|
-
|
734
|
-
|
735
|
-
self.free_spacing = (input_object.options & Regexp::EXTENDED != 0)
|
736
|
-
else
|
737
|
-
input = input_object
|
738
|
-
self.free_spacing = false
|
739
|
-
end
|
752
|
+
input = input_object.is_a?(Regexp) ? input_object.source : input_object
|
753
|
+
self.free_spacing = free_spacing?(input_object, options)
|
740
754
|
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
741
755
|
|
742
756
|
data = input.unpack("c*") if input.is_a?(String)
|
@@ -748,6 +762,7 @@ class Regexp::Scanner
|
|
748
762
|
self.set_depth = 0
|
749
763
|
self.group_depth = 0
|
750
764
|
self.conditional_stack = []
|
765
|
+
self.char_pos = 0
|
751
766
|
|
752
767
|
%% write data;
|
753
768
|
%% write init;
|
@@ -757,7 +772,7 @@ class Regexp::Scanner
|
|
757
772
|
testEof = testEof
|
758
773
|
|
759
774
|
if cs == re_scanner_error
|
760
|
-
text =
|
775
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
761
776
|
raise ScannerError.new("Scan error at '#{text}'")
|
762
777
|
end
|
763
778
|
|
@@ -785,22 +800,41 @@ class Regexp::Scanner
|
|
785
800
|
end
|
786
801
|
|
787
802
|
# Emits an array with the details of the scanned pattern
|
788
|
-
def emit(type, token, text
|
803
|
+
def emit(type, token, text)
|
789
804
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
790
805
|
|
791
806
|
emit_literal if literal
|
792
807
|
|
808
|
+
# Ragel runs with byte-based indices (ts, te). These are of little value to
|
809
|
+
# end-users, so we keep track of char-based indices and emit those instead.
|
810
|
+
ts_char_pos = char_pos
|
811
|
+
te_char_pos = char_pos + text.length
|
812
|
+
|
793
813
|
if block
|
794
|
-
block.call type, token, text,
|
814
|
+
block.call type, token, text, ts_char_pos, te_char_pos
|
795
815
|
end
|
796
816
|
|
797
|
-
tokens << [type, token, text,
|
817
|
+
tokens << [type, token, text, ts_char_pos, te_char_pos]
|
818
|
+
|
819
|
+
self.char_pos = te_char_pos
|
798
820
|
end
|
799
821
|
|
800
822
|
private
|
801
823
|
|
802
824
|
attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
|
803
|
-
:group_depth, :set_depth, :conditional_stack
|
825
|
+
:group_depth, :set_depth, :conditional_stack, :char_pos
|
826
|
+
|
827
|
+
def free_spacing?(input_object, options)
|
828
|
+
if options && !input_object.is_a?(String)
|
829
|
+
raise ArgumentError, 'options cannot be supplied unless scanning a String'
|
830
|
+
end
|
831
|
+
|
832
|
+
options = input_object.options if input_object.is_a?(::Regexp)
|
833
|
+
|
834
|
+
return false unless options
|
835
|
+
|
836
|
+
options & Regexp::EXTENDED != 0
|
837
|
+
end
|
804
838
|
|
805
839
|
def in_group?
|
806
840
|
group_depth > 0
|
@@ -811,36 +845,25 @@ class Regexp::Scanner
|
|
811
845
|
end
|
812
846
|
|
813
847
|
# Copy from ts to te from data as text
|
814
|
-
def copy(data,
|
815
|
-
data[
|
816
|
-
end
|
817
|
-
|
818
|
-
# Copy from ts to te from data as text, returning an array with the text
|
819
|
-
# and the offsets used to copy it.
|
820
|
-
def text(data, ts, te, soff = 0)
|
821
|
-
[copy(data, ts-soff..te-1), ts-soff, te]
|
848
|
+
def copy(data, ts, te)
|
849
|
+
data[ts...te].pack('c*').force_encoding('utf-8')
|
822
850
|
end
|
823
851
|
|
824
852
|
# Appends one or more characters to the literal buffer, to be emitted later
|
825
|
-
# by a call to emit_literal.
|
853
|
+
# by a call to emit_literal.
|
826
854
|
def append_literal(data, ts, te)
|
827
855
|
self.literal = literal || []
|
828
|
-
literal <<
|
856
|
+
literal << copy(data, ts, te)
|
829
857
|
end
|
830
858
|
|
831
|
-
# Emits the literal run collected by calls to the append_literal method
|
832
|
-
# using the total start (ts) and end (te) offsets of the run.
|
859
|
+
# Emits the literal run collected by calls to the append_literal method.
|
833
860
|
def emit_literal
|
834
|
-
|
835
|
-
text = literal.map {|t| t[0]}.join
|
836
|
-
|
837
|
-
text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
|
838
|
-
|
861
|
+
text = literal.join
|
839
862
|
self.literal = nil
|
840
|
-
emit(:literal, :literal, text
|
863
|
+
emit(:literal, :literal, text)
|
841
864
|
end
|
842
865
|
|
843
|
-
def emit_options(text
|
866
|
+
def emit_options(text)
|
844
867
|
token = nil
|
845
868
|
|
846
869
|
# Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
|
@@ -866,14 +889,14 @@ class Regexp::Scanner
|
|
866
889
|
token = :options_switch
|
867
890
|
end
|
868
891
|
|
869
|
-
emit(:group, token, text
|
892
|
+
emit(:group, token, text)
|
870
893
|
end
|
871
894
|
|
872
895
|
def emit_meta_control_sequence(data, ts, te, token)
|
873
896
|
if data.last < 0x00 || data.last > 0x7F
|
874
897
|
validation_error(:sequence, 'escape', token.to_s)
|
875
898
|
end
|
876
|
-
emit(:escape, token,
|
899
|
+
emit(:escape, token, copy(data, ts-1, te))
|
877
900
|
end
|
878
901
|
|
879
902
|
# Centralizes and unifies the handling of validation related
|