regexp_parser 2.6.0 → 2.9.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +5 -5
- data/LICENSE +1 -1
- data/lib/regexp_parser/expression/base.rb +0 -7
- data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
- data/lib/regexp_parser/expression/classes/backreference.rb +17 -3
- data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -7
- data/lib/regexp_parser/expression/classes/character_set.rb +4 -8
- data/lib/regexp_parser/expression/classes/conditional.rb +2 -6
- data/lib/regexp_parser/expression/classes/escape_sequence.rb +3 -1
- data/lib/regexp_parser/expression/classes/free_space.rb +3 -1
- data/lib/regexp_parser/expression/classes/group.rb +0 -22
- data/lib/regexp_parser/expression/classes/keep.rb +1 -1
- data/lib/regexp_parser/expression/classes/posix_class.rb +5 -5
- data/lib/regexp_parser/expression/classes/unicode_property.rb +11 -11
- data/lib/regexp_parser/expression/methods/construct.rb +2 -4
- data/lib/regexp_parser/expression/methods/match_length.rb +8 -4
- data/lib/regexp_parser/expression/methods/negative.rb +20 -0
- data/lib/regexp_parser/expression/methods/parts.rb +23 -0
- data/lib/regexp_parser/expression/methods/printing.rb +26 -0
- data/lib/regexp_parser/expression/methods/tests.rb +40 -3
- data/lib/regexp_parser/expression/methods/traverse.rb +35 -19
- data/lib/regexp_parser/expression/quantifier.rb +30 -17
- data/lib/regexp_parser/expression/sequence.rb +5 -10
- data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
- data/lib/regexp_parser/expression/shared.rb +37 -20
- data/lib/regexp_parser/expression/subexpression.rb +20 -15
- data/lib/regexp_parser/expression.rb +34 -31
- data/lib/regexp_parser/lexer.rb +76 -36
- data/lib/regexp_parser/parser.rb +101 -100
- data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
- data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
- data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
- data/lib/regexp_parser/scanner/properties/long.csv +29 -0
- data/lib/regexp_parser/scanner/properties/short.csv +3 -0
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +101 -172
- data/lib/regexp_parser/scanner.rb +1132 -1283
- data/lib/regexp_parser/syntax/token/backreference.rb +3 -0
- data/lib/regexp_parser/syntax/token/character_set.rb +3 -0
- data/lib/regexp_parser/syntax/token/escape.rb +3 -1
- data/lib/regexp_parser/syntax/token/meta.rb +9 -2
- data/lib/regexp_parser/syntax/token/unicode_property.rb +35 -1
- data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
- data/lib/regexp_parser/syntax/token.rb +13 -13
- data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
- data/lib/regexp_parser/syntax/versions.rb +3 -1
- data/lib/regexp_parser/syntax.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +6 -6
- data/regexp_parser.gemspec +5 -5
- metadata +14 -8
- data/CHANGELOG.md +0 -601
- data/README.md +0 -503
@@ -30,11 +30,6 @@
|
|
30
30
|
|
31
31
|
class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
|
32
32
|
|
33
|
-
|
34
|
-
# these are not supported in ruby at the moment
|
35
|
-
collating_sequence = '[.' . (alpha | [\-])+ . '.]';
|
36
|
-
character_equivalent = '[=' . alpha . '=]';
|
37
|
-
|
38
33
|
line_anchor = beginning_of_line | end_of_line;
|
39
34
|
anchor_char = [AbBzZG];
|
40
35
|
|
@@ -59,9 +54,6 @@
|
|
59
54
|
one_or_more = '+' | '+?' | '++';
|
60
55
|
|
61
56
|
quantifier_greedy = '?' | '*' | '+';
|
62
|
-
quantifier_reluctant = '??' | '*?' | '+?';
|
63
|
-
quantifier_possessive = '?+' | '*+' | '++';
|
64
|
-
quantifier_mode = '?' | '+';
|
65
57
|
|
66
58
|
quantity_exact = (digit+);
|
67
59
|
quantity_minimum = (digit+) . ',';
|
@@ -70,9 +62,6 @@
|
|
70
62
|
quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
|
71
63
|
quantity_maximum | quantity_range ) . range_close;
|
72
64
|
|
73
|
-
quantifiers = quantifier_greedy | quantifier_reluctant |
|
74
|
-
quantifier_possessive | quantifier_interval;
|
75
|
-
|
76
65
|
conditional = '(?(';
|
77
66
|
|
78
67
|
group_comment = '?#' . [^)]* . group_close;
|
@@ -89,10 +78,9 @@
|
|
89
78
|
# try to treat every other group head as options group, like Ruby
|
90
79
|
group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
|
91
80
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
group_number = '-'? . [1-9] . [0-9]*;
|
81
|
+
group_name_id_ab = ([^!0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
|
82
|
+
group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
|
83
|
+
group_number = '-'? . [0-9]+;
|
96
84
|
group_level = [+\-] . [0-9]+;
|
97
85
|
|
98
86
|
group_name = ('<' . group_name_id_ab? . '>') |
|
@@ -101,15 +89,11 @@
|
|
101
89
|
|
102
90
|
group_named = ('?' . group_name );
|
103
91
|
|
104
|
-
|
105
|
-
|
106
|
-
group_name_call = 'g' . (('<' . group_name_id_ab? . group_level? '>') |
|
107
|
-
("'" . group_name_id_sq? . group_level? "'"));
|
92
|
+
group_ref_body = (('<' . (group_name_id_ab? | group_number) . group_level? '>') |
|
93
|
+
("'" . (group_name_id_sq? | group_number) . group_level? "'"));
|
108
94
|
|
109
|
-
|
110
|
-
|
111
|
-
group_number_call = 'g' . (('<' . ((group_number . group_level?) | '0') '>') |
|
112
|
-
("'" . ((group_number . group_level?) | '0') "'"));
|
95
|
+
group_ref = 'k' . group_ref_body;
|
96
|
+
group_call = 'g' . group_ref_body;
|
113
97
|
|
114
98
|
group_type = group_atomic | group_passive | group_absence | group_named;
|
115
99
|
|
@@ -132,20 +116,21 @@
|
|
132
116
|
keep_mark | sequence_char;
|
133
117
|
|
134
118
|
# escapes that also work within a character set
|
135
|
-
set_escape = backslash | brackets | escaped_ascii |
|
119
|
+
set_escape = backslash | brackets | escaped_ascii |
|
120
|
+
octal_sequence | property_char |
|
136
121
|
sequence_char | single_codepoint_char_type;
|
137
122
|
|
138
123
|
|
139
124
|
# EOF error, used where it can be detected
|
140
125
|
action premature_end_error {
|
141
126
|
text = copy(data, ts ? ts-1 : 0, -1)
|
142
|
-
raise PrematureEndError.new(
|
127
|
+
raise PrematureEndError.new(text)
|
143
128
|
}
|
144
129
|
|
145
130
|
# Invalid sequence error, used from sequences, like escapes and sets
|
146
131
|
action invalid_sequence_error {
|
147
132
|
text = copy(data, ts ? ts-1 : 0, -1)
|
148
|
-
|
133
|
+
raise ValidationError.for(:sequence, 'sequence', text)
|
149
134
|
}
|
150
135
|
|
151
136
|
# group (nesting) and set open/close actions
|
@@ -168,8 +153,8 @@
|
|
168
153
|
};
|
169
154
|
|
170
155
|
'-]' @set_closed { # special case, emits two tokens
|
171
|
-
emit(:literal, :literal,
|
172
|
-
emit(:set, :close,
|
156
|
+
emit(:literal, :literal, '-')
|
157
|
+
emit(:set, :close, ']')
|
173
158
|
if in_set?
|
174
159
|
fret;
|
175
160
|
else
|
@@ -183,28 +168,27 @@
|
|
183
168
|
};
|
184
169
|
|
185
170
|
'^' {
|
186
|
-
|
187
|
-
|
188
|
-
emit(:set, :negate, text)
|
171
|
+
if prev_token[1] == :open
|
172
|
+
emit(:set, :negate, '^')
|
189
173
|
else
|
190
|
-
emit(:literal, :literal,
|
174
|
+
emit(:literal, :literal, '^')
|
191
175
|
end
|
192
176
|
};
|
193
177
|
|
194
178
|
'-' {
|
195
|
-
|
196
|
-
#
|
197
|
-
if
|
198
|
-
emit(:literal, :literal,
|
179
|
+
# ranges can't start with the opening bracket, a subset, or
|
180
|
+
# intersection/negation/range operators
|
181
|
+
if prev_token[0] == :set
|
182
|
+
emit(:literal, :literal, '-')
|
199
183
|
else
|
200
|
-
emit(:set, :range,
|
184
|
+
emit(:set, :range, '-')
|
201
185
|
end
|
202
186
|
};
|
203
187
|
|
204
188
|
# Unlike ranges, intersections can start or end at set boundaries, whereupon
|
205
189
|
# they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
|
206
190
|
'&&' {
|
207
|
-
emit(:set, :intersection,
|
191
|
+
emit(:set, :intersection, '&&')
|
208
192
|
};
|
209
193
|
|
210
194
|
backslash {
|
@@ -212,7 +196,7 @@
|
|
212
196
|
};
|
213
197
|
|
214
198
|
set_open >(open_bracket, 1) >set_opened {
|
215
|
-
emit(:set, :open,
|
199
|
+
emit(:set, :open, '[')
|
216
200
|
fcall character_set;
|
217
201
|
};
|
218
202
|
|
@@ -227,20 +211,12 @@
|
|
227
211
|
end
|
228
212
|
|
229
213
|
unless self.class.posix_classes.include?(class_name)
|
230
|
-
|
214
|
+
raise ValidationError.for(:posix_class, text)
|
231
215
|
end
|
232
216
|
|
233
217
|
emit(type, class_name.to_sym, text)
|
234
218
|
};
|
235
219
|
|
236
|
-
# These are not supported in ruby at the moment. Enable them if they are.
|
237
|
-
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
238
|
-
# emit(:set, :collation, copy(data, ts, te))
|
239
|
-
# };
|
240
|
-
# character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
241
|
-
# emit(:set, :equivalent, copy(data, ts, te))
|
242
|
-
# };
|
243
|
-
|
244
220
|
meta_char > (set_meta, 1) {
|
245
221
|
emit(:literal, :literal, copy(data, ts, te))
|
246
222
|
};
|
@@ -254,12 +230,22 @@
|
|
254
230
|
# set escapes scanner
|
255
231
|
# --------------------------------------------------------------------------
|
256
232
|
set_escape_sequence := |*
|
233
|
+
# Special case: in sets, octal sequences have higher priority than backrefs
|
234
|
+
octal_sequence {
|
235
|
+
emit(:escape, :octal, copy(data, ts-1, te))
|
236
|
+
fret;
|
237
|
+
};
|
238
|
+
|
239
|
+
# Scan all other escapes that work in sets with the generic escape scanner
|
257
240
|
set_escape > (escaped_set_alpha, 2) {
|
258
241
|
fhold;
|
259
242
|
fnext character_set;
|
260
243
|
fcall escape_sequence;
|
261
244
|
};
|
262
245
|
|
246
|
+
# Treat all remaining escapes - those not supported in sets - as literal.
|
247
|
+
# (This currently includes \^, \-, \&, \:, although these could potentially
|
248
|
+
# be meta chars when not escaped, depending on their position in the set.)
|
263
249
|
any > (escaped_set_alpha, 1) {
|
264
250
|
emit(:escape, :literal, copy(data, ts-1, te))
|
265
251
|
fret;
|
@@ -281,6 +267,13 @@
|
|
281
267
|
fret;
|
282
268
|
};
|
283
269
|
|
270
|
+
[8-9] . [0-9] { # special case, emits two tokens
|
271
|
+
text = copy(data, ts-1, te)
|
272
|
+
emit(:escape, :literal, text[0, 2])
|
273
|
+
emit(:literal, :literal, text[2])
|
274
|
+
fret;
|
275
|
+
};
|
276
|
+
|
284
277
|
meta_char {
|
285
278
|
case text = copy(data, ts-1, te)
|
286
279
|
when '\.'; emit(:escape, :dot, text)
|
@@ -371,6 +364,7 @@
|
|
371
364
|
conditional_expression := |*
|
372
365
|
group_lookup . ')' {
|
373
366
|
text = copy(data, ts, te-1)
|
367
|
+
text =~ /[^0]/ or raise ValidationError.for(:backref, 'condition', 'invalid ref ID')
|
374
368
|
emit(:conditional, :condition, text)
|
375
369
|
emit(:conditional, :condition_close, ')')
|
376
370
|
};
|
@@ -453,10 +447,9 @@
|
|
453
447
|
|
454
448
|
# (?#...) comments: parsed as a single expression, without introducing a
|
455
449
|
# new nesting level. Comments may not include parentheses, escaped or not.
|
456
|
-
# special case for close
|
457
|
-
# correct closing count.
|
450
|
+
# special case for close to get the correct closing count.
|
458
451
|
# ------------------------------------------------------------------------
|
459
|
-
group_open . group_comment
|
452
|
+
(group_open . group_comment) @group_closed {
|
460
453
|
emit(:group, :comment, copy(data, ts, te))
|
461
454
|
};
|
462
455
|
|
@@ -471,10 +464,10 @@
|
|
471
464
|
#
|
472
465
|
# (?imxdau-imx:subexp) option on/off for subexp
|
473
466
|
# ------------------------------------------------------------------------
|
474
|
-
group_open . group_options >group_opened {
|
467
|
+
(group_open . group_options) >group_opened {
|
475
468
|
text = copy(data, ts, te)
|
476
469
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
477
|
-
|
470
|
+
raise ValidationError.for(:group_option, $1 || "-#{$2}", text)
|
478
471
|
end
|
479
472
|
emit_options(text)
|
480
473
|
};
|
@@ -485,7 +478,7 @@
|
|
485
478
|
# (?<=subexp) look-behind
|
486
479
|
# (?<!subexp) negative look-behind
|
487
480
|
# ------------------------------------------------------------------------
|
488
|
-
group_open . assertion_type >group_opened {
|
481
|
+
(group_open . assertion_type) >group_opened {
|
489
482
|
case text = copy(data, ts, te)
|
490
483
|
when '(?='; emit(:assertion, :lookahead, text)
|
491
484
|
when '(?!'; emit(:assertion, :nlookahead, text)
|
@@ -502,14 +495,14 @@
|
|
502
495
|
# (?'name'subexp) named group (single quoted version)
|
503
496
|
# (subexp) captured group
|
504
497
|
# ------------------------------------------------------------------------
|
505
|
-
group_open . group_type >group_opened {
|
498
|
+
(group_open . group_type) >group_opened {
|
506
499
|
case text = copy(data, ts, te)
|
507
500
|
when '(?:'; emit(:group, :passive, text)
|
508
501
|
when '(?>'; emit(:group, :atomic, text)
|
509
502
|
when '(?~'; emit(:group, :absence, text)
|
510
503
|
|
511
504
|
when /^\(\?(?:<>|'')/
|
512
|
-
|
505
|
+
raise ValidationError.for(:group, 'named group', 'name is empty')
|
513
506
|
|
514
507
|
when /^\(\?<[^>]+>/
|
515
508
|
emit(:group, :named_ab, text)
|
@@ -528,50 +521,52 @@
|
|
528
521
|
group_close @group_closed {
|
529
522
|
if conditional_stack.last == group_depth + 1
|
530
523
|
conditional_stack.pop
|
531
|
-
emit(:conditional, :close,
|
532
|
-
|
524
|
+
emit(:conditional, :close, ')')
|
525
|
+
elsif group_depth >= 0
|
533
526
|
if spacing_stack.length > 1 &&
|
534
527
|
spacing_stack.last[:depth] == group_depth + 1
|
535
528
|
spacing_stack.pop
|
536
529
|
self.free_spacing = spacing_stack.last[:free_spacing]
|
537
530
|
end
|
538
531
|
|
539
|
-
emit(:group, :close,
|
532
|
+
emit(:group, :close, ')')
|
533
|
+
else
|
534
|
+
raise ValidationError.for(:group, 'group', 'unmatched close parenthesis')
|
540
535
|
end
|
541
536
|
};
|
542
537
|
|
543
538
|
|
544
539
|
# Group backreference, named and numbered
|
545
540
|
# ------------------------------------------------------------------------
|
546
|
-
backslash . (
|
541
|
+
backslash . (group_ref) > (backslashed, 4) {
|
547
542
|
case text = copy(data, ts, te)
|
548
|
-
when /^\\k(
|
549
|
-
validation_error(:backref, 'backreference', 'ref ID is empty')
|
550
|
-
when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/
|
543
|
+
when /^\\k(.)[^0-9\-][^+\-]*['>]$/
|
551
544
|
emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
|
552
|
-
when /^\\k(.)\d
|
545
|
+
when /^\\k(.)0*[1-9]\d*['>]$/
|
553
546
|
emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
|
554
|
-
when /^\\k(.)
|
547
|
+
when /^\\k(.)-0*[1-9]\d*['>]$/
|
555
548
|
emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
|
556
|
-
when /^\\k(.)[
|
549
|
+
when /^\\k(.)[^0-9\-].*[+\-]\d+['>]$/
|
557
550
|
emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
|
558
|
-
when /^\\k(.)
|
551
|
+
when /^\\k(.)-?0*[1-9]\d*[+\-]\d+['>]$/
|
559
552
|
emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
|
553
|
+
else
|
554
|
+
raise ValidationError.for(:backref, 'backreference', 'invalid ref ID')
|
560
555
|
end
|
561
556
|
};
|
562
557
|
|
563
558
|
# Group call, named and numbered
|
564
559
|
# ------------------------------------------------------------------------
|
565
|
-
backslash . (
|
560
|
+
backslash . (group_call) > (backslashed, 4) {
|
566
561
|
case text = copy(data, ts, te)
|
567
|
-
when /^\\g(
|
568
|
-
validation_error(:backref, 'subexpression call', 'ref ID is empty')
|
569
|
-
when /^\\g(.)[^\p{digit}+\->][^+\-]*/
|
562
|
+
when /^\\g(.)[^0-9+\-].*['>]$/
|
570
563
|
emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
|
571
|
-
when /^\\g(.)\d
|
564
|
+
when /^\\g(.)(?:0|0*[1-9]\d*)['>]$/
|
572
565
|
emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
|
573
|
-
when /^\\g(.)[+-]\d
|
566
|
+
when /^\\g(.)[+-]0*[1-9]\d*/
|
574
567
|
emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
|
568
|
+
else
|
569
|
+
raise ValidationError.for(:backref, 'subexpression call', 'invalid ref ID')
|
575
570
|
end
|
576
571
|
};
|
577
572
|
|
@@ -645,95 +640,35 @@
|
|
645
640
|
*|;
|
646
641
|
}%%
|
647
642
|
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
require 'regexp_parser/error'
|
643
|
+
require_relative 'scanner/errors/scanner_error'
|
644
|
+
require_relative 'scanner/errors/premature_end_error'
|
645
|
+
require_relative 'scanner/errors/validation_error'
|
652
646
|
|
653
647
|
class Regexp::Scanner
|
654
|
-
# General scanner error (catch all)
|
655
|
-
class ScannerError < Regexp::Parser::Error; end
|
656
|
-
|
657
|
-
# Base for all scanner validation errors
|
658
|
-
class ValidationError < Regexp::Parser::Error
|
659
|
-
def initialize(reason)
|
660
|
-
super reason
|
661
|
-
end
|
662
|
-
end
|
663
|
-
|
664
|
-
# Unexpected end of pattern
|
665
|
-
class PrematureEndError < ScannerError
|
666
|
-
def initialize(where = '')
|
667
|
-
super "Premature end of pattern at #{where}"
|
668
|
-
end
|
669
|
-
end
|
670
|
-
|
671
|
-
# Invalid sequence format. Used for escape sequences, mainly.
|
672
|
-
class InvalidSequenceError < ValidationError
|
673
|
-
def initialize(what = 'sequence', where = '')
|
674
|
-
super "Invalid #{what} at #{where}"
|
675
|
-
end
|
676
|
-
end
|
677
|
-
|
678
|
-
# Invalid group. Used for named groups.
|
679
|
-
class InvalidGroupError < ValidationError
|
680
|
-
def initialize(what, reason)
|
681
|
-
super "Invalid #{what}, #{reason}."
|
682
|
-
end
|
683
|
-
end
|
684
|
-
|
685
|
-
# Invalid groupOption. Used for inline options.
|
686
|
-
# TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
|
687
|
-
class InvalidGroupOption < ValidationError
|
688
|
-
def initialize(option, text)
|
689
|
-
super "Invalid group option #{option} in #{text}"
|
690
|
-
end
|
691
|
-
end
|
692
|
-
|
693
|
-
# Invalid back reference. Used for name and number refs/calls.
|
694
|
-
class InvalidBackrefError < ValidationError
|
695
|
-
def initialize(what, reason)
|
696
|
-
super "Invalid back reference #{what}, #{reason}"
|
697
|
-
end
|
698
|
-
end
|
699
|
-
|
700
|
-
# The property name was not recognized by the scanner.
|
701
|
-
class UnknownUnicodePropertyError < ValidationError
|
702
|
-
def initialize(name)
|
703
|
-
super "Unknown unicode character property name #{name}"
|
704
|
-
end
|
705
|
-
end
|
706
|
-
|
707
|
-
# The POSIX class name was not recognized by the scanner.
|
708
|
-
class UnknownPosixClassError < ValidationError
|
709
|
-
def initialize(text)
|
710
|
-
super "Unknown POSIX class #{text}"
|
711
|
-
end
|
712
|
-
end
|
713
|
-
|
714
648
|
# Scans the given regular expression text, or Regexp object and collects the
|
715
649
|
# emitted token into an array that gets returned at the end. If a block is
|
716
650
|
# given, it gets called for each emitted token.
|
717
651
|
#
|
718
652
|
# This method may raise errors if a syntax error is encountered.
|
719
653
|
# --------------------------------------------------------------------------
|
720
|
-
def self.scan(input_object, options: nil, &block)
|
721
|
-
new.scan(input_object, options: options, &block)
|
654
|
+
def self.scan(input_object, options: nil, collect_tokens: true, &block)
|
655
|
+
new.scan(input_object, options: options, collect_tokens: collect_tokens, &block)
|
722
656
|
end
|
723
657
|
|
724
|
-
def scan(input_object, options: nil, &block)
|
725
|
-
self.
|
658
|
+
def scan(input_object, options: nil, collect_tokens: true, &block)
|
659
|
+
self.collect_tokens = collect_tokens
|
660
|
+
self.literal_run = nil
|
726
661
|
stack = []
|
727
662
|
|
728
663
|
input = input_object.is_a?(Regexp) ? input_object.source : input_object
|
729
664
|
self.free_spacing = free_spacing?(input_object, options)
|
730
665
|
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
731
666
|
|
732
|
-
data = input.unpack("c*")
|
667
|
+
data = input.unpack("c*")
|
733
668
|
eof = data.length
|
734
669
|
|
735
670
|
self.tokens = []
|
736
|
-
self.block =
|
671
|
+
self.block = block
|
737
672
|
|
738
673
|
self.set_depth = 0
|
739
674
|
self.group_depth = 0
|
@@ -758,7 +693,7 @@ class Regexp::Scanner
|
|
758
693
|
"[#{set_depth}]") if in_set?
|
759
694
|
|
760
695
|
# when the entire expression is a literal run
|
761
|
-
emit_literal if
|
696
|
+
emit_literal if literal_run
|
762
697
|
|
763
698
|
tokens
|
764
699
|
end
|
@@ -785,26 +720,37 @@ class Regexp::Scanner
|
|
785
720
|
def emit(type, token, text)
|
786
721
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
787
722
|
|
788
|
-
emit_literal if
|
723
|
+
emit_literal if literal_run
|
789
724
|
|
790
725
|
# Ragel runs with byte-based indices (ts, te). These are of little value to
|
791
726
|
# end-users, so we keep track of char-based indices and emit those instead.
|
792
727
|
ts_char_pos = char_pos
|
793
728
|
te_char_pos = char_pos + text.length
|
794
729
|
|
795
|
-
|
796
|
-
block.call type, token, text, ts_char_pos, te_char_pos
|
797
|
-
end
|
730
|
+
tok = [type, token, text, ts_char_pos, te_char_pos]
|
798
731
|
|
799
|
-
|
732
|
+
self.prev_token = tok
|
800
733
|
|
801
734
|
self.char_pos = te_char_pos
|
735
|
+
|
736
|
+
if block
|
737
|
+
block.call type, token, text, ts_char_pos, te_char_pos
|
738
|
+
# TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given
|
739
|
+
tokens << tok if collect_tokens
|
740
|
+
elsif collect_tokens
|
741
|
+
tokens << tok
|
742
|
+
end
|
802
743
|
end
|
803
744
|
|
745
|
+
attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
|
746
|
+
|
804
747
|
private
|
805
748
|
|
806
|
-
attr_accessor :
|
807
|
-
:
|
749
|
+
attr_accessor :block,
|
750
|
+
:collect_tokens, :tokens, :prev_token,
|
751
|
+
:free_spacing, :spacing_stack,
|
752
|
+
:group_depth, :set_depth, :conditional_stack,
|
753
|
+
:char_pos
|
808
754
|
|
809
755
|
def free_spacing?(input_object, options)
|
810
756
|
if options && !input_object.is_a?(String)
|
@@ -834,14 +780,13 @@ class Regexp::Scanner
|
|
834
780
|
# Appends one or more characters to the literal buffer, to be emitted later
|
835
781
|
# by a call to emit_literal.
|
836
782
|
def append_literal(data, ts, te)
|
837
|
-
self.
|
838
|
-
literal << copy(data, ts, te)
|
783
|
+
(self.literal_run ||= []) << copy(data, ts, te)
|
839
784
|
end
|
840
785
|
|
841
786
|
# Emits the literal run collected by calls to the append_literal method.
|
842
787
|
def emit_literal
|
843
|
-
text =
|
844
|
-
self.
|
788
|
+
text = literal_run.join
|
789
|
+
self.literal_run = nil
|
845
790
|
emit(:literal, :literal, text)
|
846
791
|
end
|
847
792
|
|
@@ -876,24 +821,8 @@ class Regexp::Scanner
|
|
876
821
|
|
877
822
|
def emit_meta_control_sequence(data, ts, te, token)
|
878
823
|
if data.last < 0x00 || data.last > 0x7F
|
879
|
-
|
824
|
+
raise ValidationError.for(:sequence, 'escape', token.to_s)
|
880
825
|
end
|
881
826
|
emit(:escape, token, copy(data, ts-1, te))
|
882
827
|
end
|
883
|
-
|
884
|
-
# Centralizes and unifies the handling of validation related
|
885
|
-
# errors.
|
886
|
-
def validation_error(type, what, reason = nil)
|
887
|
-
error =
|
888
|
-
case type
|
889
|
-
when :backref then InvalidBackrefError.new(what, reason)
|
890
|
-
when :group then InvalidGroupError.new(what, reason)
|
891
|
-
when :group_option then InvalidGroupOption.new(what, reason)
|
892
|
-
when :posix_class then UnknownPosixClassError.new(what)
|
893
|
-
when :property then UnknownUnicodePropertyError.new(what)
|
894
|
-
when :sequence then InvalidSequenceError.new(what, reason)
|
895
|
-
end
|
896
|
-
|
897
|
-
raise error # unless @@config.validation_ignore
|
898
|
-
end
|
899
828
|
end # class Regexp::Scanner
|