regexp_parser 2.6.2 → 2.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +67 -0
- data/Gemfile +2 -2
- data/README.md +32 -29
- data/lib/regexp_parser/expression/base.rb +0 -7
- data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
- data/lib/regexp_parser/expression/classes/backreference.rb +4 -2
- data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -7
- data/lib/regexp_parser/expression/classes/character_set.rb +3 -4
- data/lib/regexp_parser/expression/classes/conditional.rb +2 -6
- data/lib/regexp_parser/expression/classes/escape_sequence.rb +3 -1
- data/lib/regexp_parser/expression/classes/free_space.rb +3 -1
- data/lib/regexp_parser/expression/classes/group.rb +0 -22
- data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
- data/lib/regexp_parser/expression/classes/unicode_property.rb +5 -2
- data/lib/regexp_parser/expression/methods/construct.rb +2 -4
- data/lib/regexp_parser/expression/methods/parts.rb +23 -0
- data/lib/regexp_parser/expression/methods/printing.rb +26 -0
- data/lib/regexp_parser/expression/methods/tests.rb +40 -3
- data/lib/regexp_parser/expression/methods/traverse.rb +35 -19
- data/lib/regexp_parser/expression/quantifier.rb +30 -17
- data/lib/regexp_parser/expression/sequence.rb +5 -10
- data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
- data/lib/regexp_parser/expression/shared.rb +37 -20
- data/lib/regexp_parser/expression/subexpression.rb +20 -15
- data/lib/regexp_parser/expression.rb +2 -0
- data/lib/regexp_parser/lexer.rb +76 -36
- data/lib/regexp_parser/parser.rb +97 -97
- data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
- data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
- data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
- data/lib/regexp_parser/scanner/mapping.rb +89 -0
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +90 -169
- data/lib/regexp_parser/scanner.rb +1157 -1330
- data/lib/regexp_parser/syntax/token/backreference.rb +3 -0
- data/lib/regexp_parser/syntax/token/character_set.rb +3 -0
- data/lib/regexp_parser/syntax/token/escape.rb +3 -1
- data/lib/regexp_parser/syntax/token/meta.rb +9 -2
- data/lib/regexp_parser/syntax/token/unicode_property.rb +3 -0
- data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
- data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
- data/lib/regexp_parser/syntax/versions.rb +2 -0
- data/lib/regexp_parser/version.rb +1 -1
- metadata +10 -3
@@ -30,11 +30,6 @@
|
|
30
30
|
|
31
31
|
class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
|
32
32
|
|
33
|
-
|
34
|
-
# these are not supported in ruby at the moment
|
35
|
-
collating_sequence = '[.' . (alpha | [\-])+ . '.]';
|
36
|
-
character_equivalent = '[=' . alpha . '=]';
|
37
|
-
|
38
33
|
line_anchor = beginning_of_line | end_of_line;
|
39
34
|
anchor_char = [AbBzZG];
|
40
35
|
|
@@ -59,9 +54,6 @@
|
|
59
54
|
one_or_more = '+' | '+?' | '++';
|
60
55
|
|
61
56
|
quantifier_greedy = '?' | '*' | '+';
|
62
|
-
quantifier_reluctant = '??' | '*?' | '+?';
|
63
|
-
quantifier_possessive = '?+' | '*+' | '++';
|
64
|
-
quantifier_mode = '?' | '+';
|
65
57
|
|
66
58
|
quantity_exact = (digit+);
|
67
59
|
quantity_minimum = (digit+) . ',';
|
@@ -70,9 +62,6 @@
|
|
70
62
|
quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
|
71
63
|
quantity_maximum | quantity_range ) . range_close;
|
72
64
|
|
73
|
-
quantifiers = quantifier_greedy | quantifier_reluctant |
|
74
|
-
quantifier_possessive | quantifier_interval;
|
75
|
-
|
76
65
|
conditional = '(?(';
|
77
66
|
|
78
67
|
group_comment = '?#' . [^)]* . group_close;
|
@@ -89,10 +78,9 @@
|
|
89
78
|
# try to treat every other group head as options group, like Ruby
|
90
79
|
group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
|
91
80
|
|
92
|
-
group_ref = [gk];
|
93
81
|
group_name_id_ab = ([^!0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
|
94
82
|
group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
|
95
|
-
group_number = '-'? . [
|
83
|
+
group_number = '-'? . [0-9]+;
|
96
84
|
group_level = [+\-] . [0-9]+;
|
97
85
|
|
98
86
|
group_name = ('<' . group_name_id_ab? . '>') |
|
@@ -101,15 +89,11 @@
|
|
101
89
|
|
102
90
|
group_named = ('?' . group_name );
|
103
91
|
|
104
|
-
|
105
|
-
|
106
|
-
group_name_call = 'g' . (('<' . group_name_id_ab? . group_level? '>') |
|
107
|
-
("'" . group_name_id_sq? . group_level? "'"));
|
92
|
+
group_ref_body = (('<' . (group_name_id_ab? | group_number) . group_level? '>') |
|
93
|
+
("'" . (group_name_id_sq? | group_number) . group_level? "'"));
|
108
94
|
|
109
|
-
|
110
|
-
|
111
|
-
group_number_call = 'g' . (('<' . ((group_number . group_level?) | '0') '>') |
|
112
|
-
("'" . ((group_number . group_level?) | '0') "'"));
|
95
|
+
group_ref = 'k' . group_ref_body;
|
96
|
+
group_call = 'g' . group_ref_body;
|
113
97
|
|
114
98
|
group_type = group_atomic | group_passive | group_absence | group_named;
|
115
99
|
|
@@ -132,20 +116,21 @@
|
|
132
116
|
keep_mark | sequence_char;
|
133
117
|
|
134
118
|
# escapes that also work within a character set
|
135
|
-
set_escape = backslash | brackets | escaped_ascii |
|
119
|
+
set_escape = backslash | brackets | escaped_ascii |
|
120
|
+
octal_sequence | property_char |
|
136
121
|
sequence_char | single_codepoint_char_type;
|
137
122
|
|
138
123
|
|
139
124
|
# EOF error, used where it can be detected
|
140
125
|
action premature_end_error {
|
141
126
|
text = copy(data, ts ? ts-1 : 0, -1)
|
142
|
-
raise PrematureEndError.new(
|
127
|
+
raise PrematureEndError.new(text)
|
143
128
|
}
|
144
129
|
|
145
130
|
# Invalid sequence error, used from sequences, like escapes and sets
|
146
131
|
action invalid_sequence_error {
|
147
132
|
text = copy(data, ts ? ts-1 : 0, -1)
|
148
|
-
|
133
|
+
raise ValidationError.for(:sequence, 'sequence', text)
|
149
134
|
}
|
150
135
|
|
151
136
|
# group (nesting) and set open/close actions
|
@@ -168,8 +153,8 @@
|
|
168
153
|
};
|
169
154
|
|
170
155
|
'-]' @set_closed { # special case, emits two tokens
|
171
|
-
emit(:literal, :literal,
|
172
|
-
emit(:set, :close,
|
156
|
+
emit(:literal, :literal, '-')
|
157
|
+
emit(:set, :close, ']')
|
173
158
|
if in_set?
|
174
159
|
fret;
|
175
160
|
else
|
@@ -183,28 +168,27 @@
|
|
183
168
|
};
|
184
169
|
|
185
170
|
'^' {
|
186
|
-
|
187
|
-
|
188
|
-
emit(:set, :negate, text)
|
171
|
+
if prev_token[1] == :open
|
172
|
+
emit(:set, :negate, '^')
|
189
173
|
else
|
190
|
-
emit(:literal, :literal,
|
174
|
+
emit(:literal, :literal, '^')
|
191
175
|
end
|
192
176
|
};
|
193
177
|
|
194
178
|
'-' {
|
195
|
-
|
196
|
-
#
|
197
|
-
if
|
198
|
-
emit(:literal, :literal,
|
179
|
+
# ranges cant start with the opening bracket, a subset, or
|
180
|
+
# intersection/negation/range operators
|
181
|
+
if prev_token[0] == :set
|
182
|
+
emit(:literal, :literal, '-')
|
199
183
|
else
|
200
|
-
emit(:set, :range,
|
184
|
+
emit(:set, :range, '-')
|
201
185
|
end
|
202
186
|
};
|
203
187
|
|
204
188
|
# Unlike ranges, intersections can start or end at set boundaries, whereupon
|
205
189
|
# they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
|
206
190
|
'&&' {
|
207
|
-
emit(:set, :intersection,
|
191
|
+
emit(:set, :intersection, '&&')
|
208
192
|
};
|
209
193
|
|
210
194
|
backslash {
|
@@ -212,7 +196,7 @@
|
|
212
196
|
};
|
213
197
|
|
214
198
|
set_open >(open_bracket, 1) >set_opened {
|
215
|
-
emit(:set, :open,
|
199
|
+
emit(:set, :open, '[')
|
216
200
|
fcall character_set;
|
217
201
|
};
|
218
202
|
|
@@ -227,20 +211,12 @@
|
|
227
211
|
end
|
228
212
|
|
229
213
|
unless self.class.posix_classes.include?(class_name)
|
230
|
-
|
214
|
+
raise ValidationError.for(:posix_class, text)
|
231
215
|
end
|
232
216
|
|
233
217
|
emit(type, class_name.to_sym, text)
|
234
218
|
};
|
235
219
|
|
236
|
-
# These are not supported in ruby at the moment. Enable them if they are.
|
237
|
-
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
238
|
-
# emit(:set, :collation, copy(data, ts, te))
|
239
|
-
# };
|
240
|
-
# character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
241
|
-
# emit(:set, :equivalent, copy(data, ts, te))
|
242
|
-
# };
|
243
|
-
|
244
220
|
meta_char > (set_meta, 1) {
|
245
221
|
emit(:literal, :literal, copy(data, ts, te))
|
246
222
|
};
|
@@ -254,12 +230,22 @@
|
|
254
230
|
# set escapes scanner
|
255
231
|
# --------------------------------------------------------------------------
|
256
232
|
set_escape_sequence := |*
|
233
|
+
# Special case: in sets, octal sequences have higher priority than backrefs
|
234
|
+
octal_sequence {
|
235
|
+
emit(:escape, :octal, copy(data, ts-1, te))
|
236
|
+
fret;
|
237
|
+
};
|
238
|
+
|
239
|
+
# Scan all other escapes that work in sets with the generic escape scanner
|
257
240
|
set_escape > (escaped_set_alpha, 2) {
|
258
241
|
fhold;
|
259
242
|
fnext character_set;
|
260
243
|
fcall escape_sequence;
|
261
244
|
};
|
262
245
|
|
246
|
+
# Treat all remaining escapes - those not supported in sets - as literal.
|
247
|
+
# (This currently includes \^, \-, \&, \:, although these could potentially
|
248
|
+
# be meta chars when not escaped, depending on their position in the set.)
|
263
249
|
any > (escaped_set_alpha, 1) {
|
264
250
|
emit(:escape, :literal, copy(data, ts-1, te))
|
265
251
|
fret;
|
@@ -453,10 +439,9 @@
|
|
453
439
|
|
454
440
|
# (?#...) comments: parsed as a single expression, without introducing a
|
455
441
|
# new nesting level. Comments may not include parentheses, escaped or not.
|
456
|
-
# special case for close
|
457
|
-
# correct closing count.
|
442
|
+
# special case for close to get the correct closing count.
|
458
443
|
# ------------------------------------------------------------------------
|
459
|
-
group_open . group_comment
|
444
|
+
(group_open . group_comment) @group_closed {
|
460
445
|
emit(:group, :comment, copy(data, ts, te))
|
461
446
|
};
|
462
447
|
|
@@ -471,10 +456,10 @@
|
|
471
456
|
#
|
472
457
|
# (?imxdau-imx:subexp) option on/off for subexp
|
473
458
|
# ------------------------------------------------------------------------
|
474
|
-
group_open . group_options >group_opened {
|
459
|
+
(group_open . group_options) >group_opened {
|
475
460
|
text = copy(data, ts, te)
|
476
461
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
477
|
-
|
462
|
+
raise ValidationError.for(:group_option, $1 || "-#{$2}", text)
|
478
463
|
end
|
479
464
|
emit_options(text)
|
480
465
|
};
|
@@ -485,7 +470,7 @@
|
|
485
470
|
# (?<=subexp) look-behind
|
486
471
|
# (?<!subexp) negative look-behind
|
487
472
|
# ------------------------------------------------------------------------
|
488
|
-
group_open . assertion_type >group_opened {
|
473
|
+
(group_open . assertion_type) >group_opened {
|
489
474
|
case text = copy(data, ts, te)
|
490
475
|
when '(?='; emit(:assertion, :lookahead, text)
|
491
476
|
when '(?!'; emit(:assertion, :nlookahead, text)
|
@@ -502,14 +487,14 @@
|
|
502
487
|
# (?'name'subexp) named group (single quoted version)
|
503
488
|
# (subexp) captured group
|
504
489
|
# ------------------------------------------------------------------------
|
505
|
-
group_open . group_type >group_opened {
|
490
|
+
(group_open . group_type) >group_opened {
|
506
491
|
case text = copy(data, ts, te)
|
507
492
|
when '(?:'; emit(:group, :passive, text)
|
508
493
|
when '(?>'; emit(:group, :atomic, text)
|
509
494
|
when '(?~'; emit(:group, :absence, text)
|
510
495
|
|
511
496
|
when /^\(\?(?:<>|'')/
|
512
|
-
|
497
|
+
raise ValidationError.for(:group, 'named group', 'name is empty')
|
513
498
|
|
514
499
|
when /^\(\?<[^>]+>/
|
515
500
|
emit(:group, :named_ab, text)
|
@@ -528,50 +513,52 @@
|
|
528
513
|
group_close @group_closed {
|
529
514
|
if conditional_stack.last == group_depth + 1
|
530
515
|
conditional_stack.pop
|
531
|
-
emit(:conditional, :close,
|
532
|
-
|
516
|
+
emit(:conditional, :close, ')')
|
517
|
+
elsif group_depth >= 0
|
533
518
|
if spacing_stack.length > 1 &&
|
534
519
|
spacing_stack.last[:depth] == group_depth + 1
|
535
520
|
spacing_stack.pop
|
536
521
|
self.free_spacing = spacing_stack.last[:free_spacing]
|
537
522
|
end
|
538
523
|
|
539
|
-
emit(:group, :close,
|
524
|
+
emit(:group, :close, ')')
|
525
|
+
else
|
526
|
+
raise ValidationError.for(:group, 'group', 'unmatched close parenthesis')
|
540
527
|
end
|
541
528
|
};
|
542
529
|
|
543
530
|
|
544
531
|
# Group backreference, named and numbered
|
545
532
|
# ------------------------------------------------------------------------
|
546
|
-
backslash . (
|
533
|
+
backslash . (group_ref) > (backslashed, 4) {
|
547
534
|
case text = copy(data, ts, te)
|
548
|
-
when /^\\k(
|
549
|
-
validation_error(:backref, 'backreference', 'ref ID is empty')
|
550
|
-
when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/
|
535
|
+
when /^\\k(.)[^0-9\-][^+\-]*['>]$/
|
551
536
|
emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
|
552
|
-
when /^\\k(.)\d
|
537
|
+
when /^\\k(.)[1-9]\d*['>]$/
|
553
538
|
emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
|
554
|
-
when /^\\k(.)
|
539
|
+
when /^\\k(.)-[1-9]\d*['>]$/
|
555
540
|
emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
|
556
|
-
when /^\\k(.)[
|
541
|
+
when /^\\k(.)[^0-9\-].*[+\-]\d+['>]$/
|
557
542
|
emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
|
558
|
-
when /^\\k(.)
|
543
|
+
when /^\\k(.)-?[1-9]\d*[+\-]\d+['>]$/
|
559
544
|
emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
|
545
|
+
else
|
546
|
+
raise ValidationError.for(:backref, 'backreference', 'invalid ref ID')
|
560
547
|
end
|
561
548
|
};
|
562
549
|
|
563
550
|
# Group call, named and numbered
|
564
551
|
# ------------------------------------------------------------------------
|
565
|
-
backslash . (
|
552
|
+
backslash . (group_call) > (backslashed, 4) {
|
566
553
|
case text = copy(data, ts, te)
|
567
|
-
when /^\\g(
|
568
|
-
validation_error(:backref, 'subexpression call', 'ref ID is empty')
|
569
|
-
when /^\\g(.)[^\p{digit}+\->][^+\-]*/
|
554
|
+
when /^\\g(.)[^0-9+\-].*['>]$/
|
570
555
|
emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
|
571
|
-
when /^\\g(.)\d
|
556
|
+
when /^\\g(.)\d+['>]$/
|
572
557
|
emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
|
573
558
|
when /^\\g(.)[+-]\d+/
|
574
559
|
emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
|
560
|
+
else
|
561
|
+
raise ValidationError.for(:backref, 'subexpression call', 'invalid ref ID')
|
575
562
|
end
|
576
563
|
};
|
577
564
|
|
@@ -645,95 +632,35 @@
|
|
645
632
|
*|;
|
646
633
|
}%%
|
647
634
|
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
require 'regexp_parser/error'
|
635
|
+
require 'regexp_parser/scanner/errors/scanner_error'
|
636
|
+
require 'regexp_parser/scanner/errors/premature_end_error'
|
637
|
+
require 'regexp_parser/scanner/errors/validation_error'
|
652
638
|
|
653
639
|
class Regexp::Scanner
|
654
|
-
# General scanner error (catch all)
|
655
|
-
class ScannerError < Regexp::Parser::Error; end
|
656
|
-
|
657
|
-
# Base for all scanner validation errors
|
658
|
-
class ValidationError < Regexp::Parser::Error
|
659
|
-
def initialize(reason)
|
660
|
-
super reason
|
661
|
-
end
|
662
|
-
end
|
663
|
-
|
664
|
-
# Unexpected end of pattern
|
665
|
-
class PrematureEndError < ScannerError
|
666
|
-
def initialize(where = '')
|
667
|
-
super "Premature end of pattern at #{where}"
|
668
|
-
end
|
669
|
-
end
|
670
|
-
|
671
|
-
# Invalid sequence format. Used for escape sequences, mainly.
|
672
|
-
class InvalidSequenceError < ValidationError
|
673
|
-
def initialize(what = 'sequence', where = '')
|
674
|
-
super "Invalid #{what} at #{where}"
|
675
|
-
end
|
676
|
-
end
|
677
|
-
|
678
|
-
# Invalid group. Used for named groups.
|
679
|
-
class InvalidGroupError < ValidationError
|
680
|
-
def initialize(what, reason)
|
681
|
-
super "Invalid #{what}, #{reason}."
|
682
|
-
end
|
683
|
-
end
|
684
|
-
|
685
|
-
# Invalid groupOption. Used for inline options.
|
686
|
-
# TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
|
687
|
-
class InvalidGroupOption < ValidationError
|
688
|
-
def initialize(option, text)
|
689
|
-
super "Invalid group option #{option} in #{text}"
|
690
|
-
end
|
691
|
-
end
|
692
|
-
|
693
|
-
# Invalid back reference. Used for name a number refs/calls.
|
694
|
-
class InvalidBackrefError < ValidationError
|
695
|
-
def initialize(what, reason)
|
696
|
-
super "Invalid back reference #{what}, #{reason}"
|
697
|
-
end
|
698
|
-
end
|
699
|
-
|
700
|
-
# The property name was not recognized by the scanner.
|
701
|
-
class UnknownUnicodePropertyError < ValidationError
|
702
|
-
def initialize(name)
|
703
|
-
super "Unknown unicode character property name #{name}"
|
704
|
-
end
|
705
|
-
end
|
706
|
-
|
707
|
-
# The POSIX class name was not recognized by the scanner.
|
708
|
-
class UnknownPosixClassError < ValidationError
|
709
|
-
def initialize(text)
|
710
|
-
super "Unknown POSIX class #{text}"
|
711
|
-
end
|
712
|
-
end
|
713
|
-
|
714
640
|
# Scans the given regular expression text, or Regexp object and collects the
|
715
641
|
# emitted token into an array that gets returned at the end. If a block is
|
716
642
|
# given, it gets called for each emitted token.
|
717
643
|
#
|
718
644
|
# This method may raise errors if a syntax error is encountered.
|
719
645
|
# --------------------------------------------------------------------------
|
720
|
-
def self.scan(input_object, options: nil, &block)
|
721
|
-
new.scan(input_object, options: options, &block)
|
646
|
+
def self.scan(input_object, options: nil, collect_tokens: true, &block)
|
647
|
+
new.scan(input_object, options: options, collect_tokens: collect_tokens, &block)
|
722
648
|
end
|
723
649
|
|
724
|
-
def scan(input_object, options: nil, &block)
|
725
|
-
self.
|
650
|
+
def scan(input_object, options: nil, collect_tokens: true, &block)
|
651
|
+
self.collect_tokens = collect_tokens
|
652
|
+
self.literal_run = nil
|
726
653
|
stack = []
|
727
654
|
|
728
655
|
input = input_object.is_a?(Regexp) ? input_object.source : input_object
|
729
656
|
self.free_spacing = free_spacing?(input_object, options)
|
730
657
|
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
731
658
|
|
732
|
-
data = input.unpack("c*")
|
659
|
+
data = input.unpack("c*")
|
733
660
|
eof = data.length
|
734
661
|
|
735
662
|
self.tokens = []
|
736
|
-
self.block =
|
663
|
+
self.block = block
|
737
664
|
|
738
665
|
self.set_depth = 0
|
739
666
|
self.group_depth = 0
|
@@ -758,7 +685,7 @@ class Regexp::Scanner
|
|
758
685
|
"[#{set_depth}]") if in_set?
|
759
686
|
|
760
687
|
# when the entire expression is a literal run
|
761
|
-
emit_literal if
|
688
|
+
emit_literal if literal_run
|
762
689
|
|
763
690
|
tokens
|
764
691
|
end
|
@@ -785,26 +712,37 @@ class Regexp::Scanner
|
|
785
712
|
def emit(type, token, text)
|
786
713
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
787
714
|
|
788
|
-
emit_literal if
|
715
|
+
emit_literal if literal_run
|
789
716
|
|
790
717
|
# Ragel runs with byte-based indices (ts, te). These are of little value to
|
791
718
|
# end-users, so we keep track of char-based indices and emit those instead.
|
792
719
|
ts_char_pos = char_pos
|
793
720
|
te_char_pos = char_pos + text.length
|
794
721
|
|
795
|
-
|
796
|
-
block.call type, token, text, ts_char_pos, te_char_pos
|
797
|
-
end
|
722
|
+
tok = [type, token, text, ts_char_pos, te_char_pos]
|
798
723
|
|
799
|
-
|
724
|
+
self.prev_token = tok
|
800
725
|
|
801
726
|
self.char_pos = te_char_pos
|
727
|
+
|
728
|
+
if block
|
729
|
+
block.call type, token, text, ts_char_pos, te_char_pos
|
730
|
+
# TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given
|
731
|
+
tokens << tok if collect_tokens
|
732
|
+
elsif collect_tokens
|
733
|
+
tokens << tok
|
734
|
+
end
|
802
735
|
end
|
803
736
|
|
737
|
+
attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
|
738
|
+
|
804
739
|
private
|
805
740
|
|
806
|
-
attr_accessor :
|
807
|
-
:
|
741
|
+
attr_accessor :block,
|
742
|
+
:collect_tokens, :tokens, :prev_token,
|
743
|
+
:free_spacing, :spacing_stack,
|
744
|
+
:group_depth, :set_depth, :conditional_stack,
|
745
|
+
:char_pos
|
808
746
|
|
809
747
|
def free_spacing?(input_object, options)
|
810
748
|
if options && !input_object.is_a?(String)
|
@@ -834,14 +772,13 @@ class Regexp::Scanner
|
|
834
772
|
# Appends one or more characters to the literal buffer, to be emitted later
|
835
773
|
# by a call to emit_literal.
|
836
774
|
def append_literal(data, ts, te)
|
837
|
-
self.
|
838
|
-
literal << copy(data, ts, te)
|
775
|
+
(self.literal_run ||= []) << copy(data, ts, te)
|
839
776
|
end
|
840
777
|
|
841
778
|
# Emits the literal run collected by calls to the append_literal method.
|
842
779
|
def emit_literal
|
843
|
-
text =
|
844
|
-
self.
|
780
|
+
text = literal_run.join
|
781
|
+
self.literal_run = nil
|
845
782
|
emit(:literal, :literal, text)
|
846
783
|
end
|
847
784
|
|
@@ -876,24 +813,8 @@ class Regexp::Scanner
|
|
876
813
|
|
877
814
|
def emit_meta_control_sequence(data, ts, te, token)
|
878
815
|
if data.last < 0x00 || data.last > 0x7F
|
879
|
-
|
816
|
+
raise ValidationError.for(:sequence, 'escape', token.to_s)
|
880
817
|
end
|
881
818
|
emit(:escape, token, copy(data, ts-1, te))
|
882
819
|
end
|
883
|
-
|
884
|
-
# Centralizes and unifies the handling of validation related
|
885
|
-
# errors.
|
886
|
-
def validation_error(type, what, reason = nil)
|
887
|
-
error =
|
888
|
-
case type
|
889
|
-
when :backref then InvalidBackrefError.new(what, reason)
|
890
|
-
when :group then InvalidGroupError.new(what, reason)
|
891
|
-
when :group_option then InvalidGroupOption.new(what, reason)
|
892
|
-
when :posix_class then UnknownPosixClassError.new(what)
|
893
|
-
when :property then UnknownUnicodePropertyError.new(what)
|
894
|
-
when :sequence then InvalidSequenceError.new(what, reason)
|
895
|
-
end
|
896
|
-
|
897
|
-
raise error # unless @@config.validation_ignore
|
898
|
-
end
|
899
820
|
end # module Regexp::Scanner
|