regexp_parser 2.4.0 → 2.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +98 -42
- data/README.md +46 -30
- data/lib/regexp_parser/expression/base.rb +17 -9
- data/lib/regexp_parser/expression/classes/backreference.rb +19 -2
- data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -0
- data/lib/regexp_parser/expression/classes/conditional.rb +8 -0
- data/lib/regexp_parser/expression/classes/escape_sequence.rb +1 -1
- data/lib/regexp_parser/expression/classes/group.rb +10 -0
- data/lib/regexp_parser/expression/classes/keep.rb +2 -0
- data/lib/regexp_parser/expression/classes/root.rb +3 -5
- data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +1 -0
- data/lib/regexp_parser/expression/methods/construct.rb +43 -0
- data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +9 -5
- data/lib/regexp_parser/expression/methods/traverse.rb +6 -3
- data/lib/regexp_parser/expression/quantifier.rb +6 -5
- data/lib/regexp_parser/expression/sequence.rb +6 -21
- data/lib/regexp_parser/expression/shared.rb +20 -3
- data/lib/regexp_parser/expression/subexpression.rb +4 -1
- data/lib/regexp_parser/expression.rb +4 -2
- data/lib/regexp_parser/lexer.rb +61 -29
- data/lib/regexp_parser/parser.rb +36 -26
- data/lib/regexp_parser/scanner/property.rl +1 -1
- data/lib/regexp_parser/scanner/scanner.rl +57 -42
- data/lib/regexp_parser/scanner.rb +873 -823
- data/lib/regexp_parser/syntax/token/escape.rb +1 -1
- data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
- data/lib/regexp_parser/syntax/versions.rb +2 -0
- data/lib/regexp_parser/version.rb +1 -1
- metadata +7 -5
@@ -59,9 +59,6 @@
|
|
59
59
|
one_or_more = '+' | '+?' | '++';
|
60
60
|
|
61
61
|
quantifier_greedy = '?' | '*' | '+';
|
62
|
-
quantifier_reluctant = '??' | '*?' | '+?';
|
63
|
-
quantifier_possessive = '?+' | '*+' | '++';
|
64
|
-
quantifier_mode = '?' | '+';
|
65
62
|
|
66
63
|
quantity_exact = (digit+);
|
67
64
|
quantity_minimum = (digit+) . ',';
|
@@ -70,9 +67,6 @@
|
|
70
67
|
quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
|
71
68
|
quantity_maximum | quantity_range ) . range_close;
|
72
69
|
|
73
|
-
quantifiers = quantifier_greedy | quantifier_reluctant |
|
74
|
-
quantifier_possessive | quantifier_interval;
|
75
|
-
|
76
70
|
conditional = '(?(';
|
77
71
|
|
78
72
|
group_comment = '?#' . [^)]* . group_close;
|
@@ -90,8 +84,8 @@
|
|
90
84
|
group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
|
91
85
|
|
92
86
|
group_ref = [gk];
|
93
|
-
group_name_id_ab = ([
|
94
|
-
group_name_id_sq = ([^0-9\-']
|
87
|
+
group_name_id_ab = ([^!0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
|
88
|
+
group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
|
95
89
|
group_number = '-'? . [1-9] . [0-9]*;
|
96
90
|
group_level = [+\-] . [0-9]+;
|
97
91
|
|
@@ -132,7 +126,8 @@
|
|
132
126
|
keep_mark | sequence_char;
|
133
127
|
|
134
128
|
# escapes that also work within a character set
|
135
|
-
set_escape = backslash | brackets | escaped_ascii |
|
129
|
+
set_escape = backslash | brackets | escaped_ascii |
|
130
|
+
octal_sequence | property_char |
|
136
131
|
sequence_char | single_codepoint_char_type;
|
137
132
|
|
138
133
|
|
@@ -168,8 +163,8 @@
|
|
168
163
|
};
|
169
164
|
|
170
165
|
'-]' @set_closed { # special case, emits two tokens
|
171
|
-
emit(:literal, :literal,
|
172
|
-
emit(:set, :close,
|
166
|
+
emit(:literal, :literal, '-')
|
167
|
+
emit(:set, :close, ']')
|
173
168
|
if in_set?
|
174
169
|
fret;
|
175
170
|
else
|
@@ -183,28 +178,27 @@
|
|
183
178
|
};
|
184
179
|
|
185
180
|
'^' {
|
186
|
-
|
187
|
-
|
188
|
-
emit(:set, :negate, text)
|
181
|
+
if prev_token[1] == :open
|
182
|
+
emit(:set, :negate, '^')
|
189
183
|
else
|
190
|
-
emit(:literal, :literal,
|
184
|
+
emit(:literal, :literal, '^')
|
191
185
|
end
|
192
186
|
};
|
193
187
|
|
194
188
|
'-' {
|
195
|
-
|
196
|
-
#
|
197
|
-
if
|
198
|
-
emit(:literal, :literal,
|
189
|
+
# ranges cant start with the opening bracket, a subset, or
|
190
|
+
# intersection/negation/range operators
|
191
|
+
if prev_token[0] == :set
|
192
|
+
emit(:literal, :literal, '-')
|
199
193
|
else
|
200
|
-
emit(:set, :range,
|
194
|
+
emit(:set, :range, '-')
|
201
195
|
end
|
202
196
|
};
|
203
197
|
|
204
198
|
# Unlike ranges, intersections can start or end at set boundaries, whereupon
|
205
199
|
# they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
|
206
200
|
'&&' {
|
207
|
-
emit(:set, :intersection,
|
201
|
+
emit(:set, :intersection, '&&')
|
208
202
|
};
|
209
203
|
|
210
204
|
backslash {
|
@@ -212,7 +206,7 @@
|
|
212
206
|
};
|
213
207
|
|
214
208
|
set_open >(open_bracket, 1) >set_opened {
|
215
|
-
emit(:set, :open,
|
209
|
+
emit(:set, :open, '[')
|
216
210
|
fcall character_set;
|
217
211
|
};
|
218
212
|
|
@@ -254,12 +248,22 @@
|
|
254
248
|
# set escapes scanner
|
255
249
|
# --------------------------------------------------------------------------
|
256
250
|
set_escape_sequence := |*
|
251
|
+
# Special case: in sets, octal sequences have higher priority than backrefs
|
252
|
+
octal_sequence {
|
253
|
+
emit(:escape, :octal, copy(data, ts-1, te))
|
254
|
+
fret;
|
255
|
+
};
|
256
|
+
|
257
|
+
# Scan all other escapes that work in sets with the generic escape scanner
|
257
258
|
set_escape > (escaped_set_alpha, 2) {
|
258
259
|
fhold;
|
259
260
|
fnext character_set;
|
260
261
|
fcall escape_sequence;
|
261
262
|
};
|
262
263
|
|
264
|
+
# Treat all remaining escapes - those not supported in sets - as literal.
|
265
|
+
# (This currently includes \^, \-, \&, \:, although these could potentially
|
266
|
+
# be meta chars when not escaped, depending on their position in the set.)
|
263
267
|
any > (escaped_set_alpha, 1) {
|
264
268
|
emit(:escape, :literal, copy(data, ts-1, te))
|
265
269
|
fret;
|
@@ -528,7 +532,7 @@
|
|
528
532
|
group_close @group_closed {
|
529
533
|
if conditional_stack.last == group_depth + 1
|
530
534
|
conditional_stack.pop
|
531
|
-
emit(:conditional, :close,
|
535
|
+
emit(:conditional, :close, ')')
|
532
536
|
else
|
533
537
|
if spacing_stack.length > 1 &&
|
534
538
|
spacing_stack.last[:depth] == group_depth + 1
|
@@ -536,7 +540,7 @@
|
|
536
540
|
self.free_spacing = spacing_stack.last[:free_spacing]
|
537
541
|
end
|
538
542
|
|
539
|
-
emit(:group, :close,
|
543
|
+
emit(:group, :close, ')')
|
540
544
|
end
|
541
545
|
};
|
542
546
|
|
@@ -717,23 +721,24 @@ class Regexp::Scanner
|
|
717
721
|
#
|
718
722
|
# This method may raise errors if a syntax error is encountered.
|
719
723
|
# --------------------------------------------------------------------------
|
720
|
-
def self.scan(input_object, options: nil, &block)
|
721
|
-
new.scan(input_object, options: options, &block)
|
724
|
+
def self.scan(input_object, options: nil, collect_tokens: true, &block)
|
725
|
+
new.scan(input_object, options: options, collect_tokens: collect_tokens, &block)
|
722
726
|
end
|
723
727
|
|
724
|
-
def scan(input_object, options: nil, &block)
|
725
|
-
self.
|
728
|
+
def scan(input_object, options: nil, collect_tokens: true, &block)
|
729
|
+
self.collect_tokens = collect_tokens
|
730
|
+
self.literal_run = nil
|
726
731
|
stack = []
|
727
732
|
|
728
733
|
input = input_object.is_a?(Regexp) ? input_object.source : input_object
|
729
734
|
self.free_spacing = free_spacing?(input_object, options)
|
730
735
|
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
731
736
|
|
732
|
-
data = input.unpack("c*")
|
737
|
+
data = input.unpack("c*")
|
733
738
|
eof = data.length
|
734
739
|
|
735
740
|
self.tokens = []
|
736
|
-
self.block =
|
741
|
+
self.block = block
|
737
742
|
|
738
743
|
self.set_depth = 0
|
739
744
|
self.group_depth = 0
|
@@ -758,7 +763,7 @@ class Regexp::Scanner
|
|
758
763
|
"[#{set_depth}]") if in_set?
|
759
764
|
|
760
765
|
# when the entire expression is a literal run
|
761
|
-
emit_literal if
|
766
|
+
emit_literal if literal_run
|
762
767
|
|
763
768
|
tokens
|
764
769
|
end
|
@@ -785,26 +790,37 @@ class Regexp::Scanner
|
|
785
790
|
def emit(type, token, text)
|
786
791
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
787
792
|
|
788
|
-
emit_literal if
|
793
|
+
emit_literal if literal_run
|
789
794
|
|
790
795
|
# Ragel runs with byte-based indices (ts, te). These are of little value to
|
791
796
|
# end-users, so we keep track of char-based indices and emit those instead.
|
792
797
|
ts_char_pos = char_pos
|
793
798
|
te_char_pos = char_pos + text.length
|
794
799
|
|
795
|
-
|
796
|
-
block.call type, token, text, ts_char_pos, te_char_pos
|
797
|
-
end
|
800
|
+
tok = [type, token, text, ts_char_pos, te_char_pos]
|
798
801
|
|
799
|
-
|
802
|
+
self.prev_token = tok
|
800
803
|
|
801
804
|
self.char_pos = te_char_pos
|
805
|
+
|
806
|
+
if block
|
807
|
+
block.call type, token, text, ts_char_pos, te_char_pos
|
808
|
+
# TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given
|
809
|
+
tokens << tok if collect_tokens
|
810
|
+
elsif collect_tokens
|
811
|
+
tokens << tok
|
812
|
+
end
|
802
813
|
end
|
803
814
|
|
815
|
+
attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
|
816
|
+
|
804
817
|
private
|
805
818
|
|
806
|
-
attr_accessor :
|
807
|
-
:
|
819
|
+
attr_accessor :block,
|
820
|
+
:collect_tokens, :tokens, :prev_token,
|
821
|
+
:free_spacing, :spacing_stack,
|
822
|
+
:group_depth, :set_depth, :conditional_stack,
|
823
|
+
:char_pos
|
808
824
|
|
809
825
|
def free_spacing?(input_object, options)
|
810
826
|
if options && !input_object.is_a?(String)
|
@@ -834,14 +850,13 @@ class Regexp::Scanner
|
|
834
850
|
# Appends one or more characters to the literal buffer, to be emitted later
|
835
851
|
# by a call to emit_literal.
|
836
852
|
def append_literal(data, ts, te)
|
837
|
-
self.
|
838
|
-
literal << copy(data, ts, te)
|
853
|
+
(self.literal_run ||= []) << copy(data, ts, te)
|
839
854
|
end
|
840
855
|
|
841
856
|
# Emits the literal run collected by calls to the append_literal method.
|
842
857
|
def emit_literal
|
843
|
-
text =
|
844
|
-
self.
|
858
|
+
text = literal_run.join
|
859
|
+
self.literal_run = nil
|
845
860
|
emit(:literal, :literal, text)
|
846
861
|
end
|
847
862
|
|