regexp_parser 2.10.0 → 2.11.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/LICENSE +1 -1
- data/Rakefile +5 -3
- data/lib/regexp_parser/error.rb +2 -0
- data/lib/regexp_parser/expression/base.rb +2 -0
- data/lib/regexp_parser/expression/classes/alternation.rb +2 -0
- data/lib/regexp_parser/expression/classes/anchor.rb +2 -0
- data/lib/regexp_parser/expression/classes/backreference.rb +2 -0
- data/lib/regexp_parser/expression/classes/character_set/intersection.rb +2 -0
- data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -0
- data/lib/regexp_parser/expression/classes/character_set.rb +2 -0
- data/lib/regexp_parser/expression/classes/character_type.rb +2 -0
- data/lib/regexp_parser/expression/classes/conditional.rb +2 -0
- data/lib/regexp_parser/expression/classes/escape_sequence.rb +3 -0
- data/lib/regexp_parser/expression/classes/free_space.rb +2 -0
- data/lib/regexp_parser/expression/classes/group.rb +2 -0
- data/lib/regexp_parser/expression/classes/keep.rb +2 -0
- data/lib/regexp_parser/expression/classes/literal.rb +2 -0
- data/lib/regexp_parser/expression/classes/posix_class.rb +2 -0
- data/lib/regexp_parser/expression/classes/root.rb +2 -0
- data/lib/regexp_parser/expression/classes/unicode_property.rb +2 -0
- data/lib/regexp_parser/expression/methods/construct.rb +2 -0
- data/lib/regexp_parser/expression/methods/escape_sequence_char.rb +2 -0
- data/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb +8 -0
- data/lib/regexp_parser/expression/methods/human_name.rb +2 -0
- data/lib/regexp_parser/expression/methods/match.rb +2 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +2 -0
- data/lib/regexp_parser/expression/methods/negative.rb +2 -0
- data/lib/regexp_parser/expression/methods/options.rb +2 -0
- data/lib/regexp_parser/expression/methods/parts.rb +2 -0
- data/lib/regexp_parser/expression/methods/printing.rb +2 -0
- data/lib/regexp_parser/expression/methods/referenced_expressions.rb +2 -0
- data/lib/regexp_parser/expression/methods/strfregexp.rb +2 -0
- data/lib/regexp_parser/expression/methods/tests.rb +2 -0
- data/lib/regexp_parser/expression/methods/traverse.rb +2 -0
- data/lib/regexp_parser/expression/quantifier.rb +3 -1
- data/lib/regexp_parser/expression/sequence.rb +2 -0
- data/lib/regexp_parser/expression/sequence_operation.rb +2 -0
- data/lib/regexp_parser/expression/shared.rb +6 -3
- data/lib/regexp_parser/expression/subexpression.rb +2 -0
- data/lib/regexp_parser/expression.rb +2 -0
- data/lib/regexp_parser/lexer.rb +2 -0
- data/lib/regexp_parser/parser.rb +3 -0
- data/lib/regexp_parser/scanner/errors/premature_end_error.rb +2 -0
- data/lib/regexp_parser/scanner/errors/scanner_error.rb +2 -0
- data/lib/regexp_parser/scanner/errors/validation_error.rb +2 -0
- data/lib/regexp_parser/scanner/properties/long.csv +19 -0
- data/lib/regexp_parser/scanner/properties/short.csv +8 -0
- data/lib/regexp_parser/scanner/scanner.rl +43 -7
- data/lib/regexp_parser/scanner.rb +509 -471
- data/lib/regexp_parser/syntax/any.rb +2 -0
- data/lib/regexp_parser/syntax/base.rb +2 -0
- data/lib/regexp_parser/syntax/token/anchor.rb +5 -3
- data/lib/regexp_parser/syntax/token/assertion.rb +4 -2
- data/lib/regexp_parser/syntax/token/backreference.rb +8 -6
- data/lib/regexp_parser/syntax/token/character_set.rb +3 -1
- data/lib/regexp_parser/syntax/token/character_type.rb +6 -4
- data/lib/regexp_parser/syntax/token/conditional.rb +5 -3
- data/lib/regexp_parser/syntax/token/escape.rb +9 -7
- data/lib/regexp_parser/syntax/token/group.rb +8 -6
- data/lib/regexp_parser/syntax/token/keep.rb +3 -1
- data/lib/regexp_parser/syntax/token/meta.rb +4 -2
- data/lib/regexp_parser/syntax/token/posix_class.rb +4 -2
- data/lib/regexp_parser/syntax/token/quantifier.rb +8 -6
- data/lib/regexp_parser/syntax/token/unicode_property.rb +62 -47
- data/lib/regexp_parser/syntax/token/virtual.rb +5 -3
- data/lib/regexp_parser/syntax/token.rb +5 -3
- data/lib/regexp_parser/syntax/version_lookup.rb +4 -2
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -0
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +2 -0
- data/lib/regexp_parser/syntax/versions/1.9.3.rb +2 -0
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +2 -0
- data/lib/regexp_parser/syntax/versions/2.2.0.rb +2 -0
- data/lib/regexp_parser/syntax/versions/2.3.0.rb +2 -0
- data/lib/regexp_parser/syntax/versions/2.4.0.rb +2 -0
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -0
- data/lib/regexp_parser/syntax/versions/2.5.0.rb +2 -0
- data/lib/regexp_parser/syntax/versions/2.6.0.rb +2 -0
- data/lib/regexp_parser/syntax/versions/2.6.2.rb +2 -0
- data/lib/regexp_parser/syntax/versions/2.6.3.rb +2 -0
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +2 -0
- data/lib/regexp_parser/syntax/versions/3.2.0.rb +2 -0
- data/lib/regexp_parser/syntax/versions/3.5.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions.rb +2 -0
- data/lib/regexp_parser/syntax.rb +2 -0
- data/lib/regexp_parser/token.rb +2 -0
- data/lib/regexp_parser/version.rb +3 -1
- data/lib/regexp_parser.rb +2 -0
- data/regexp_parser.gemspec +2 -0
- metadata +4 -6
@@ -37,7 +37,8 @@
|
|
37
37
|
octal_sequence = [0-7]{1,3};
|
38
38
|
|
39
39
|
hex_sequence = 'x' . xdigit{1,2};
|
40
|
-
hex_sequence_err = 'x' . [^0-
|
40
|
+
hex_sequence_err = 'x' . [^0-9A-Fa-f];
|
41
|
+
high_hex_sequence = 'x' . [89A-Fa-f] . xdigit . ( '\\x' . [89A-Fa-f] . xdigit )*;
|
41
42
|
|
42
43
|
codepoint_single = 'u' . xdigit{4};
|
43
44
|
codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
|
@@ -210,7 +211,7 @@
|
|
210
211
|
type = :nonposixclass
|
211
212
|
end
|
212
213
|
|
213
|
-
unless self.class.posix_classes.include?(class_name)
|
214
|
+
unless POSIX_CLASSES[class_name]
|
214
215
|
raise ValidationError.for(:posix_class, text)
|
215
216
|
end
|
216
217
|
|
@@ -256,9 +257,21 @@
|
|
256
257
|
# escape sequence scanner
|
257
258
|
# --------------------------------------------------------------------------
|
258
259
|
escape_sequence := |*
|
259
|
-
[1-9] {
|
260
|
+
[1-9] . [0-9]* {
|
260
261
|
text = copy(data, ts-1, te)
|
261
|
-
|
262
|
+
|
263
|
+
# If not enough groups have been opened, there is a fallback to either an
|
264
|
+
# octal or literal interpretation for 2+ digit numerical escapes.
|
265
|
+
digits = text[1..-1]
|
266
|
+
if digits.size == 1 || digits.to_i <= capturing_group_count
|
267
|
+
emit(:backref, :number, text)
|
268
|
+
elsif digits =~ /\A[0-7]{2,}\z/
|
269
|
+
emit(:escape, :octal, text)
|
270
|
+
else
|
271
|
+
emit(:escape, :literal, text[0..1])
|
272
|
+
emit(:literal, :literal, text[2..-1])
|
273
|
+
end
|
274
|
+
|
262
275
|
fret;
|
263
276
|
};
|
264
277
|
|
@@ -321,6 +334,16 @@
|
|
321
334
|
fret;
|
322
335
|
};
|
323
336
|
|
337
|
+
high_hex_sequence > (escaped_alpha, 5) {
|
338
|
+
text = copy(data, ts-1, te)
|
339
|
+
if regexp_encoding == Encoding::BINARY
|
340
|
+
text.split(/(?=\\)/).each { |part| emit(:escape, :hex, part) }
|
341
|
+
else
|
342
|
+
emit(:escape, :utf8_hex, text)
|
343
|
+
end
|
344
|
+
fret;
|
345
|
+
};
|
346
|
+
|
324
347
|
hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
|
325
348
|
emit(:escape, :hex, copy(data, ts-1, te))
|
326
349
|
fret;
|
@@ -514,6 +537,7 @@
|
|
514
537
|
};
|
515
538
|
|
516
539
|
group_open @group_opened {
|
540
|
+
self.capturing_group_count = capturing_group_count + 1
|
517
541
|
text = copy(data, ts, te)
|
518
542
|
emit(:group, :capture, text)
|
519
543
|
};
|
@@ -662,6 +686,7 @@ class Regexp::Scanner
|
|
662
686
|
|
663
687
|
input = input_object.is_a?(Regexp) ? input_object.source : input_object
|
664
688
|
self.free_spacing = free_spacing?(input_object, options)
|
689
|
+
self.regexp_encoding = extract_encoding(input_object, options)
|
665
690
|
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
666
691
|
|
667
692
|
data = input.unpack("c*")
|
@@ -672,6 +697,7 @@ class Regexp::Scanner
|
|
672
697
|
|
673
698
|
self.set_depth = 0
|
674
699
|
self.group_depth = 0
|
700
|
+
self.capturing_group_count = 0
|
675
701
|
self.conditional_stack = []
|
676
702
|
self.char_pos = 0
|
677
703
|
|
@@ -711,10 +737,11 @@ class Regexp::Scanner
|
|
711
737
|
File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
|
712
738
|
end
|
713
739
|
|
714
|
-
|
740
|
+
# Use each_with_object for required_ruby_version >= 2.2, or #to_h for >= 2.6
|
741
|
+
POSIX_CLASSES =
|
715
742
|
%w[alnum alpha ascii blank cntrl digit graph
|
716
743
|
lower print punct space upper word xdigit]
|
717
|
-
|
744
|
+
.inject({}) { |o, e| o.merge(e => true) }.freeze
|
718
745
|
|
719
746
|
# Emits an array with the details of the scanned pattern
|
720
747
|
def emit(type, token, text)
|
@@ -742,16 +769,25 @@ class Regexp::Scanner
|
|
742
769
|
end
|
743
770
|
end
|
744
771
|
|
745
|
-
attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
|
772
|
+
attr_accessor :capturing_group_count, :literal_run # only public for #||= to work on ruby <= 2.5
|
746
773
|
|
747
774
|
private
|
748
775
|
|
749
776
|
attr_accessor :block,
|
750
777
|
:collect_tokens, :tokens, :prev_token,
|
751
778
|
:free_spacing, :spacing_stack,
|
779
|
+
:regexp_encoding,
|
752
780
|
:group_depth, :set_depth, :conditional_stack,
|
753
781
|
:char_pos
|
754
782
|
|
783
|
+
def extract_encoding(input_object, options)
|
784
|
+
if input_object.is_a?(::Regexp)
|
785
|
+
input_object.encoding
|
786
|
+
elsif options && (options & Regexp::NOENCODING)
|
787
|
+
Encoding::BINARY
|
788
|
+
end
|
789
|
+
end
|
790
|
+
|
755
791
|
def free_spacing?(input_object, options)
|
756
792
|
if options && !input_object.is_a?(String)
|
757
793
|
raise ArgumentError, 'options cannot be supplied unless scanning a String'
|