regexp_parser 2.10.0 → 2.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/lib/regexp_parser/expression/classes/escape_sequence.rb +1 -0
- data/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb +6 -0
- data/lib/regexp_parser/parser.rb +1 -0
- data/lib/regexp_parser/scanner/scanner.rl +34 -7
- data/lib/regexp_parser/scanner.rb +499 -470
- data/lib/regexp_parser/syntax/token/escape.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4cb66cfbf1c78a46f36cb24a7cbc9e04b0bc96aa1285fe81de79cec4bfd1c2c1
|
4
|
+
data.tar.gz: f650a1b30acac1298186dce0818eede9944e3b5117e794801abd0576d7b37b9e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1b87b74cafd00c2a8a3fe5a44942a005a4756974363c916c650c18e74df719920537bfaecdf21080aed339d24f1988444940d96fd66dc6af847498c04efbc033
|
7
|
+
data.tar.gz: 40e7f8357bd2ff7485c7d7105d852b6c615eaf8902b787b611953df637e246a53990acdc6e1b7f7ff2dd350edf749bd012352981b908de2bbcbee0bc59714513
|
data/LICENSE
CHANGED
data/lib/regexp_parser/expression/classes/escape_sequence.rb
CHANGED
@@ -18,6 +18,7 @@ module Regexp::Expression
|
|
18
18
|
Codepoint = Class.new(Base) # e.g. \u000A
|
19
19
|
|
20
20
|
CodepointList = Class.new(Base) # e.g. \u{A B}
|
21
|
+
UTF8Hex = Class.new(Base) # e.g. \xE2\x82\xAC
|
21
22
|
|
22
23
|
AbstractMetaControlSequence = Class.new(Base)
|
23
24
|
Control = Class.new(AbstractMetaControlSequence) # e.g. \cB
|
data/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb
CHANGED
@@ -15,6 +15,12 @@ module Regexp::Expression::EscapeSequence
|
|
15
15
|
Hex.class_eval { def codepoint; text[/\h+/].hex end }
|
16
16
|
Codepoint.class_eval { def codepoint; text[/\h+/].hex end }
|
17
17
|
|
18
|
+
UTF8Hex.class_eval do
|
19
|
+
def codepoint
|
20
|
+
text.scan(/\h+/).map(&:hex).pack('C*').force_encoding('utf-8').ord
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
18
24
|
CodepointList.class_eval do
|
19
25
|
# Maybe this should be a unique top-level expression class?
|
20
26
|
def char
|
data/lib/regexp_parser/parser.rb
CHANGED
@@ -319,6 +319,7 @@ class Regexp::Parser
|
|
319
319
|
when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
|
320
320
|
when :hex; node << EscapeSequence::Hex.new(token, active_opts)
|
321
321
|
when :octal; node << EscapeSequence::Octal.new(token, active_opts)
|
322
|
+
when :utf8_hex; node << EscapeSequence::UTF8Hex.new(token, active_opts)
|
322
323
|
|
323
324
|
when :control
|
324
325
|
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
data/lib/regexp_parser/scanner/scanner.rl
CHANGED
@@ -37,7 +37,8 @@
|
|
37
37
|
octal_sequence = [0-7]{1,3};
|
38
38
|
|
39
39
|
hex_sequence = 'x' . xdigit{1,2};
|
40
|
-
hex_sequence_err = 'x' . [^0-
|
40
|
+
hex_sequence_err = 'x' . [^0-9A-Fa-f];
|
41
|
+
high_hex_sequence = 'x' . [89A-Fa-f] . xdigit . ( '\\x' . [89A-Fa-f] . xdigit )*;
|
41
42
|
|
42
43
|
codepoint_single = 'u' . xdigit{4};
|
43
44
|
codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
|
@@ -210,7 +211,7 @@
|
|
210
211
|
type = :nonposixclass
|
211
212
|
end
|
212
213
|
|
213
|
-
unless
|
214
|
+
unless POSIX_CLASSES[class_name]
|
214
215
|
raise ValidationError.for(:posix_class, text)
|
215
216
|
end
|
216
217
|
|
@@ -256,9 +257,21 @@
|
|
256
257
|
# escape sequence scanner
|
257
258
|
# --------------------------------------------------------------------------
|
258
259
|
escape_sequence := |*
|
259
|
-
[1-9] {
|
260
|
+
[1-9] . [0-9]* {
|
260
261
|
text = copy(data, ts-1, te)
|
261
|
-
|
262
|
+
|
263
|
+
# If not enough groups have been opened, there is a fallback to either an
|
264
|
+
# octal or literal interpretation for 2+ digit numerical escapes.
|
265
|
+
digits = text[1..-1]
|
266
|
+
if digits.size == 1 || digits.to_i <= self.capturing_group_count
|
267
|
+
emit(:backref, :number, text)
|
268
|
+
elsif digits =~ /\A[0-7]{2,}\z/
|
269
|
+
emit(:escape, :octal, text)
|
270
|
+
else
|
271
|
+
emit(:escape, :literal, text[0..1])
|
272
|
+
emit(:literal, :literal, text[2..-1])
|
273
|
+
end
|
274
|
+
|
262
275
|
fret;
|
263
276
|
};
|
264
277
|
|
@@ -321,6 +334,16 @@
|
|
321
334
|
fret;
|
322
335
|
};
|
323
336
|
|
337
|
+
high_hex_sequence > (escaped_alpha, 5) {
|
338
|
+
text = copy(data, ts-1, te)
|
339
|
+
if regexp_encoding == Encoding::BINARY
|
340
|
+
text.split(/(?=\\)/).each { |part| emit(:escape, :hex, part) }
|
341
|
+
else
|
342
|
+
emit(:escape, :utf8_hex, text)
|
343
|
+
end
|
344
|
+
fret;
|
345
|
+
};
|
346
|
+
|
324
347
|
hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
|
325
348
|
emit(:escape, :hex, copy(data, ts-1, te))
|
326
349
|
fret;
|
@@ -514,6 +537,7 @@
|
|
514
537
|
};
|
515
538
|
|
516
539
|
group_open @group_opened {
|
540
|
+
self.capturing_group_count += 1
|
517
541
|
text = copy(data, ts, te)
|
518
542
|
emit(:group, :capture, text)
|
519
543
|
};
|
@@ -662,6 +686,7 @@ class Regexp::Scanner
|
|
662
686
|
|
663
687
|
input = input_object.is_a?(Regexp) ? input_object.source : input_object
|
664
688
|
self.free_spacing = free_spacing?(input_object, options)
|
689
|
+
self.regexp_encoding = input_object.encoding if input_object.is_a?(::Regexp)
|
665
690
|
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
666
691
|
|
667
692
|
data = input.unpack("c*")
|
@@ -672,6 +697,7 @@ class Regexp::Scanner
|
|
672
697
|
|
673
698
|
self.set_depth = 0
|
674
699
|
self.group_depth = 0
|
700
|
+
self.capturing_group_count = 0
|
675
701
|
self.conditional_stack = []
|
676
702
|
self.char_pos = 0
|
677
703
|
|
@@ -711,10 +737,9 @@ class Regexp::Scanner
|
|
711
737
|
File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
|
712
738
|
end
|
713
739
|
|
714
|
-
|
740
|
+
POSIX_CLASSES =
|
715
741
|
%w[alnum alpha ascii blank cntrl digit graph
|
716
|
-
lower print punct space upper word xdigit]
|
717
|
-
end
|
742
|
+
lower print punct space upper word xdigit].to_h { |c| [c, true] }.freeze
|
718
743
|
|
719
744
|
# Emits an array with the details of the scanned pattern
|
720
745
|
def emit(type, token, text)
|
@@ -749,7 +774,9 @@ class Regexp::Scanner
|
|
749
774
|
attr_accessor :block,
|
750
775
|
:collect_tokens, :tokens, :prev_token,
|
751
776
|
:free_spacing, :spacing_stack,
|
777
|
+
:regexp_encoding,
|
752
778
|
:group_depth, :set_depth, :conditional_stack,
|
779
|
+
:capturing_group_count,
|
753
780
|
:char_pos
|
754
781
|
|
755
782
|
def free_spacing?(input_object, options)
|