regexp_parser 2.9.3 → 2.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/lib/regexp_parser/expression/classes/backreference.rb +1 -20
- data/lib/regexp_parser/expression/classes/conditional.rb +0 -14
- data/lib/regexp_parser/expression/classes/escape_sequence.rb +24 -95
- data/lib/regexp_parser/expression/methods/escape_sequence_char.rb +5 -0
- data/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb +74 -0
- data/lib/regexp_parser/expression/methods/referenced_expressions.rb +28 -0
- data/lib/regexp_parser/expression.rb +3 -0
- data/lib/regexp_parser/parser.rb +9 -5
- data/lib/regexp_parser/scanner/scanner.rl +34 -7
- data/lib/regexp_parser/scanner.rb +499 -470
- data/lib/regexp_parser/syntax/token/escape.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4cb66cfbf1c78a46f36cb24a7cbc9e04b0bc96aa1285fe81de79cec4bfd1c2c1
|
4
|
+
data.tar.gz: f650a1b30acac1298186dce0818eede9944e3b5117e794801abd0576d7b37b9e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1b87b74cafd00c2a8a3fe5a44942a005a4756974363c916c650c18e74df719920537bfaecdf21080aed339d24f1988444940d96fd66dc6af847498c04efbc033
|
7
|
+
data.tar.gz: 40e7f8357bd2ff7485c7d7105d852b6c615eaf8902b787b611953df637e246a53990acdc6e1b7f7ff2dd350edf749bd012352981b908de2bbcbee0bc59714513
|
data/LICENSE
CHANGED
data/lib/regexp_parser/expression/classes/backreference.rb
CHANGED
@@ -1,25 +1,6 @@
|
|
1
1
|
module Regexp::Expression
|
2
2
|
module Backreference
|
3
|
-
class Base < Regexp::Expression::Base
|
4
|
-
attr_accessor :referenced_expression
|
5
|
-
|
6
|
-
def initialize_copy(orig)
|
7
|
-
exp_id = [self.class, self.starts_at]
|
8
|
-
|
9
|
-
# prevent infinite recursion for recursive subexp calls
|
10
|
-
copied = @@copied ||= {}
|
11
|
-
self.referenced_expression =
|
12
|
-
if copied[exp_id]
|
13
|
-
orig.referenced_expression
|
14
|
-
else
|
15
|
-
copied[exp_id] = true
|
16
|
-
orig.referenced_expression.dup
|
17
|
-
end
|
18
|
-
copied.clear
|
19
|
-
|
20
|
-
super
|
21
|
-
end
|
22
|
-
end
|
3
|
+
class Base < Regexp::Expression::Base; end
|
23
4
|
|
24
5
|
class Number < Backreference::Base
|
25
6
|
attr_reader :number
|
data/lib/regexp_parser/expression/classes/conditional.rb
CHANGED
@@ -7,26 +7,17 @@ module Regexp::Expression
|
|
7
7
|
end
|
8
8
|
|
9
9
|
class Condition < Regexp::Expression::Base
|
10
|
-
attr_accessor :referenced_expression
|
11
|
-
|
12
10
|
# Name or number of the referenced capturing group that determines state.
|
13
11
|
# Returns a String if reference is by name, Integer if by number.
|
14
12
|
def reference
|
15
13
|
ref = text.tr("'<>()", "")
|
16
14
|
ref =~ /\D/ ? ref : Integer(ref)
|
17
15
|
end
|
18
|
-
|
19
|
-
def initialize_copy(orig)
|
20
|
-
self.referenced_expression = orig.referenced_expression.dup
|
21
|
-
super
|
22
|
-
end
|
23
16
|
end
|
24
17
|
|
25
18
|
class Branch < Regexp::Expression::Sequence; end
|
26
19
|
|
27
20
|
class Expression < Regexp::Expression::Subexpression
|
28
|
-
attr_accessor :referenced_expression
|
29
|
-
|
30
21
|
def <<(exp)
|
31
22
|
expressions.last << exp
|
32
23
|
end
|
@@ -54,11 +45,6 @@ module Regexp::Expression
|
|
54
45
|
def reference
|
55
46
|
condition.reference
|
56
47
|
end
|
57
|
-
|
58
|
-
def initialize_copy(orig)
|
59
|
-
self.referenced_expression = orig.referenced_expression.dup
|
60
|
-
super
|
61
|
-
end
|
62
48
|
end
|
63
49
|
end
|
64
50
|
end
|
data/lib/regexp_parser/expression/classes/escape_sequence.rb
CHANGED
@@ -1,100 +1,29 @@
|
|
1
1
|
module Regexp::Expression
|
2
2
|
module EscapeSequence
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
class AsciiEscape < EscapeSequence::Base; end
|
28
|
-
class Backspace < EscapeSequence::Base; end
|
29
|
-
class Bell < EscapeSequence::Base; end
|
30
|
-
class FormFeed < EscapeSequence::Base; end
|
31
|
-
class Newline < EscapeSequence::Base; end
|
32
|
-
class Return < EscapeSequence::Base; end
|
33
|
-
class Tab < EscapeSequence::Base; end
|
34
|
-
class VerticalTab < EscapeSequence::Base; end
|
35
|
-
|
36
|
-
class Hex < EscapeSequence::Base; end
|
37
|
-
class Codepoint < EscapeSequence::Base; end
|
38
|
-
|
39
|
-
class CodepointList < EscapeSequence::Base
|
40
|
-
def char
|
41
|
-
raise NoMethodError, 'CodepointList responds only to #chars'
|
42
|
-
end
|
43
|
-
|
44
|
-
def codepoint
|
45
|
-
raise NoMethodError, 'CodepointList responds only to #codepoints'
|
46
|
-
end
|
47
|
-
|
48
|
-
def chars
|
49
|
-
codepoints.map { |cp| cp.chr('utf-8') }
|
50
|
-
end
|
51
|
-
|
52
|
-
def codepoints
|
53
|
-
text.scan(/\h+/).map(&:hex)
|
54
|
-
end
|
55
|
-
end
|
56
|
-
|
57
|
-
class Octal < EscapeSequence::Base
|
58
|
-
def char
|
59
|
-
text[1..-1].to_i(8).chr('utf-8')
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
class AbstractMetaControlSequence < EscapeSequence::Base
|
64
|
-
def char
|
65
|
-
codepoint.chr('utf-8')
|
66
|
-
end
|
67
|
-
|
68
|
-
private
|
69
|
-
|
70
|
-
def control_sequence_to_s(control_sequence)
|
71
|
-
five_lsb = control_sequence.unpack('B*').first[-5..-1]
|
72
|
-
["000#{five_lsb}"].pack('B*')
|
73
|
-
end
|
74
|
-
|
75
|
-
def meta_char_to_codepoint(meta_char)
|
76
|
-
byte_value = meta_char.ord
|
77
|
-
byte_value < 128 ? byte_value + 128 : byte_value
|
78
|
-
end
|
79
|
-
end
|
80
|
-
|
81
|
-
class Control < AbstractMetaControlSequence
|
82
|
-
def codepoint
|
83
|
-
control_sequence_to_s(text).ord
|
84
|
-
end
|
85
|
-
end
|
86
|
-
|
87
|
-
class Meta < AbstractMetaControlSequence
|
88
|
-
def codepoint
|
89
|
-
meta_char_to_codepoint(text[-1])
|
90
|
-
end
|
91
|
-
end
|
92
|
-
|
93
|
-
class MetaControl < AbstractMetaControlSequence
|
94
|
-
def codepoint
|
95
|
-
meta_char_to_codepoint(control_sequence_to_s(text))
|
96
|
-
end
|
97
|
-
end
|
3
|
+
Base = Class.new(Regexp::Expression::Base)
|
4
|
+
|
5
|
+
AsciiEscape = Class.new(Base) # \e
|
6
|
+
Backspace = Class.new(Base) # \b
|
7
|
+
Bell = Class.new(Base) # \a
|
8
|
+
FormFeed = Class.new(Base) # \f
|
9
|
+
Newline = Class.new(Base) # \n
|
10
|
+
Return = Class.new(Base) # \r
|
11
|
+
Tab = Class.new(Base) # \t
|
12
|
+
VerticalTab = Class.new(Base) # \v
|
13
|
+
|
14
|
+
Literal = Class.new(Base) # e.g. \j, \@, \😀 (ineffectual escapes)
|
15
|
+
|
16
|
+
Octal = Class.new(Base) # e.g. \012
|
17
|
+
Hex = Class.new(Base) # e.g. \x0A
|
18
|
+
Codepoint = Class.new(Base) # e.g. \u000A
|
19
|
+
|
20
|
+
CodepointList = Class.new(Base) # e.g. \u{A B}
|
21
|
+
UTF8Hex = Class.new(Base) # e.g. \xE2\x82\xAC
|
22
|
+
|
23
|
+
AbstractMetaControlSequence = Class.new(Base)
|
24
|
+
Control = Class.new(AbstractMetaControlSequence) # e.g. \cB
|
25
|
+
Meta = Class.new(AbstractMetaControlSequence) # e.g. \M-Z
|
26
|
+
MetaControl = Class.new(AbstractMetaControlSequence) # e.g. \M-\cX
|
98
27
|
end
|
99
28
|
|
100
29
|
# alias for symmetry between Token::* and Expression::*
|
@@ -0,0 +1,74 @@
|
|
1
|
+
module Regexp::Expression::EscapeSequence
|
2
|
+
AsciiEscape.class_eval { def codepoint; 0x1B end }
|
3
|
+
Backspace.class_eval { def codepoint; 0x8 end }
|
4
|
+
Bell.class_eval { def codepoint; 0x7 end }
|
5
|
+
FormFeed.class_eval { def codepoint; 0xC end }
|
6
|
+
Newline.class_eval { def codepoint; 0xA end }
|
7
|
+
Return.class_eval { def codepoint; 0xD end }
|
8
|
+
Tab.class_eval { def codepoint; 0x9 end }
|
9
|
+
VerticalTab.class_eval { def codepoint; 0xB end }
|
10
|
+
|
11
|
+
Literal.class_eval { def codepoint; text[1].ord end }
|
12
|
+
|
13
|
+
Octal.class_eval { def codepoint; text[/\d+/].to_i(8) end }
|
14
|
+
|
15
|
+
Hex.class_eval { def codepoint; text[/\h+/].hex end }
|
16
|
+
Codepoint.class_eval { def codepoint; text[/\h+/].hex end }
|
17
|
+
|
18
|
+
UTF8Hex.class_eval do
|
19
|
+
def codepoint
|
20
|
+
text.scan(/\h+/).map(&:hex).pack('C*').force_encoding('utf-8').ord
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
CodepointList.class_eval do
|
25
|
+
# Maybe this should be a unique top-level expression class?
|
26
|
+
def char
|
27
|
+
raise NoMethodError, 'CodepointList responds only to #chars'
|
28
|
+
end
|
29
|
+
|
30
|
+
def codepoint
|
31
|
+
raise NoMethodError, 'CodepointList responds only to #codepoints'
|
32
|
+
end
|
33
|
+
|
34
|
+
def chars
|
35
|
+
codepoints.map { |cp| cp.chr('utf-8') }
|
36
|
+
end
|
37
|
+
|
38
|
+
def codepoints
|
39
|
+
text.scan(/\h+/).map(&:hex)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
AbstractMetaControlSequence.class_eval do
|
44
|
+
private
|
45
|
+
|
46
|
+
def control_sequence_to_s(control_sequence)
|
47
|
+
five_lsb = control_sequence.unpack('B*').first[-5..-1]
|
48
|
+
["000#{five_lsb}"].pack('B*')
|
49
|
+
end
|
50
|
+
|
51
|
+
def meta_char_to_codepoint(meta_char)
|
52
|
+
byte_value = meta_char.ord
|
53
|
+
byte_value < 128 ? byte_value + 128 : byte_value
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
Control.class_eval do
|
58
|
+
def codepoint
|
59
|
+
control_sequence_to_s(text).ord
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
Meta.class_eval do
|
64
|
+
def codepoint
|
65
|
+
meta_char_to_codepoint(text[-1])
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
MetaControl.class_eval do
|
70
|
+
def codepoint
|
71
|
+
meta_char_to_codepoint(control_sequence_to_s(text))
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Regexp::Expression
|
2
|
+
module ReferencedExpressions
|
3
|
+
attr_accessor :referenced_expressions
|
4
|
+
|
5
|
+
def referenced_expression
|
6
|
+
referenced_expressions && referenced_expressions.first
|
7
|
+
end
|
8
|
+
|
9
|
+
def initialize_copy(orig)
|
10
|
+
exp_id = [self.class, self.starts_at]
|
11
|
+
|
12
|
+
# prevent infinite recursion for recursive subexp calls
|
13
|
+
copied = self.class.instance_eval { @copied_ref_exps ||= {} }
|
14
|
+
self.referenced_expressions =
|
15
|
+
if copied[exp_id]
|
16
|
+
orig.referenced_expressions
|
17
|
+
else
|
18
|
+
copied[exp_id] = true
|
19
|
+
orig.referenced_expressions && orig.referenced_expressions.map(&:dup)
|
20
|
+
end
|
21
|
+
copied.clear
|
22
|
+
|
23
|
+
super
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
Base.include ReferencedExpressions
|
28
|
+
end
|
data/lib/regexp_parser/expression.rb
CHANGED
@@ -25,6 +25,8 @@ require_relative 'expression/classes/root'
|
|
25
25
|
require_relative 'expression/classes/unicode_property'
|
26
26
|
|
27
27
|
require_relative 'expression/methods/construct'
|
28
|
+
require_relative 'expression/methods/escape_sequence_char'
|
29
|
+
require_relative 'expression/methods/escape_sequence_codepoint'
|
28
30
|
require_relative 'expression/methods/human_name'
|
29
31
|
require_relative 'expression/methods/match'
|
30
32
|
require_relative 'expression/methods/match_length'
|
@@ -32,6 +34,7 @@ require_relative 'expression/methods/negative'
|
|
32
34
|
require_relative 'expression/methods/options'
|
33
35
|
require_relative 'expression/methods/parts'
|
34
36
|
require_relative 'expression/methods/printing'
|
37
|
+
require_relative 'expression/methods/referenced_expressions'
|
35
38
|
require_relative 'expression/methods/strfregexp'
|
36
39
|
require_relative 'expression/methods/tests'
|
37
40
|
require_relative 'expression/methods/traverse'
|
data/lib/regexp_parser/parser.rb
CHANGED
@@ -319,6 +319,7 @@ class Regexp::Parser
|
|
319
319
|
when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
|
320
320
|
when :hex; node << EscapeSequence::Hex.new(token, active_opts)
|
321
321
|
when :octal; node << EscapeSequence::Octal.new(token, active_opts)
|
322
|
+
when :utf8_hex; node << EscapeSequence::UTF8Hex.new(token, active_opts)
|
322
323
|
|
323
324
|
when :control
|
324
325
|
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
@@ -580,16 +581,19 @@ class Regexp::Parser
|
|
580
581
|
# the instance of Group::Capture that it refers to via its number.
|
581
582
|
def assign_referenced_expressions
|
582
583
|
# find all referenceable and referring expressions
|
583
|
-
targets = { 0 => root }
|
584
|
+
targets = { 0 => [root] }
|
584
585
|
referrers = []
|
585
586
|
root.each_expression do |exp|
|
586
|
-
exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
|
587
|
-
|
587
|
+
if exp.referential?
|
588
|
+
referrers << exp
|
589
|
+
elsif exp.is_a?(Group::Capture)
|
590
|
+
(targets[exp.identifier] ||= []) << exp
|
591
|
+
end
|
588
592
|
end
|
589
|
-
# assign reference expression to referring expressions
|
593
|
+
# assign referenced expressions to referring expressions
|
590
594
|
# (in a second iteration because there might be forward references)
|
591
595
|
referrers.each do |exp|
|
592
|
-
exp.referenced_expression = targets[exp.reference] ||
|
596
|
+
exp.referenced_expressions = targets[exp.reference] ||
|
593
597
|
raise(ParserError, "Invalid reference #{exp.reference} at pos #{exp.ts}")
|
594
598
|
end
|
595
599
|
end
|
data/lib/regexp_parser/scanner/scanner.rl
CHANGED
@@ -37,7 +37,8 @@
|
|
37
37
|
octal_sequence = [0-7]{1,3};
|
38
38
|
|
39
39
|
hex_sequence = 'x' . xdigit{1,2};
|
40
|
-
hex_sequence_err = 'x' . [^0-
|
40
|
+
hex_sequence_err = 'x' . [^0-9A-Fa-f];
|
41
|
+
high_hex_sequence = 'x' . [89A-Fa-f] . xdigit . ( '\\x' . [89A-Fa-f] . xdigit )*;
|
41
42
|
|
42
43
|
codepoint_single = 'u' . xdigit{4};
|
43
44
|
codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
|
@@ -210,7 +211,7 @@
|
|
210
211
|
type = :nonposixclass
|
211
212
|
end
|
212
213
|
|
213
|
-
unless self.class.posix_classes.include?(class_name)
|
214
|
+
unless POSIX_CLASSES[class_name]
|
214
215
|
raise ValidationError.for(:posix_class, text)
|
215
216
|
end
|
216
217
|
|
@@ -256,9 +257,21 @@
|
|
256
257
|
# escape sequence scanner
|
257
258
|
# --------------------------------------------------------------------------
|
258
259
|
escape_sequence := |*
|
259
|
-
[1-9] {
|
260
|
+
[1-9] . [0-9]* {
|
260
261
|
text = copy(data, ts-1, te)
|
261
|
-
|
262
|
+
|
263
|
+
# If not enough groups have been opened, there is a fallback to either an
|
264
|
+
# octal or literal interpretation for 2+ digit numerical escapes.
|
265
|
+
digits = text[1..-1]
|
266
|
+
if digits.size == 1 || digits.to_i <= self.capturing_group_count
|
267
|
+
emit(:backref, :number, text)
|
268
|
+
elsif digits =~ /\A[0-7]{2,}\z/
|
269
|
+
emit(:escape, :octal, text)
|
270
|
+
else
|
271
|
+
emit(:escape, :literal, text[0..1])
|
272
|
+
emit(:literal, :literal, text[2..-1])
|
273
|
+
end
|
274
|
+
|
262
275
|
fret;
|
263
276
|
};
|
264
277
|
|
@@ -321,6 +334,16 @@
|
|
321
334
|
fret;
|
322
335
|
};
|
323
336
|
|
337
|
+
high_hex_sequence > (escaped_alpha, 5) {
|
338
|
+
text = copy(data, ts-1, te)
|
339
|
+
if regexp_encoding == Encoding::BINARY
|
340
|
+
text.split(/(?=\\)/).each { |part| emit(:escape, :hex, part) }
|
341
|
+
else
|
342
|
+
emit(:escape, :utf8_hex, text)
|
343
|
+
end
|
344
|
+
fret;
|
345
|
+
};
|
346
|
+
|
324
347
|
hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
|
325
348
|
emit(:escape, :hex, copy(data, ts-1, te))
|
326
349
|
fret;
|
@@ -514,6 +537,7 @@
|
|
514
537
|
};
|
515
538
|
|
516
539
|
group_open @group_opened {
|
540
|
+
self.capturing_group_count += 1
|
517
541
|
text = copy(data, ts, te)
|
518
542
|
emit(:group, :capture, text)
|
519
543
|
};
|
@@ -662,6 +686,7 @@ class Regexp::Scanner
|
|
662
686
|
|
663
687
|
input = input_object.is_a?(Regexp) ? input_object.source : input_object
|
664
688
|
self.free_spacing = free_spacing?(input_object, options)
|
689
|
+
self.regexp_encoding = input_object.encoding if input_object.is_a?(::Regexp)
|
665
690
|
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
666
691
|
|
667
692
|
data = input.unpack("c*")
|
@@ -672,6 +697,7 @@ class Regexp::Scanner
|
|
672
697
|
|
673
698
|
self.set_depth = 0
|
674
699
|
self.group_depth = 0
|
700
|
+
self.capturing_group_count = 0
|
675
701
|
self.conditional_stack = []
|
676
702
|
self.char_pos = 0
|
677
703
|
|
@@ -711,10 +737,9 @@ class Regexp::Scanner
|
|
711
737
|
File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
|
712
738
|
end
|
713
739
|
|
714
|
-
|
740
|
+
POSIX_CLASSES =
|
715
741
|
%w[alnum alpha ascii blank cntrl digit graph
|
716
|
-
lower print punct space upper word xdigit]
|
717
|
-
end
|
742
|
+
lower print punct space upper word xdigit].to_h { |c| [c, true] }.freeze
|
718
743
|
|
719
744
|
# Emits an array with the details of the scanned pattern
|
720
745
|
def emit(type, token, text)
|
@@ -749,7 +774,9 @@ class Regexp::Scanner
|
|
749
774
|
attr_accessor :block,
|
750
775
|
:collect_tokens, :tokens, :prev_token,
|
751
776
|
:free_spacing, :spacing_stack,
|
777
|
+
:regexp_encoding,
|
752
778
|
:group_depth, :set_depth, :conditional_stack,
|
779
|
+
:capturing_group_count,
|
753
780
|
:char_pos
|
754
781
|
|
755
782
|
def free_spacing?(input_object, options)
|