regexp_parser 2.9.2 → 2.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/regexp_parser/expression/classes/backreference.rb +1 -20
- data/lib/regexp_parser/expression/classes/conditional.rb +0 -14
- data/lib/regexp_parser/expression/classes/escape_sequence.rb +18 -90
- data/lib/regexp_parser/expression/methods/escape_sequence_char.rb +5 -0
- data/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb +68 -0
- data/lib/regexp_parser/expression/methods/referenced_expressions.rb +28 -0
- data/lib/regexp_parser/expression.rb +3 -0
- data/lib/regexp_parser/parser.rb +8 -5
- data/lib/regexp_parser/scanner/scanner.rl +2 -2
- data/lib/regexp_parser/scanner.rb +725 -768
- data/lib/regexp_parser/version.rb +1 -1
- metadata +6 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f6ed5457d89738fa1076cf3875cd2d009973f02857ea68e055ef3ef74a78dc91
|
4
|
+
data.tar.gz: d67eb5f0cb37ad106574b2ae327eefcfc13c9d585cddec6661898f4d8166ebcc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6b8adbc3c4707fc4c823456ae1d7547f17568802de03008a17fef18a5f95af08b0e42d48ccdfab25a740603a58ab89c036d70cec94405701201e5a5af51ce392
|
7
|
+
data.tar.gz: 9bea98a42ab64a9b45ddc5564cd077d7eb6d2ddc293844759bb8001aa9fefd8aa26b0e03fff7a286ccde9f7aeacacda9fbb187fe04082749d3c2605e0cece7b9
|
data/lib/regexp_parser/expression/classes/backreference.rb
CHANGED
@@ -1,25 +1,6 @@
|
|
1
1
|
module Regexp::Expression
|
2
2
|
module Backreference
|
3
|
-
class Base < Regexp::Expression::Base
|
4
|
-
attr_accessor :referenced_expression
|
5
|
-
|
6
|
-
def initialize_copy(orig)
|
7
|
-
exp_id = [self.class, self.starts_at]
|
8
|
-
|
9
|
-
# prevent infinite recursion for recursive subexp calls
|
10
|
-
copied = @@copied ||= {}
|
11
|
-
self.referenced_expression =
|
12
|
-
if copied[exp_id]
|
13
|
-
orig.referenced_expression
|
14
|
-
else
|
15
|
-
copied[exp_id] = true
|
16
|
-
orig.referenced_expression.dup
|
17
|
-
end
|
18
|
-
copied.clear
|
19
|
-
|
20
|
-
super
|
21
|
-
end
|
22
|
-
end
|
3
|
+
class Base < Regexp::Expression::Base; end
|
23
4
|
|
24
5
|
class Number < Backreference::Base
|
25
6
|
attr_reader :number
|
data/lib/regexp_parser/expression/classes/conditional.rb
CHANGED
@@ -7,26 +7,17 @@ module Regexp::Expression
|
|
7
7
|
end
|
8
8
|
|
9
9
|
class Condition < Regexp::Expression::Base
|
10
|
-
attr_accessor :referenced_expression
|
11
|
-
|
12
10
|
# Name or number of the referenced capturing group that determines state.
|
13
11
|
# Returns a String if reference is by name, Integer if by number.
|
14
12
|
def reference
|
15
13
|
ref = text.tr("'<>()", "")
|
16
14
|
ref =~ /\D/ ? ref : Integer(ref)
|
17
15
|
end
|
18
|
-
|
19
|
-
def initialize_copy(orig)
|
20
|
-
self.referenced_expression = orig.referenced_expression.dup
|
21
|
-
super
|
22
|
-
end
|
23
16
|
end
|
24
17
|
|
25
18
|
class Branch < Regexp::Expression::Sequence; end
|
26
19
|
|
27
20
|
class Expression < Regexp::Expression::Subexpression
|
28
|
-
attr_accessor :referenced_expression
|
29
|
-
|
30
21
|
def <<(exp)
|
31
22
|
expressions.last << exp
|
32
23
|
end
|
@@ -54,11 +45,6 @@ module Regexp::Expression
|
|
54
45
|
def reference
|
55
46
|
condition.reference
|
56
47
|
end
|
57
|
-
|
58
|
-
def initialize_copy(orig)
|
59
|
-
self.referenced_expression = orig.referenced_expression.dup
|
60
|
-
super
|
61
|
-
end
|
62
48
|
end
|
63
49
|
end
|
64
50
|
end
|
data/lib/regexp_parser/expression/classes/escape_sequence.rb
CHANGED
@@ -1,100 +1,28 @@
|
|
1
1
|
module Regexp::Expression
|
2
2
|
module EscapeSequence
|
3
|
-
|
4
|
-
def codepoint
|
5
|
-
char.ord
|
6
|
-
end
|
3
|
+
Base = Class.new(Regexp::Expression::Base)
|
7
4
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
YAML.load(%Q(---\n"#{text}"\n))
|
17
|
-
end
|
18
|
-
end
|
19
|
-
end
|
5
|
+
AsciiEscape = Class.new(Base) # \e
|
6
|
+
Backspace = Class.new(Base) # \b
|
7
|
+
Bell = Class.new(Base) # \a
|
8
|
+
FormFeed = Class.new(Base) # \f
|
9
|
+
Newline = Class.new(Base) # \n
|
10
|
+
Return = Class.new(Base) # \r
|
11
|
+
Tab = Class.new(Base) # \t
|
12
|
+
VerticalTab = Class.new(Base) # \v
|
20
13
|
|
21
|
-
|
22
|
-
def char
|
23
|
-
text[1..-1]
|
24
|
-
end
|
25
|
-
end
|
14
|
+
Literal = Class.new(Base) # e.g. \j, \@, \😀 (ineffectual escapes)
|
26
15
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
class FormFeed < EscapeSequence::Base; end
|
31
|
-
class Newline < EscapeSequence::Base; end
|
32
|
-
class Return < EscapeSequence::Base; end
|
33
|
-
class Tab < EscapeSequence::Base; end
|
34
|
-
class VerticalTab < EscapeSequence::Base; end
|
16
|
+
Octal = Class.new(Base) # e.g. \012
|
17
|
+
Hex = Class.new(Base) # e.g. \x0A
|
18
|
+
Codepoint = Class.new(Base) # e.g. \u000A
|
35
19
|
|
36
|
-
|
37
|
-
class Codepoint < EscapeSequence::Base; end
|
20
|
+
CodepointList = Class.new(Base) # e.g. \u{A B}
|
38
21
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
def codepoint
|
45
|
-
raise NoMethodError, 'CodepointList responds only to #codepoints'
|
46
|
-
end
|
47
|
-
|
48
|
-
def chars
|
49
|
-
codepoints.map { |cp| cp.chr('utf-8') }
|
50
|
-
end
|
51
|
-
|
52
|
-
def codepoints
|
53
|
-
text.scan(/\h+/).map(&:hex)
|
54
|
-
end
|
55
|
-
end
|
56
|
-
|
57
|
-
class Octal < EscapeSequence::Base
|
58
|
-
def char
|
59
|
-
text[1..-1].to_i(8).chr('utf-8')
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
class AbstractMetaControlSequence < EscapeSequence::Base
|
64
|
-
def char
|
65
|
-
codepoint.chr('utf-8')
|
66
|
-
end
|
67
|
-
|
68
|
-
private
|
69
|
-
|
70
|
-
def control_sequence_to_s(control_sequence)
|
71
|
-
five_lsb = control_sequence.unpack('B*').first[-5..-1]
|
72
|
-
["000#{five_lsb}"].pack('B*')
|
73
|
-
end
|
74
|
-
|
75
|
-
def meta_char_to_codepoint(meta_char)
|
76
|
-
byte_value = meta_char.ord
|
77
|
-
byte_value < 128 ? byte_value + 128 : byte_value
|
78
|
-
end
|
79
|
-
end
|
80
|
-
|
81
|
-
class Control < AbstractMetaControlSequence
|
82
|
-
def codepoint
|
83
|
-
control_sequence_to_s(text).ord
|
84
|
-
end
|
85
|
-
end
|
86
|
-
|
87
|
-
class Meta < AbstractMetaControlSequence
|
88
|
-
def codepoint
|
89
|
-
meta_char_to_codepoint(text[-1])
|
90
|
-
end
|
91
|
-
end
|
92
|
-
|
93
|
-
class MetaControl < AbstractMetaControlSequence
|
94
|
-
def codepoint
|
95
|
-
meta_char_to_codepoint(control_sequence_to_s(text))
|
96
|
-
end
|
97
|
-
end
|
22
|
+
AbstractMetaControlSequence = Class.new(Base)
|
23
|
+
Control = Class.new(AbstractMetaControlSequence) # e.g. \cB
|
24
|
+
Meta = Class.new(AbstractMetaControlSequence) # e.g. \M-Z
|
25
|
+
MetaControl = Class.new(AbstractMetaControlSequence) # e.g. \M-\cX
|
98
26
|
end
|
99
27
|
|
100
28
|
# alias for symmetry between Token::* and Expression::*
|
data/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
module Regexp::Expression::EscapeSequence
|
2
|
+
AsciiEscape.class_eval { def codepoint; 0x1B end }
|
3
|
+
Backspace.class_eval { def codepoint; 0x8 end }
|
4
|
+
Bell.class_eval { def codepoint; 0x7 end }
|
5
|
+
FormFeed.class_eval { def codepoint; 0xC end }
|
6
|
+
Newline.class_eval { def codepoint; 0xA end }
|
7
|
+
Return.class_eval { def codepoint; 0xD end }
|
8
|
+
Tab.class_eval { def codepoint; 0x9 end }
|
9
|
+
VerticalTab.class_eval { def codepoint; 0xB end }
|
10
|
+
|
11
|
+
Literal.class_eval { def codepoint; text[1].ord end }
|
12
|
+
|
13
|
+
Octal.class_eval { def codepoint; text[/\d+/].to_i(8) end }
|
14
|
+
|
15
|
+
Hex.class_eval { def codepoint; text[/\h+/].hex end }
|
16
|
+
Codepoint.class_eval { def codepoint; text[/\h+/].hex end }
|
17
|
+
|
18
|
+
CodepointList.class_eval do
|
19
|
+
# Maybe this should be a unique top-level expression class?
|
20
|
+
def char
|
21
|
+
raise NoMethodError, 'CodepointList responds only to #chars'
|
22
|
+
end
|
23
|
+
|
24
|
+
def codepoint
|
25
|
+
raise NoMethodError, 'CodepointList responds only to #codepoints'
|
26
|
+
end
|
27
|
+
|
28
|
+
def chars
|
29
|
+
codepoints.map { |cp| cp.chr('utf-8') }
|
30
|
+
end
|
31
|
+
|
32
|
+
def codepoints
|
33
|
+
text.scan(/\h+/).map(&:hex)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
AbstractMetaControlSequence.class_eval do
|
38
|
+
private
|
39
|
+
|
40
|
+
def control_sequence_to_s(control_sequence)
|
41
|
+
five_lsb = control_sequence.unpack('B*').first[-5..-1]
|
42
|
+
["000#{five_lsb}"].pack('B*')
|
43
|
+
end
|
44
|
+
|
45
|
+
def meta_char_to_codepoint(meta_char)
|
46
|
+
byte_value = meta_char.ord
|
47
|
+
byte_value < 128 ? byte_value + 128 : byte_value
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
Control.class_eval do
|
52
|
+
def codepoint
|
53
|
+
control_sequence_to_s(text).ord
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
Meta.class_eval do
|
58
|
+
def codepoint
|
59
|
+
meta_char_to_codepoint(text[-1])
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
MetaControl.class_eval do
|
64
|
+
def codepoint
|
65
|
+
meta_char_to_codepoint(control_sequence_to_s(text))
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
data/lib/regexp_parser/expression/methods/referenced_expressions.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
module Regexp::Expression
|
2
|
+
module ReferencedExpressions
|
3
|
+
attr_accessor :referenced_expressions
|
4
|
+
|
5
|
+
def referenced_expression
|
6
|
+
referenced_expressions && referenced_expressions.first
|
7
|
+
end
|
8
|
+
|
9
|
+
def initialize_copy(orig)
|
10
|
+
exp_id = [self.class, self.starts_at]
|
11
|
+
|
12
|
+
# prevent infinite recursion for recursive subexp calls
|
13
|
+
copied = self.class.instance_eval { @copied_ref_exps ||= {} }
|
14
|
+
self.referenced_expressions =
|
15
|
+
if copied[exp_id]
|
16
|
+
orig.referenced_expressions
|
17
|
+
else
|
18
|
+
copied[exp_id] = true
|
19
|
+
orig.referenced_expressions && orig.referenced_expressions.map(&:dup)
|
20
|
+
end
|
21
|
+
copied.clear
|
22
|
+
|
23
|
+
super
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
Base.include ReferencedExpressions
|
28
|
+
end
|
data/lib/regexp_parser/expression.rb
CHANGED
@@ -25,6 +25,8 @@ require_relative 'expression/classes/root'
|
|
25
25
|
require_relative 'expression/classes/unicode_property'
|
26
26
|
|
27
27
|
require_relative 'expression/methods/construct'
|
28
|
+
require_relative 'expression/methods/escape_sequence_char'
|
29
|
+
require_relative 'expression/methods/escape_sequence_codepoint'
|
28
30
|
require_relative 'expression/methods/human_name'
|
29
31
|
require_relative 'expression/methods/match'
|
30
32
|
require_relative 'expression/methods/match_length'
|
@@ -32,6 +34,7 @@ require_relative 'expression/methods/negative'
|
|
32
34
|
require_relative 'expression/methods/options'
|
33
35
|
require_relative 'expression/methods/parts'
|
34
36
|
require_relative 'expression/methods/printing'
|
37
|
+
require_relative 'expression/methods/referenced_expressions'
|
35
38
|
require_relative 'expression/methods/strfregexp'
|
36
39
|
require_relative 'expression/methods/tests'
|
37
40
|
require_relative 'expression/methods/traverse'
|
data/lib/regexp_parser/parser.rb
CHANGED
@@ -580,16 +580,19 @@ class Regexp::Parser
|
|
580
580
|
# the instance of Group::Capture that it refers to via its number.
|
581
581
|
def assign_referenced_expressions
|
582
582
|
# find all referenceable and referring expressions
|
583
|
-
targets = { 0 => root }
|
583
|
+
targets = { 0 => [root] }
|
584
584
|
referrers = []
|
585
585
|
root.each_expression do |exp|
|
586
|
-
exp.
|
587
|
-
|
586
|
+
if exp.referential?
|
587
|
+
referrers << exp
|
588
|
+
elsif exp.is_a?(Group::Capture)
|
589
|
+
(targets[exp.identifier] ||= []) << exp
|
590
|
+
end
|
588
591
|
end
|
589
|
-
# assign
|
592
|
+
# assign referenced expressions to referring expressions
|
590
593
|
# (in a second iteration because there might be forward references)
|
591
594
|
referrers.each do |exp|
|
592
|
-
exp.
|
595
|
+
exp.referenced_expressions = targets[exp.reference] ||
|
593
596
|
raise(ParserError, "Invalid reference #{exp.reference} at pos #{exp.ts}")
|
594
597
|
end
|
595
598
|
end
|
data/lib/regexp_parser/scanner/scanner.rl
CHANGED
@@ -78,8 +78,8 @@
|
|
78
78
|
# try to treat every other group head as options group, like Ruby
|
79
79
|
group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
|
80
80
|
|
81
|
-
group_name_id_ab = ([
|
82
|
-
group_name_id_sq = ([^0-9\-']
|
81
|
+
group_name_id_ab = ([^!=0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
|
82
|
+
group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
|
83
83
|
group_number = '-'? . [0-9]+;
|
84
84
|
group_level = [+\-] . [0-9]+;
|
85
85
|
|