regexp_parser 1.3.0 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +53 -1
- data/Gemfile +3 -3
- data/README.md +10 -14
- data/Rakefile +3 -4
- data/lib/regexp_parser/expression.rb +28 -53
- data/lib/regexp_parser/expression/classes/backref.rb +18 -10
- data/lib/regexp_parser/expression/classes/conditional.rb +7 -2
- data/lib/regexp_parser/expression/classes/escape.rb +0 -4
- data/lib/regexp_parser/expression/classes/group.rb +4 -2
- data/lib/regexp_parser/expression/classes/keep.rb +1 -3
- data/lib/regexp_parser/expression/methods/match.rb +13 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +172 -0
- data/lib/regexp_parser/expression/methods/options.rb +35 -0
- data/lib/regexp_parser/expression/methods/strfregexp.rb +0 -1
- data/lib/regexp_parser/expression/methods/tests.rb +6 -15
- data/lib/regexp_parser/expression/quantifier.rb +2 -2
- data/lib/regexp_parser/expression/sequence.rb +3 -6
- data/lib/regexp_parser/expression/sequence_operation.rb +2 -6
- data/lib/regexp_parser/expression/subexpression.rb +3 -5
- data/lib/regexp_parser/lexer.rb +30 -44
- data/lib/regexp_parser/parser.rb +47 -24
- data/lib/regexp_parser/scanner.rb +1159 -1329
- data/lib/regexp_parser/scanner/char_type.rl +0 -3
- data/lib/regexp_parser/scanner/properties/long.yml +34 -1
- data/lib/regexp_parser/scanner/properties/short.yml +12 -0
- data/lib/regexp_parser/scanner/scanner.rl +82 -190
- data/lib/regexp_parser/syntax/tokens.rb +2 -10
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +72 -21
- data/lib/regexp_parser/syntax/versions/2.6.0.rb +10 -0
- data/lib/regexp_parser/syntax/versions/2.6.2.rb +10 -0
- data/lib/regexp_parser/syntax/versions/2.6.3.rb +10 -0
- data/lib/regexp_parser/version.rb +1 -1
- data/regexp_parser.gemspec +3 -3
- data/spec/expression/base_spec.rb +94 -0
- data/spec/expression/clone_spec.rb +120 -0
- data/spec/expression/conditional_spec.rb +89 -0
- data/spec/expression/free_space_spec.rb +27 -0
- data/spec/expression/methods/match_length_spec.rb +154 -0
- data/spec/expression/methods/match_spec.rb +25 -0
- data/spec/expression/methods/strfregexp_spec.rb +224 -0
- data/spec/expression/methods/tests_spec.rb +99 -0
- data/spec/expression/methods/traverse_spec.rb +140 -0
- data/spec/expression/options_spec.rb +128 -0
- data/spec/expression/root_spec.rb +9 -0
- data/spec/expression/sequence_spec.rb +9 -0
- data/spec/expression/subexpression_spec.rb +50 -0
- data/spec/expression/to_h_spec.rb +26 -0
- data/spec/expression/to_s_spec.rb +100 -0
- data/spec/lexer/all_spec.rb +22 -0
- data/spec/lexer/conditionals_spec.rb +53 -0
- data/spec/lexer/escapes_spec.rb +14 -0
- data/spec/lexer/keep_spec.rb +10 -0
- data/spec/lexer/literals_spec.rb +89 -0
- data/spec/lexer/nesting_spec.rb +99 -0
- data/spec/lexer/refcalls_spec.rb +55 -0
- data/spec/parser/all_spec.rb +43 -0
- data/spec/parser/alternation_spec.rb +88 -0
- data/spec/parser/anchors_spec.rb +17 -0
- data/spec/parser/conditionals_spec.rb +179 -0
- data/spec/parser/errors_spec.rb +30 -0
- data/spec/parser/escapes_spec.rb +121 -0
- data/spec/parser/free_space_spec.rb +130 -0
- data/spec/parser/groups_spec.rb +108 -0
- data/spec/parser/keep_spec.rb +6 -0
- data/spec/parser/posix_classes_spec.rb +8 -0
- data/spec/parser/properties_spec.rb +115 -0
- data/spec/parser/quantifiers_spec.rb +51 -0
- data/spec/parser/refcalls_spec.rb +112 -0
- data/spec/parser/set/intersections_spec.rb +127 -0
- data/spec/parser/set/ranges_spec.rb +111 -0
- data/spec/parser/sets_spec.rb +178 -0
- data/spec/parser/types_spec.rb +18 -0
- data/spec/scanner/all_spec.rb +18 -0
- data/spec/scanner/anchors_spec.rb +21 -0
- data/spec/scanner/conditionals_spec.rb +128 -0
- data/spec/scanner/errors_spec.rb +68 -0
- data/spec/scanner/escapes_spec.rb +53 -0
- data/spec/scanner/free_space_spec.rb +133 -0
- data/spec/scanner/groups_spec.rb +52 -0
- data/spec/scanner/keep_spec.rb +10 -0
- data/spec/scanner/literals_spec.rb +49 -0
- data/spec/scanner/meta_spec.rb +18 -0
- data/spec/scanner/properties_spec.rb +64 -0
- data/spec/scanner/quantifiers_spec.rb +20 -0
- data/spec/scanner/refcalls_spec.rb +36 -0
- data/spec/scanner/sets_spec.rb +102 -0
- data/spec/scanner/types_spec.rb +14 -0
- data/spec/spec_helper.rb +15 -0
- data/{test → spec}/support/runner.rb +9 -8
- data/spec/support/shared_examples.rb +77 -0
- data/{test → spec}/support/warning_extractor.rb +5 -7
- data/spec/syntax/syntax_spec.rb +48 -0
- data/spec/syntax/syntax_token_map_spec.rb +23 -0
- data/spec/syntax/versions/1.8.6_spec.rb +17 -0
- data/spec/syntax/versions/1.9.1_spec.rb +10 -0
- data/spec/syntax/versions/1.9.3_spec.rb +9 -0
- data/spec/syntax/versions/2.0.0_spec.rb +13 -0
- data/spec/syntax/versions/2.2.0_spec.rb +9 -0
- data/spec/syntax/versions/aliases_spec.rb +37 -0
- data/spec/token/token_spec.rb +85 -0
- metadata +144 -143
- data/test/expression/test_all.rb +0 -12
- data/test/expression/test_base.rb +0 -90
- data/test/expression/test_clone.rb +0 -89
- data/test/expression/test_conditionals.rb +0 -113
- data/test/expression/test_free_space.rb +0 -35
- data/test/expression/test_set.rb +0 -84
- data/test/expression/test_strfregexp.rb +0 -230
- data/test/expression/test_subexpression.rb +0 -58
- data/test/expression/test_tests.rb +0 -99
- data/test/expression/test_to_h.rb +0 -59
- data/test/expression/test_to_s.rb +0 -104
- data/test/expression/test_traverse.rb +0 -161
- data/test/helpers.rb +0 -10
- data/test/lexer/test_all.rb +0 -41
- data/test/lexer/test_conditionals.rb +0 -127
- data/test/lexer/test_keep.rb +0 -24
- data/test/lexer/test_literals.rb +0 -130
- data/test/lexer/test_nesting.rb +0 -132
- data/test/lexer/test_refcalls.rb +0 -56
- data/test/parser/set/test_intersections.rb +0 -127
- data/test/parser/set/test_ranges.rb +0 -111
- data/test/parser/test_all.rb +0 -64
- data/test/parser/test_alternation.rb +0 -92
- data/test/parser/test_anchors.rb +0 -34
- data/test/parser/test_conditionals.rb +0 -187
- data/test/parser/test_errors.rb +0 -63
- data/test/parser/test_escapes.rb +0 -134
- data/test/parser/test_free_space.rb +0 -139
- data/test/parser/test_groups.rb +0 -289
- data/test/parser/test_keep.rb +0 -21
- data/test/parser/test_posix_classes.rb +0 -27
- data/test/parser/test_properties.rb +0 -133
- data/test/parser/test_quantifiers.rb +0 -301
- data/test/parser/test_refcalls.rb +0 -186
- data/test/parser/test_sets.rb +0 -179
- data/test/parser/test_types.rb +0 -50
- data/test/scanner/test_all.rb +0 -38
- data/test/scanner/test_anchors.rb +0 -38
- data/test/scanner/test_conditionals.rb +0 -184
- data/test/scanner/test_errors.rb +0 -91
- data/test/scanner/test_escapes.rb +0 -56
- data/test/scanner/test_free_space.rb +0 -200
- data/test/scanner/test_groups.rb +0 -79
- data/test/scanner/test_keep.rb +0 -35
- data/test/scanner/test_literals.rb +0 -89
- data/test/scanner/test_meta.rb +0 -40
- data/test/scanner/test_properties.rb +0 -312
- data/test/scanner/test_quantifiers.rb +0 -37
- data/test/scanner/test_refcalls.rb +0 -52
- data/test/scanner/test_scripts.rb +0 -53
- data/test/scanner/test_sets.rb +0 -119
- data/test/scanner/test_types.rb +0 -35
- data/test/scanner/test_unicode_blocks.rb +0 -30
- data/test/support/disable_autotest.rb +0 -8
- data/test/syntax/test_all.rb +0 -6
- data/test/syntax/test_syntax.rb +0 -61
- data/test/syntax/test_syntax_token_map.rb +0 -25
- data/test/syntax/versions/test_1.8.rb +0 -55
- data/test/syntax/versions/test_1.9.1.rb +0 -36
- data/test/syntax/versions/test_1.9.3.rb +0 -32
- data/test/syntax/versions/test_2.0.0.rb +0 -37
- data/test/syntax/versions/test_2.2.0.rb +0 -32
- data/test/syntax/versions/test_aliases.rb +0 -129
- data/test/syntax/versions/test_all.rb +0 -5
- data/test/test_all.rb +0 -5
- data/test/token/test_all.rb +0 -2
- data/test/token/test_token.rb +0 -107
@@ -0,0 +1,172 @@
|
|
1
|
+
class Regexp::MatchLength
|
2
|
+
include Enumerable
|
3
|
+
|
4
|
+
def self.of(obj)
|
5
|
+
exp = obj.is_a?(Regexp::Expression::Base) ? obj : Regexp::Parser.parse(obj)
|
6
|
+
exp.match_length
|
7
|
+
end
|
8
|
+
|
9
|
+
def initialize(exp, opts = {})
|
10
|
+
self.exp_class = exp.class
|
11
|
+
self.min_rep = exp.repetitions.min
|
12
|
+
self.max_rep = exp.repetitions.max
|
13
|
+
if base = opts[:base]
|
14
|
+
self.base_min = base
|
15
|
+
self.base_max = base
|
16
|
+
self.reify = ->{ '.' * base }
|
17
|
+
else
|
18
|
+
self.base_min = opts.fetch(:base_min)
|
19
|
+
self.base_max = opts.fetch(:base_max)
|
20
|
+
self.reify = opts.fetch(:reify)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
def each(opts = {})
|
25
|
+
return enum_for(__method__) unless block_given?
|
26
|
+
limit = opts[:limit] || 1000
|
27
|
+
yielded = 0
|
28
|
+
(min..max).each do |num|
|
29
|
+
next unless include?(num)
|
30
|
+
yield(num)
|
31
|
+
break if (yielded += 1) >= limit
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
def endless_each(&block)
|
36
|
+
return enum_for(__method__) unless block_given?
|
37
|
+
(min..max).each { |num| yield(num) if include?(num) }
|
38
|
+
end
|
39
|
+
|
40
|
+
def include?(length)
|
41
|
+
test_regexp.match?('X' * length)
|
42
|
+
end
|
43
|
+
|
44
|
+
def fixed?
|
45
|
+
min == max
|
46
|
+
end
|
47
|
+
|
48
|
+
def min
|
49
|
+
min_rep * base_min
|
50
|
+
end
|
51
|
+
|
52
|
+
def max
|
53
|
+
max_rep * base_max
|
54
|
+
end
|
55
|
+
|
56
|
+
def minmax
|
57
|
+
[min, max]
|
58
|
+
end
|
59
|
+
|
60
|
+
def inspect
|
61
|
+
type = exp_class.name.sub('Regexp::Expression::', '')
|
62
|
+
"#<#{self.class}<#{type}> min=#{min} max=#{max}>"
|
63
|
+
end
|
64
|
+
|
65
|
+
def to_re
|
66
|
+
"(?:#{reify.call}){#{min_rep},#{max_rep unless max_rep == Float::INFINITY}}"
|
67
|
+
end
|
68
|
+
|
69
|
+
private
|
70
|
+
|
71
|
+
attr_accessor :base_min, :base_max, :min_rep, :max_rep, :exp_class, :reify
|
72
|
+
|
73
|
+
def test_regexp
|
74
|
+
@test_regexp ||= Regexp.new("^#{to_re}$").tap do |regexp|
|
75
|
+
regexp.respond_to?(:match?) || def regexp.match?(str); !!match(str) end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
module Regexp::Expression
|
81
|
+
MatchLength = Regexp::MatchLength
|
82
|
+
|
83
|
+
[
|
84
|
+
CharacterSet,
|
85
|
+
CharacterSet::Intersection,
|
86
|
+
CharacterSet::IntersectedSequence,
|
87
|
+
CharacterSet::Range,
|
88
|
+
CharacterType::Base,
|
89
|
+
EscapeSequence::Base,
|
90
|
+
PosixClass,
|
91
|
+
UnicodeProperty::Base,
|
92
|
+
].each do |klass|
|
93
|
+
klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1
|
94
|
+
def match_length
|
95
|
+
MatchLength.new(self, base: 1)
|
96
|
+
end
|
97
|
+
RUBY
|
98
|
+
end
|
99
|
+
|
100
|
+
class Literal
|
101
|
+
def match_length
|
102
|
+
MatchLength.new(self, base: text.length)
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
class Subexpression
|
107
|
+
def match_length
|
108
|
+
MatchLength.new(self,
|
109
|
+
base_min: map { |exp| exp.match_length.min }.inject(0, :+),
|
110
|
+
base_max: map { |exp| exp.match_length.max }.inject(0, :+),
|
111
|
+
reify: ->{ map { |exp| exp.match_length.to_re }.join })
|
112
|
+
end
|
113
|
+
|
114
|
+
def inner_match_length
|
115
|
+
dummy = Regexp::Expression::Root.build
|
116
|
+
dummy.expressions = expressions.map(&:clone)
|
117
|
+
dummy.quantifier = quantifier && quantifier.clone
|
118
|
+
dummy.match_length
|
119
|
+
end
|
120
|
+
end
|
121
|
+
|
122
|
+
[
|
123
|
+
Alternation,
|
124
|
+
Conditional::Expression,
|
125
|
+
].each do |klass|
|
126
|
+
klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1
|
127
|
+
def match_length
|
128
|
+
MatchLength.new(self,
|
129
|
+
base_min: map { |exp| exp.match_length.min }.min,
|
130
|
+
base_max: map { |exp| exp.match_length.max }.max,
|
131
|
+
reify: ->{ map { |exp| exp.match_length.to_re }.join('|') })
|
132
|
+
end
|
133
|
+
RUBY
|
134
|
+
end
|
135
|
+
|
136
|
+
[
|
137
|
+
Anchor::Base,
|
138
|
+
Assertion::Base,
|
139
|
+
Conditional::Condition,
|
140
|
+
FreeSpace,
|
141
|
+
Keep::Mark,
|
142
|
+
].each do |klass|
|
143
|
+
klass.class_eval <<-RUBY, __FILE__, __LINE__ + 1
|
144
|
+
def match_length
|
145
|
+
MatchLength.new(self, base: 0)
|
146
|
+
end
|
147
|
+
RUBY
|
148
|
+
end
|
149
|
+
|
150
|
+
class Backreference::Base
|
151
|
+
def match_length
|
152
|
+
if referenced_expression.nil?
|
153
|
+
raise ArgumentError, 'Missing referenced_expression - not parsed?'
|
154
|
+
end
|
155
|
+
referenced_expression.unquantified_clone.match_length
|
156
|
+
end
|
157
|
+
end
|
158
|
+
|
159
|
+
class EscapeSequence::CodepointList
|
160
|
+
def match_length
|
161
|
+
MatchLength.new(self, base: codepoints.count)
|
162
|
+
end
|
163
|
+
end
|
164
|
+
|
165
|
+
# Special case. Absence group can match 0.. chars, irrespective of content.
|
166
|
+
# TODO: in theory, they *can* exclude match lengths with `.`: `(?~.{3})`
|
167
|
+
class Group::Absence
|
168
|
+
def match_length
|
169
|
+
MatchLength.new(self, base_min: 0, base_max: Float::INFINITY, reify: ->{ '.*' })
|
170
|
+
end
|
171
|
+
end
|
172
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Regexp::Expression
|
2
|
+
class Base
|
3
|
+
def multiline?
|
4
|
+
options[:m] == true
|
5
|
+
end
|
6
|
+
alias :m? :multiline?
|
7
|
+
|
8
|
+
def case_insensitive?
|
9
|
+
options[:i] == true
|
10
|
+
end
|
11
|
+
alias :i? :case_insensitive?
|
12
|
+
alias :ignore_case? :case_insensitive?
|
13
|
+
|
14
|
+
def free_spacing?
|
15
|
+
options[:x] == true
|
16
|
+
end
|
17
|
+
alias :x? :free_spacing?
|
18
|
+
alias :extended? :free_spacing?
|
19
|
+
|
20
|
+
def default_classes?
|
21
|
+
options[:d] == true
|
22
|
+
end
|
23
|
+
alias :d? :default_classes?
|
24
|
+
|
25
|
+
def ascii_classes?
|
26
|
+
options[:a] == true
|
27
|
+
end
|
28
|
+
alias :a? :ascii_classes?
|
29
|
+
|
30
|
+
def unicode_classes?
|
31
|
+
options[:u] == true
|
32
|
+
end
|
33
|
+
alias :u? :unicode_classes?
|
34
|
+
end
|
35
|
+
end
|
@@ -75,32 +75,23 @@ module Regexp::Expression
|
|
75
75
|
def one_of?(scope, top = true)
|
76
76
|
case scope
|
77
77
|
when Array
|
78
|
-
|
79
|
-
return (scope.include?(token) or scope.include?(:*))
|
80
|
-
else
|
81
|
-
return scope.include?(token)
|
82
|
-
end
|
78
|
+
scope.include?(:*) || scope.include?(token)
|
83
79
|
|
84
80
|
when Hash
|
85
81
|
if scope.has_key?(:*)
|
86
82
|
test_type = scope.has_key?(type) ? type : :*
|
87
|
-
|
83
|
+
one_of?(scope[test_type], false)
|
88
84
|
else
|
89
|
-
|
85
|
+
scope.has_key?(type) && one_of?(scope[type], false)
|
90
86
|
end
|
91
87
|
|
92
88
|
when Symbol
|
93
|
-
|
94
|
-
|
95
|
-
return is?(scope) unless top
|
96
|
-
return type?(scope) if top
|
89
|
+
scope.equal?(:*) || (top ? type?(scope) : is?(scope))
|
97
90
|
|
98
91
|
else
|
99
|
-
raise
|
92
|
+
raise ArgumentError,
|
93
|
+
"Array, Hash, or Symbol expected, #{scope.class.name} given"
|
100
94
|
end
|
101
|
-
|
102
|
-
false
|
103
95
|
end
|
104
|
-
|
105
96
|
end
|
106
97
|
end
|
@@ -18,13 +18,14 @@ module Regexp::Expression
|
|
18
18
|
end
|
19
19
|
|
20
20
|
class << self
|
21
|
-
def add_to(subexpression,
|
21
|
+
def add_to(subexpression, params = {}, active_opts = {})
|
22
22
|
sequence = at_levels(
|
23
23
|
subexpression.level,
|
24
24
|
subexpression.set_level,
|
25
|
-
|
25
|
+
params[:conditional_level] || subexpression.conditional_level
|
26
26
|
)
|
27
27
|
sequence.nesting_level = subexpression.nesting_level + 1
|
28
|
+
sequence.options = active_opts
|
28
29
|
subexpression.expressions << sequence
|
29
30
|
sequence
|
30
31
|
end
|
@@ -44,10 +45,6 @@ module Regexp::Expression
|
|
44
45
|
end
|
45
46
|
end
|
46
47
|
|
47
|
-
def text
|
48
|
-
to_s
|
49
|
-
end
|
50
|
-
|
51
48
|
def starts_at
|
52
49
|
expressions.first.starts_at
|
53
50
|
end
|
@@ -14,12 +14,8 @@ module Regexp::Expression
|
|
14
14
|
expressions.last << exp
|
15
15
|
end
|
16
16
|
|
17
|
-
def add_sequence
|
18
|
-
self.class::OPERAND.add_to(self)
|
19
|
-
end
|
20
|
-
|
21
|
-
def quantify(token, text, min = nil, max = nil, mode = :greedy)
|
22
|
-
sequences.last.last.quantify(token, text, min, max, mode)
|
17
|
+
def add_sequence(active_opts = {})
|
18
|
+
self.class::OPERAND.add_to(self, {}, active_opts)
|
23
19
|
end
|
24
20
|
|
25
21
|
def to_s(format = :full)
|
@@ -12,8 +12,8 @@ module Regexp::Expression
|
|
12
12
|
end
|
13
13
|
|
14
14
|
# Override base method to clone the expressions as well.
|
15
|
-
def initialize_clone(
|
16
|
-
|
15
|
+
def initialize_clone(orig)
|
16
|
+
self.expressions = orig.expressions.map(&:clone)
|
17
17
|
super
|
18
18
|
end
|
19
19
|
|
@@ -46,9 +46,7 @@ module Regexp::Expression
|
|
46
46
|
|
47
47
|
def to_s(format = :full)
|
48
48
|
# Note: the format does not get passed down to subexpressions.
|
49
|
-
#
|
50
|
-
# in Expression::Sequence, causing infinite recursion. Clean-up needed.
|
51
|
-
"#{@text}#{expressions.join}#{quantifier_affix(format)}"
|
49
|
+
"#{expressions.join}#{quantifier_affix(format)}"
|
52
50
|
end
|
53
51
|
|
54
52
|
def to_h
|
data/lib/regexp_parser/lexer.rb
CHANGED
@@ -22,6 +22,7 @@ class Regexp::Lexer
|
|
22
22
|
self.nesting = 0
|
23
23
|
self.set_nesting = 0
|
24
24
|
self.conditional_nesting = 0
|
25
|
+
self.shift = 0
|
25
26
|
|
26
27
|
last = nil
|
27
28
|
Regexp::Scanner.scan(input) do |type, token, text, ts, te|
|
@@ -30,15 +31,13 @@ class Regexp::Lexer
|
|
30
31
|
|
31
32
|
ascend(type, token)
|
32
33
|
|
33
|
-
|
34
|
-
last
|
35
|
-
|
36
|
-
|
37
|
-
nesting, set_nesting, conditional_nesting)
|
34
|
+
if type == :quantifier and last
|
35
|
+
break_literal(last) if last.type == :literal
|
36
|
+
break_codepoint_list(last) if last.token == :codepoint_list
|
37
|
+
end
|
38
38
|
|
39
|
-
current =
|
40
|
-
|
41
|
-
last and last.type == :literal
|
39
|
+
current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
|
40
|
+
nesting, set_nesting, conditional_nesting)
|
42
41
|
|
43
42
|
current = merge_condition(current) if type == :conditional and
|
44
43
|
[:condition, :condition_close].include?(token)
|
@@ -65,7 +64,7 @@ class Regexp::Lexer
|
|
65
64
|
|
66
65
|
private
|
67
66
|
|
68
|
-
attr_accessor :tokens, :nesting, :set_nesting, :conditional_nesting
|
67
|
+
attr_accessor :tokens, :nesting, :set_nesting, :conditional_nesting, :shift
|
69
68
|
|
70
69
|
def ascend(type, token)
|
71
70
|
case type
|
@@ -92,44 +91,31 @@ class Regexp::Lexer
|
|
92
91
|
# called by scan to break a literal run that is longer than one character
|
93
92
|
# into two separate tokens when it is followed by a quantifier
|
94
93
|
def break_literal(token)
|
95
|
-
|
96
|
-
if
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
last_length = last.length
|
106
|
-
end
|
107
|
-
|
108
|
-
tokens.pop
|
109
|
-
tokens << Regexp::Token.new(:literal, :literal, lead, token.ts,
|
110
|
-
(token.te - last_length), nesting, set_nesting, conditional_nesting)
|
111
|
-
|
112
|
-
tokens << Regexp::Token.new(:literal, :literal, last,
|
113
|
-
(token.ts + lead_length),
|
114
|
-
token.te, nesting, set_nesting, conditional_nesting)
|
115
|
-
end
|
94
|
+
lead, last, _ = token.text.partition(/.\z/mu)
|
95
|
+
return if lead.empty?
|
96
|
+
|
97
|
+
tokens.pop
|
98
|
+
tokens << Regexp::Token.new(:literal, :literal, lead,
|
99
|
+
token.ts, (token.te - last.bytesize),
|
100
|
+
nesting, set_nesting, conditional_nesting)
|
101
|
+
tokens << Regexp::Token.new(:literal, :literal, last,
|
102
|
+
(token.ts + lead.bytesize), token.te,
|
103
|
+
nesting, set_nesting, conditional_nesting)
|
116
104
|
end
|
117
105
|
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
106
|
+
def break_codepoint_list(token)
|
107
|
+
lead, _, tail = token.text.rpartition(' ')
|
108
|
+
return if lead.empty?
|
109
|
+
|
110
|
+
tokens.pop
|
111
|
+
tokens << Regexp::Token.new(:escape, :codepoint_list, lead + '}',
|
112
|
+
token.ts, (token.te - tail.length),
|
113
|
+
nesting, set_nesting, conditional_nesting)
|
114
|
+
tokens << Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
|
115
|
+
(token.ts + lead.length + 1), (token.te + 3),
|
116
|
+
nesting, set_nesting, conditional_nesting)
|
122
117
|
|
123
|
-
|
124
|
-
:literal,
|
125
|
-
:literal,
|
126
|
-
last.text + current.text,
|
127
|
-
last.ts,
|
128
|
-
current.te,
|
129
|
-
nesting,
|
130
|
-
set_nesting,
|
131
|
-
conditional_nesting,
|
132
|
-
)
|
118
|
+
self.shift = shift + 3 # one space less, but extra \, u, {, and }
|
133
119
|
end
|
134
120
|
|
135
121
|
def merge_condition(current)
|
data/lib/regexp_parser/parser.rb
CHANGED
@@ -39,6 +39,8 @@ class Regexp::Parser
|
|
39
39
|
parse_token(token)
|
40
40
|
end
|
41
41
|
|
42
|
+
assign_referenced_expressions
|
43
|
+
|
42
44
|
if block_given?
|
43
45
|
block.call(root)
|
44
46
|
else
|
@@ -163,14 +165,18 @@ class Regexp::Parser
|
|
163
165
|
node << Backreference::NameCall.new(token, active_opts)
|
164
166
|
when :number, :number_ref
|
165
167
|
node << Backreference::Number.new(token, active_opts)
|
166
|
-
when :number_rel_ref
|
167
|
-
node << Backreference::NumberRelative.new(token, active_opts)
|
168
168
|
when :number_recursion_ref
|
169
169
|
node << Backreference::NumberRecursionLevel.new(token, active_opts)
|
170
170
|
when :number_call
|
171
171
|
node << Backreference::NumberCall.new(token, active_opts)
|
172
|
+
when :number_rel_ref
|
173
|
+
node << Backreference::NumberRelative.new(token, active_opts).tap do |exp|
|
174
|
+
assign_effective_number(exp)
|
175
|
+
end
|
172
176
|
when :number_rel_call
|
173
|
-
node << Backreference::NumberCallRelative.new(token, active_opts)
|
177
|
+
node << Backreference::NumberCallRelative.new(token, active_opts).tap do |exp|
|
178
|
+
assign_effective_number(exp)
|
179
|
+
end
|
174
180
|
else
|
175
181
|
raise UnknownTokenError.new('Backreference', token)
|
176
182
|
end
|
@@ -209,9 +215,9 @@ class Regexp::Parser
|
|
209
215
|
nest_conditional(Conditional::Expression.new(token, active_opts))
|
210
216
|
when :condition
|
211
217
|
conditional_nesting.last.condition = Conditional::Condition.new(token, active_opts)
|
212
|
-
conditional_nesting.last.
|
218
|
+
conditional_nesting.last.add_sequence(active_opts)
|
213
219
|
when :separator
|
214
|
-
conditional_nesting.last.
|
220
|
+
conditional_nesting.last.add_sequence(active_opts)
|
215
221
|
self.node = conditional_nesting.last.branches.last
|
216
222
|
when :close
|
217
223
|
conditional_nesting.pop
|
@@ -229,7 +235,7 @@ class Regexp::Parser
|
|
229
235
|
end
|
230
236
|
|
231
237
|
def posixclass(token)
|
232
|
-
node << PosixClass.new(token)
|
238
|
+
node << PosixClass.new(token, active_opts)
|
233
239
|
end
|
234
240
|
|
235
241
|
include Regexp::Expression::UnicodeProperty
|
@@ -491,6 +497,9 @@ class Regexp::Parser
|
|
491
497
|
end
|
492
498
|
end
|
493
499
|
|
500
|
+
MOD_FLAGS = %w[i m x].map(&:to_sym)
|
501
|
+
ENC_FLAGS = %w[a d u].map(&:to_sym)
|
502
|
+
|
494
503
|
def options_group(token)
|
495
504
|
positive, negative = token.text.split('-', 2)
|
496
505
|
negative ||= ''
|
@@ -499,23 +508,23 @@ class Regexp::Parser
|
|
499
508
|
opt_changes = {}
|
500
509
|
new_active_opts = active_opts.dup
|
501
510
|
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
opt_changes[flag.to_sym] = new_active_opts[flag.to_sym] = true
|
511
|
+
MOD_FLAGS.each do |flag|
|
512
|
+
if positive.include?(flag.to_s)
|
513
|
+
opt_changes[flag] = new_active_opts[flag] = true
|
506
514
|
end
|
507
|
-
if negative.include?(flag)
|
508
|
-
opt_changes[flag
|
509
|
-
new_active_opts.delete(flag
|
515
|
+
if negative.include?(flag.to_s)
|
516
|
+
opt_changes[flag] = false
|
517
|
+
new_active_opts.delete(flag)
|
510
518
|
end
|
511
519
|
end
|
512
520
|
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
521
|
+
if (enc_flag = positive.reverse[/[adu]/])
|
522
|
+
enc_flag = enc_flag.to_sym
|
523
|
+
(ENC_FLAGS - [enc_flag]).each do |other|
|
524
|
+
opt_changes[other] = false if new_active_opts[other]
|
525
|
+
new_active_opts.delete(other)
|
526
|
+
end
|
527
|
+
opt_changes[enc_flag] = new_active_opts[enc_flag] = true
|
519
528
|
end
|
520
529
|
|
521
530
|
options_stack << new_active_opts
|
@@ -600,16 +609,14 @@ class Regexp::Parser
|
|
600
609
|
end
|
601
610
|
|
602
611
|
def sequence_operation(klass, token)
|
603
|
-
|
604
|
-
self.node = node.last
|
605
|
-
elsif !node.is_a?(klass)
|
612
|
+
unless node.is_a?(klass)
|
606
613
|
operator = klass.new(token, active_opts)
|
607
|
-
sequence = operator.add_sequence
|
614
|
+
sequence = operator.add_sequence(active_opts)
|
608
615
|
sequence.expressions = node.expressions
|
609
616
|
node.expressions = []
|
610
617
|
nest(operator)
|
611
618
|
end
|
612
|
-
node.add_sequence
|
619
|
+
node.add_sequence(active_opts)
|
613
620
|
end
|
614
621
|
|
615
622
|
def active_opts
|
@@ -627,4 +634,20 @@ class Regexp::Parser
|
|
627
634
|
def count_captured_group
|
628
635
|
captured_group_counts[node.level] += 1
|
629
636
|
end
|
637
|
+
|
638
|
+
def assign_effective_number(exp)
|
639
|
+
exp.effective_number =
|
640
|
+
exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
|
641
|
+
end
|
642
|
+
|
643
|
+
def assign_referenced_expressions
|
644
|
+
targets = {}
|
645
|
+
root.each_expression do |exp|
|
646
|
+
exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
|
647
|
+
end
|
648
|
+
root.each_expression do |exp|
|
649
|
+
exp.respond_to?(:reference) &&
|
650
|
+
exp.referenced_expression = targets[exp.reference]
|
651
|
+
end
|
652
|
+
end
|
630
653
|
end # module Regexp::Parser
|