regexp_parser 2.6.0 → 2.9.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +5 -5
- data/LICENSE +1 -1
- data/lib/regexp_parser/expression/base.rb +0 -7
- data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
- data/lib/regexp_parser/expression/classes/backreference.rb +17 -3
- data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -7
- data/lib/regexp_parser/expression/classes/character_set.rb +4 -8
- data/lib/regexp_parser/expression/classes/conditional.rb +2 -6
- data/lib/regexp_parser/expression/classes/escape_sequence.rb +3 -1
- data/lib/regexp_parser/expression/classes/free_space.rb +3 -1
- data/lib/regexp_parser/expression/classes/group.rb +0 -22
- data/lib/regexp_parser/expression/classes/keep.rb +1 -1
- data/lib/regexp_parser/expression/classes/posix_class.rb +5 -5
- data/lib/regexp_parser/expression/classes/unicode_property.rb +11 -11
- data/lib/regexp_parser/expression/methods/construct.rb +2 -4
- data/lib/regexp_parser/expression/methods/match_length.rb +8 -4
- data/lib/regexp_parser/expression/methods/negative.rb +20 -0
- data/lib/regexp_parser/expression/methods/parts.rb +23 -0
- data/lib/regexp_parser/expression/methods/printing.rb +26 -0
- data/lib/regexp_parser/expression/methods/tests.rb +40 -3
- data/lib/regexp_parser/expression/methods/traverse.rb +35 -19
- data/lib/regexp_parser/expression/quantifier.rb +30 -17
- data/lib/regexp_parser/expression/sequence.rb +5 -10
- data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
- data/lib/regexp_parser/expression/shared.rb +37 -20
- data/lib/regexp_parser/expression/subexpression.rb +20 -15
- data/lib/regexp_parser/expression.rb +34 -31
- data/lib/regexp_parser/lexer.rb +76 -36
- data/lib/regexp_parser/parser.rb +101 -100
- data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
- data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
- data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
- data/lib/regexp_parser/scanner/properties/long.csv +29 -0
- data/lib/regexp_parser/scanner/properties/short.csv +3 -0
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +101 -172
- data/lib/regexp_parser/scanner.rb +1132 -1283
- data/lib/regexp_parser/syntax/token/backreference.rb +3 -0
- data/lib/regexp_parser/syntax/token/character_set.rb +3 -0
- data/lib/regexp_parser/syntax/token/escape.rb +3 -1
- data/lib/regexp_parser/syntax/token/meta.rb +9 -2
- data/lib/regexp_parser/syntax/token/unicode_property.rb +35 -1
- data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
- data/lib/regexp_parser/syntax/token.rb +13 -13
- data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
- data/lib/regexp_parser/syntax/versions.rb +3 -1
- data/lib/regexp_parser/syntax.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +6 -6
- data/regexp_parser.gemspec +5 -5
- metadata +14 -8
- data/CHANGELOG.md +0 -601
- data/README.md +0 -503
@@ -5,21 +5,16 @@ module Regexp::Expression
|
|
5
5
|
alias :operands :expressions
|
6
6
|
alias :operator :text
|
7
7
|
|
8
|
-
def
|
9
|
-
expressions.first.
|
8
|
+
def ts
|
9
|
+
(head = expressions.first) ? head.ts : @ts
|
10
10
|
end
|
11
|
-
alias :ts :starts_at
|
12
11
|
|
13
12
|
def <<(exp)
|
14
13
|
expressions.last << exp
|
15
14
|
end
|
16
15
|
|
17
|
-
def add_sequence(active_opts = {})
|
18
|
-
self.class::OPERAND.add_to(self,
|
19
|
-
end
|
20
|
-
|
21
|
-
def parts
|
22
|
-
intersperse(expressions, text.dup)
|
16
|
+
def add_sequence(active_opts = {}, params = { ts: 0 })
|
17
|
+
self.class::OPERAND.add_to(self, params, active_opts)
|
23
18
|
end
|
24
19
|
end
|
25
20
|
end
|
@@ -8,7 +8,8 @@ module Regexp::Expression
|
|
8
8
|
|
9
9
|
attr_accessor :type, :token, :text, :ts, :te,
|
10
10
|
:level, :set_level, :conditional_level,
|
11
|
-
:options
|
11
|
+
:options, :parent,
|
12
|
+
:custom_to_s_handling, :pre_quantifier_decorations
|
12
13
|
|
13
14
|
attr_reader :nesting_level, :quantifier
|
14
15
|
end
|
@@ -32,6 +33,10 @@ module Regexp::Expression
|
|
32
33
|
self.text = orig.text.dup if orig.text
|
33
34
|
self.options = orig.options.dup if orig.options
|
34
35
|
self.quantifier = orig.quantifier.clone if orig.quantifier
|
36
|
+
self.parent = nil # updated by Subexpression#initialize_copy
|
37
|
+
if orig.pre_quantifier_decorations
|
38
|
+
self.pre_quantifier_decorations = orig.pre_quantifier_decorations.map(&:dup)
|
39
|
+
end
|
35
40
|
super
|
36
41
|
end
|
37
42
|
|
@@ -39,35 +44,51 @@ module Regexp::Expression
|
|
39
44
|
ts
|
40
45
|
end
|
41
46
|
|
47
|
+
def ends_at(include_quantifier = true)
|
48
|
+
ts + (include_quantifier ? full_length : base_length)
|
49
|
+
end
|
50
|
+
|
42
51
|
def base_length
|
43
52
|
to_s(:base).length
|
44
53
|
end
|
45
54
|
|
46
55
|
def full_length
|
47
|
-
to_s.length
|
48
|
-
end
|
49
|
-
|
56
|
+
to_s(:original).length
|
57
|
+
end
|
58
|
+
|
59
|
+
# #to_s reproduces the original source, as an unparser would.
|
60
|
+
#
|
61
|
+
# It takes an optional format argument.
|
62
|
+
#
|
63
|
+
# Example:
|
64
|
+
#
|
65
|
+
# lit = Regexp::Parser.parse(/a +/x)[0]
|
66
|
+
#
|
67
|
+
# lit.to_s # => 'a+' # default; with quantifier
|
68
|
+
# lit.to_s(:full) # => 'a+' # default; with quantifier
|
69
|
+
# lit.to_s(:base) # => 'a' # without quantifier
|
70
|
+
# lit.to_s(:original) # => 'a +' # with quantifier AND intermittent decorations
|
71
|
+
#
|
50
72
|
def to_s(format = :full)
|
51
|
-
|
73
|
+
base = parts.each_with_object(''.dup) do |part, buff|
|
74
|
+
if part.instance_of?(String)
|
75
|
+
buff << part
|
76
|
+
elsif !part.custom_to_s_handling
|
77
|
+
buff << part.to_s(:original)
|
78
|
+
end
|
79
|
+
end
|
80
|
+
"#{base}#{pre_quantifier_decoration(format)}#{quantifier_affix(format)}"
|
52
81
|
end
|
53
82
|
alias :to_str :to_s
|
54
83
|
|
55
|
-
def
|
56
|
-
|
84
|
+
def pre_quantifier_decoration(expression_format = :original)
|
85
|
+
pre_quantifier_decorations.to_a.join if expression_format == :original
|
57
86
|
end
|
58
87
|
|
59
|
-
def quantifier_affix(expression_format)
|
88
|
+
def quantifier_affix(expression_format = :full)
|
60
89
|
quantifier.to_s if quantified? && expression_format != :base
|
61
90
|
end
|
62
91
|
|
63
|
-
def quantified?
|
64
|
-
!quantifier.nil?
|
65
|
-
end
|
66
|
-
|
67
|
-
def optional?
|
68
|
-
quantified? && quantifier.min == 0
|
69
|
-
end
|
70
|
-
|
71
92
|
def offset
|
72
93
|
[starts_at, full_length]
|
73
94
|
end
|
@@ -76,10 +97,6 @@ module Regexp::Expression
|
|
76
97
|
'@%d+%d' % offset
|
77
98
|
end
|
78
99
|
|
79
|
-
def terminal?
|
80
|
-
!respond_to?(:expressions)
|
81
|
-
end
|
82
|
-
|
83
100
|
def nesting_level=(lvl)
|
84
101
|
@nesting_level = lvl
|
85
102
|
quantifier && quantifier.nesting_level = lvl
|
@@ -11,17 +11,15 @@ module Regexp::Expression
|
|
11
11
|
|
12
12
|
# Override base method to clone the expressions as well.
|
13
13
|
def initialize_copy(orig)
|
14
|
-
self.expressions = orig.expressions.map
|
14
|
+
self.expressions = orig.expressions.map do |exp|
|
15
|
+
exp.clone.tap { |copy| copy.parent = self }
|
16
|
+
end
|
15
17
|
super
|
16
18
|
end
|
17
19
|
|
18
20
|
def <<(exp)
|
19
|
-
|
20
|
-
|
21
|
-
else
|
22
|
-
exp.nesting_level = nesting_level + 1
|
23
|
-
expressions << exp
|
24
|
-
end
|
21
|
+
exp.parent = self
|
22
|
+
expressions << exp
|
25
23
|
end
|
26
24
|
|
27
25
|
%w[[] at each empty? fetch index join last length values_at].each do |method|
|
@@ -39,11 +37,7 @@ module Regexp::Expression
|
|
39
37
|
end
|
40
38
|
|
41
39
|
def te
|
42
|
-
ts +
|
43
|
-
end
|
44
|
-
|
45
|
-
def parts
|
46
|
-
expressions
|
40
|
+
ts + base_length
|
47
41
|
end
|
48
42
|
|
49
43
|
def to_h
|
@@ -53,10 +47,21 @@ module Regexp::Expression
|
|
53
47
|
)
|
54
48
|
end
|
55
49
|
|
56
|
-
|
50
|
+
def extract_quantifier_target(quantifier_description)
|
51
|
+
pre_quantifier_decorations = []
|
52
|
+
target = expressions.reverse.find do |exp|
|
53
|
+
if exp.decorative?
|
54
|
+
exp.custom_to_s_handling = true
|
55
|
+
pre_quantifier_decorations << exp.text
|
56
|
+
next
|
57
|
+
end
|
58
|
+
exp
|
59
|
+
end
|
60
|
+
target or raise Regexp::Parser::ParserError,
|
61
|
+
"No valid target found for '#{quantifier_description}' quantifier"
|
57
62
|
|
58
|
-
|
59
|
-
|
63
|
+
target.pre_quantifier_decorations = pre_quantifier_decorations
|
64
|
+
target
|
60
65
|
end
|
61
66
|
end
|
62
67
|
end
|
@@ -1,34 +1,37 @@
|
|
1
|
-
|
1
|
+
require_relative 'error'
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
3
|
+
require_relative 'expression/shared'
|
4
|
+
require_relative 'expression/base'
|
5
|
+
require_relative 'expression/quantifier'
|
6
|
+
require_relative 'expression/subexpression'
|
7
|
+
require_relative 'expression/sequence'
|
8
|
+
require_relative 'expression/sequence_operation'
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
10
|
+
require_relative 'expression/classes/alternation'
|
11
|
+
require_relative 'expression/classes/anchor'
|
12
|
+
require_relative 'expression/classes/backreference'
|
13
|
+
require_relative 'expression/classes/character_set'
|
14
|
+
require_relative 'expression/classes/character_set/intersection'
|
15
|
+
require_relative 'expression/classes/character_set/range'
|
16
|
+
require_relative 'expression/classes/character_type'
|
17
|
+
require_relative 'expression/classes/conditional'
|
18
|
+
require_relative 'expression/classes/escape_sequence'
|
19
|
+
require_relative 'expression/classes/free_space'
|
20
|
+
require_relative 'expression/classes/group'
|
21
|
+
require_relative 'expression/classes/keep'
|
22
|
+
require_relative 'expression/classes/literal'
|
23
|
+
require_relative 'expression/classes/posix_class'
|
24
|
+
require_relative 'expression/classes/root'
|
25
|
+
require_relative 'expression/classes/unicode_property'
|
26
26
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
27
|
+
require_relative 'expression/methods/construct'
|
28
|
+
require_relative 'expression/methods/human_name'
|
29
|
+
require_relative 'expression/methods/match'
|
30
|
+
require_relative 'expression/methods/match_length'
|
31
|
+
require_relative 'expression/methods/negative'
|
32
|
+
require_relative 'expression/methods/options'
|
33
|
+
require_relative 'expression/methods/parts'
|
34
|
+
require_relative 'expression/methods/printing'
|
35
|
+
require_relative 'expression/methods/strfregexp'
|
36
|
+
require_relative 'expression/methods/tests'
|
37
|
+
require_relative 'expression/methods/traverse'
|
data/lib/regexp_parser/lexer.rb
CHANGED
@@ -6,57 +6,75 @@ class Regexp::Lexer
|
|
6
6
|
|
7
7
|
OPENING_TOKENS = %i[
|
8
8
|
capture passive lookahead nlookahead lookbehind nlookbehind
|
9
|
-
atomic options options_switch named absence
|
9
|
+
atomic options options_switch named absence open
|
10
10
|
].freeze
|
11
11
|
|
12
12
|
CLOSING_TOKENS = %i[close].freeze
|
13
13
|
|
14
14
|
CONDITION_TOKENS = %i[condition condition_close].freeze
|
15
15
|
|
16
|
-
def self.lex(input, syntax =
|
17
|
-
new.lex(input, syntax, options: options, &block)
|
16
|
+
def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
|
17
|
+
new.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block)
|
18
18
|
end
|
19
19
|
|
20
|
-
def lex(input, syntax =
|
21
|
-
syntax = Regexp::Syntax.for(syntax)
|
20
|
+
def lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
|
21
|
+
syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT
|
22
22
|
|
23
|
+
self.block = block
|
24
|
+
self.collect_tokens = collect_tokens
|
23
25
|
self.tokens = []
|
26
|
+
self.prev_token = nil
|
27
|
+
self.preprev_token = nil
|
24
28
|
self.nesting = 0
|
25
29
|
self.set_nesting = 0
|
26
30
|
self.conditional_nesting = 0
|
27
31
|
self.shift = 0
|
28
32
|
|
29
|
-
|
30
|
-
Regexp::Scanner.scan(input, options: options) do |type, token, text, ts, te|
|
33
|
+
Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te|
|
31
34
|
type, token = *syntax.normalize(type, token)
|
32
35
|
syntax.check! type, token
|
33
36
|
|
34
37
|
ascend(type, token)
|
35
38
|
|
36
|
-
if
|
37
|
-
|
38
|
-
|
39
|
+
if (last = prev_token) &&
|
40
|
+
type == :quantifier &&
|
41
|
+
(
|
42
|
+
(last.type == :literal && (parts = break_literal(last))) ||
|
43
|
+
(last.token == :codepoint_list && (parts = break_codepoint_list(last)))
|
44
|
+
)
|
45
|
+
emit(parts[0])
|
46
|
+
last = parts[1]
|
39
47
|
end
|
40
48
|
|
41
49
|
current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
|
42
50
|
nesting, set_nesting, conditional_nesting)
|
43
51
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
52
|
+
if type == :conditional && CONDITION_TOKENS.include?(token)
|
53
|
+
current = merge_condition(current, last)
|
54
|
+
elsif last
|
55
|
+
last.next = current
|
56
|
+
current.previous = last
|
57
|
+
emit(last)
|
58
|
+
end
|
49
59
|
|
50
|
-
|
51
|
-
|
60
|
+
self.preprev_token = last
|
61
|
+
self.prev_token = current
|
52
62
|
|
53
63
|
descend(type, token)
|
54
64
|
end
|
55
65
|
|
56
|
-
if
|
57
|
-
|
66
|
+
emit(prev_token) if prev_token
|
67
|
+
|
68
|
+
collect_tokens ? tokens : nil
|
69
|
+
end
|
70
|
+
|
71
|
+
def emit(token)
|
72
|
+
if block
|
73
|
+
# TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block
|
74
|
+
res = block.call(token)
|
75
|
+
tokens << res if collect_tokens
|
58
76
|
else
|
59
|
-
tokens
|
77
|
+
tokens << token
|
60
78
|
end
|
61
79
|
end
|
62
80
|
|
@@ -66,27 +84,37 @@ class Regexp::Lexer
|
|
66
84
|
|
67
85
|
private
|
68
86
|
|
69
|
-
attr_accessor :
|
87
|
+
attr_accessor :block,
|
88
|
+
:collect_tokens, :tokens, :prev_token, :preprev_token,
|
89
|
+
:nesting, :set_nesting, :conditional_nesting, :shift
|
70
90
|
|
71
91
|
def ascend(type, token)
|
92
|
+
return unless CLOSING_TOKENS.include?(token)
|
93
|
+
|
72
94
|
case type
|
73
95
|
when :group, :assertion
|
74
|
-
self.nesting = nesting - 1
|
96
|
+
self.nesting = nesting - 1
|
75
97
|
when :set
|
76
|
-
self.set_nesting = set_nesting - 1
|
98
|
+
self.set_nesting = set_nesting - 1
|
77
99
|
when :conditional
|
78
|
-
self.conditional_nesting = conditional_nesting - 1
|
100
|
+
self.conditional_nesting = conditional_nesting - 1
|
101
|
+
else
|
102
|
+
raise "unhandled nesting type #{type}"
|
79
103
|
end
|
80
104
|
end
|
81
105
|
|
82
106
|
def descend(type, token)
|
107
|
+
return unless OPENING_TOKENS.include?(token)
|
108
|
+
|
83
109
|
case type
|
84
110
|
when :group, :assertion
|
85
|
-
self.nesting = nesting + 1
|
111
|
+
self.nesting = nesting + 1
|
86
112
|
when :set
|
87
|
-
self.set_nesting = set_nesting + 1
|
113
|
+
self.set_nesting = set_nesting + 1
|
88
114
|
when :conditional
|
89
|
-
self.conditional_nesting = conditional_nesting + 1
|
115
|
+
self.conditional_nesting = conditional_nesting + 1
|
116
|
+
else
|
117
|
+
raise "unhandled nesting type #{type}"
|
90
118
|
end
|
91
119
|
end
|
92
120
|
|
@@ -96,34 +124,46 @@ class Regexp::Lexer
|
|
96
124
|
lead, last, _ = token.text.partition(/.\z/mu)
|
97
125
|
return if lead.empty?
|
98
126
|
|
99
|
-
|
100
|
-
tokens << Regexp::Token.new(:literal, :literal, lead,
|
127
|
+
token_1 = Regexp::Token.new(:literal, :literal, lead,
|
101
128
|
token.ts, (token.te - last.length),
|
102
129
|
nesting, set_nesting, conditional_nesting)
|
103
|
-
|
130
|
+
token_2 = Regexp::Token.new(:literal, :literal, last,
|
104
131
|
(token.ts + lead.length), token.te,
|
105
132
|
nesting, set_nesting, conditional_nesting)
|
133
|
+
|
134
|
+
token_1.previous = preprev_token
|
135
|
+
token_1.next = token_2
|
136
|
+
token_2.previous = token_1 # .next will be set by #lex
|
137
|
+
[token_1, token_2]
|
106
138
|
end
|
107
139
|
|
140
|
+
# if a codepoint list is followed by a quantifier, that quantifier applies
|
141
|
+
# to the last codepoint, e.g. /\u{61 62 63}{3}/ =~ 'abccc'
|
142
|
+
# c.f. #break_literal.
|
108
143
|
def break_codepoint_list(token)
|
109
144
|
lead, _, tail = token.text.rpartition(' ')
|
110
145
|
return if lead.empty?
|
111
146
|
|
112
|
-
|
113
|
-
tokens << Regexp::Token.new(:escape, :codepoint_list, lead + '}',
|
147
|
+
token_1 = Regexp::Token.new(:escape, :codepoint_list, lead + '}',
|
114
148
|
token.ts, (token.te - tail.length),
|
115
149
|
nesting, set_nesting, conditional_nesting)
|
116
|
-
|
150
|
+
token_2 = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
|
117
151
|
(token.ts + lead.length + 1), (token.te + 3),
|
118
152
|
nesting, set_nesting, conditional_nesting)
|
119
153
|
|
120
154
|
self.shift = shift + 3 # one space less, but extra \, u, {, and }
|
155
|
+
|
156
|
+
token_1.previous = preprev_token
|
157
|
+
token_1.next = token_2
|
158
|
+
token_2.previous = token_1 # .next will be set by #lex
|
159
|
+
[token_1, token_2]
|
121
160
|
end
|
122
161
|
|
123
|
-
def merge_condition(current)
|
124
|
-
|
125
|
-
Regexp::Token.new(:conditional, :condition, last.text + current.text,
|
162
|
+
def merge_condition(current, last)
|
163
|
+
token = Regexp::Token.new(:conditional, :condition, last.text + current.text,
|
126
164
|
last.ts, current.te, nesting, set_nesting, conditional_nesting)
|
165
|
+
token.previous = preprev_token # .next will be set by #lex
|
166
|
+
token
|
127
167
|
end
|
128
168
|
|
129
169
|
end # module Regexp::Lexer
|