regexp_parser 1.7.0 → 2.8.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +364 -22
- data/Gemfile +8 -2
- data/LICENSE +1 -1
- data/README.md +124 -88
- data/Rakefile +6 -70
- data/lib/regexp_parser/error.rb +4 -0
- data/lib/regexp_parser/expression/base.rb +76 -0
- data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
- data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
- data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +22 -2
- data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +4 -8
- data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +3 -4
- data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
- data/lib/regexp_parser/expression/classes/conditional.rb +11 -5
- data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
- data/lib/regexp_parser/expression/classes/free_space.rb +5 -5
- data/lib/regexp_parser/expression/classes/group.rb +28 -15
- data/lib/regexp_parser/expression/classes/keep.rb +2 -0
- data/lib/regexp_parser/expression/classes/literal.rb +1 -5
- data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
- data/lib/regexp_parser/expression/classes/root.rb +4 -19
- data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +5 -3
- data/lib/regexp_parser/expression/methods/construct.rb +41 -0
- data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +11 -7
- data/lib/regexp_parser/expression/methods/parts.rb +23 -0
- data/lib/regexp_parser/expression/methods/printing.rb +26 -0
- data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
- data/lib/regexp_parser/expression/methods/tests.rb +47 -1
- data/lib/regexp_parser/expression/methods/traverse.rb +34 -18
- data/lib/regexp_parser/expression/quantifier.rb +57 -17
- data/lib/regexp_parser/expression/sequence.rb +11 -47
- data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
- data/lib/regexp_parser/expression/shared.rb +111 -0
- data/lib/regexp_parser/expression/subexpression.rb +27 -19
- data/lib/regexp_parser/expression.rb +14 -141
- data/lib/regexp_parser/lexer.rb +83 -41
- data/lib/regexp_parser/parser.rb +371 -429
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
- data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
- data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
- data/lib/regexp_parser/scanner/properties/long.csv +633 -0
- data/lib/regexp_parser/scanner/properties/short.csv +248 -0
- data/lib/regexp_parser/scanner/property.rl +4 -4
- data/lib/regexp_parser/scanner/scanner.rl +295 -368
- data/lib/regexp_parser/scanner.rb +1405 -1674
- data/lib/regexp_parser/syntax/any.rb +2 -7
- data/lib/regexp_parser/syntax/base.rb +92 -67
- data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
- data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
- data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
- data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
- data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
- data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
- data/lib/regexp_parser/syntax/token/escape.rb +33 -0
- data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
- data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
- data/lib/regexp_parser/syntax/token/meta.rb +20 -0
- data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
- data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
- data/lib/regexp_parser/syntax/token/unicode_property.rb +733 -0
- data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
- data/lib/regexp_parser/syntax/token.rb +45 -0
- data/lib/regexp_parser/syntax/version_lookup.rb +19 -36
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
- data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
- data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
- data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions.rb +3 -1
- data/lib/regexp_parser/syntax.rb +8 -6
- data/lib/regexp_parser/token.rb +9 -20
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +0 -2
- data/regexp_parser.gemspec +20 -22
- metadata +49 -166
- data/lib/regexp_parser/scanner/properties/long.yml +0 -594
- data/lib/regexp_parser/scanner/properties/short.yml +0 -237
- data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
- data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
- data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
- data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
- data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
- data/lib/regexp_parser/syntax/tokens.rb +0 -45
- data/spec/expression/base_spec.rb +0 -94
- data/spec/expression/clone_spec.rb +0 -120
- data/spec/expression/conditional_spec.rb +0 -89
- data/spec/expression/free_space_spec.rb +0 -27
- data/spec/expression/methods/match_length_spec.rb +0 -161
- data/spec/expression/methods/match_spec.rb +0 -25
- data/spec/expression/methods/strfregexp_spec.rb +0 -224
- data/spec/expression/methods/tests_spec.rb +0 -99
- data/spec/expression/methods/traverse_spec.rb +0 -161
- data/spec/expression/options_spec.rb +0 -128
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
- data/spec/expression/subexpression_spec.rb +0 -50
- data/spec/expression/to_h_spec.rb +0 -26
- data/spec/expression/to_s_spec.rb +0 -100
- data/spec/lexer/all_spec.rb +0 -22
- data/spec/lexer/conditionals_spec.rb +0 -53
- data/spec/lexer/escapes_spec.rb +0 -14
- data/spec/lexer/keep_spec.rb +0 -10
- data/spec/lexer/literals_spec.rb +0 -89
- data/spec/lexer/nesting_spec.rb +0 -99
- data/spec/lexer/refcalls_spec.rb +0 -55
- data/spec/parser/all_spec.rb +0 -43
- data/spec/parser/alternation_spec.rb +0 -88
- data/spec/parser/anchors_spec.rb +0 -17
- data/spec/parser/conditionals_spec.rb +0 -179
- data/spec/parser/errors_spec.rb +0 -30
- data/spec/parser/escapes_spec.rb +0 -121
- data/spec/parser/free_space_spec.rb +0 -130
- data/spec/parser/groups_spec.rb +0 -108
- data/spec/parser/keep_spec.rb +0 -6
- data/spec/parser/posix_classes_spec.rb +0 -8
- data/spec/parser/properties_spec.rb +0 -115
- data/spec/parser/quantifiers_spec.rb +0 -51
- data/spec/parser/refcalls_spec.rb +0 -112
- data/spec/parser/set/intersections_spec.rb +0 -127
- data/spec/parser/set/ranges_spec.rb +0 -111
- data/spec/parser/sets_spec.rb +0 -178
- data/spec/parser/types_spec.rb +0 -18
- data/spec/scanner/all_spec.rb +0 -18
- data/spec/scanner/anchors_spec.rb +0 -21
- data/spec/scanner/conditionals_spec.rb +0 -128
- data/spec/scanner/errors_spec.rb +0 -68
- data/spec/scanner/escapes_spec.rb +0 -53
- data/spec/scanner/free_space_spec.rb +0 -133
- data/spec/scanner/groups_spec.rb +0 -52
- data/spec/scanner/keep_spec.rb +0 -10
- data/spec/scanner/literals_spec.rb +0 -49
- data/spec/scanner/meta_spec.rb +0 -18
- data/spec/scanner/properties_spec.rb +0 -64
- data/spec/scanner/quantifiers_spec.rb +0 -20
- data/spec/scanner/refcalls_spec.rb +0 -36
- data/spec/scanner/sets_spec.rb +0 -102
- data/spec/scanner/types_spec.rb +0 -14
- data/spec/spec_helper.rb +0 -15
- data/spec/support/runner.rb +0 -42
- data/spec/support/shared_examples.rb +0 -77
- data/spec/support/warning_extractor.rb +0 -60
- data/spec/syntax/syntax_spec.rb +0 -48
- data/spec/syntax/syntax_token_map_spec.rb +0 -23
- data/spec/syntax/versions/1.8.6_spec.rb +0 -17
- data/spec/syntax/versions/1.9.1_spec.rb +0 -10
- data/spec/syntax/versions/1.9.3_spec.rb +0 -9
- data/spec/syntax/versions/2.0.0_spec.rb +0 -13
- data/spec/syntax/versions/2.2.0_spec.rb +0 -9
- data/spec/syntax/versions/aliases_spec.rb +0 -37
- data/spec/token/token_spec.rb +0 -85
- /data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
data/lib/regexp_parser/lexer.rb
CHANGED
@@ -4,57 +4,77 @@
|
|
4
4
|
# given syntax flavor.
|
5
5
|
class Regexp::Lexer
|
6
6
|
|
7
|
-
OPENING_TOKENS = [
|
8
|
-
|
9
|
-
|
7
|
+
OPENING_TOKENS = %i[
|
8
|
+
capture passive lookahead nlookahead lookbehind nlookbehind
|
9
|
+
atomic options options_switch named absence open
|
10
10
|
].freeze
|
11
11
|
|
12
|
-
CLOSING_TOKENS = [
|
12
|
+
CLOSING_TOKENS = %i[close].freeze
|
13
13
|
|
14
|
-
|
15
|
-
|
14
|
+
CONDITION_TOKENS = %i[condition condition_close].freeze
|
15
|
+
|
16
|
+
def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
|
17
|
+
new.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block)
|
16
18
|
end
|
17
19
|
|
18
|
-
def lex(input, syntax =
|
19
|
-
syntax = Regexp::Syntax.
|
20
|
+
def lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
|
21
|
+
syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT
|
20
22
|
|
23
|
+
self.block = block
|
24
|
+
self.collect_tokens = collect_tokens
|
21
25
|
self.tokens = []
|
26
|
+
self.prev_token = nil
|
27
|
+
self.preprev_token = nil
|
22
28
|
self.nesting = 0
|
23
29
|
self.set_nesting = 0
|
24
30
|
self.conditional_nesting = 0
|
25
31
|
self.shift = 0
|
26
32
|
|
27
|
-
|
28
|
-
Regexp::Scanner.scan(input) do |type, token, text, ts, te|
|
33
|
+
Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te|
|
29
34
|
type, token = *syntax.normalize(type, token)
|
30
35
|
syntax.check! type, token
|
31
36
|
|
32
37
|
ascend(type, token)
|
33
38
|
|
34
|
-
if
|
35
|
-
|
36
|
-
|
39
|
+
if (last = prev_token) &&
|
40
|
+
type == :quantifier &&
|
41
|
+
(
|
42
|
+
(last.type == :literal && (parts = break_literal(last))) ||
|
43
|
+
(last.token == :codepoint_list && (parts = break_codepoint_list(last)))
|
44
|
+
)
|
45
|
+
emit(parts[0])
|
46
|
+
last = parts[1]
|
37
47
|
end
|
38
48
|
|
39
49
|
current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
|
40
50
|
nesting, set_nesting, conditional_nesting)
|
41
51
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
52
|
+
if type == :conditional && CONDITION_TOKENS.include?(token)
|
53
|
+
current = merge_condition(current, last)
|
54
|
+
elsif last
|
55
|
+
last.next = current
|
56
|
+
current.previous = last
|
57
|
+
emit(last)
|
58
|
+
end
|
47
59
|
|
48
|
-
|
49
|
-
|
60
|
+
self.preprev_token = last
|
61
|
+
self.prev_token = current
|
50
62
|
|
51
63
|
descend(type, token)
|
52
64
|
end
|
53
65
|
|
54
|
-
if
|
55
|
-
|
66
|
+
emit(prev_token) if prev_token
|
67
|
+
|
68
|
+
collect_tokens ? tokens : nil
|
69
|
+
end
|
70
|
+
|
71
|
+
def emit(token)
|
72
|
+
if block
|
73
|
+
# TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block
|
74
|
+
res = block.call(token)
|
75
|
+
tokens << res if collect_tokens
|
56
76
|
else
|
57
|
-
tokens
|
77
|
+
tokens << token
|
58
78
|
end
|
59
79
|
end
|
60
80
|
|
@@ -64,27 +84,37 @@ class Regexp::Lexer
|
|
64
84
|
|
65
85
|
private
|
66
86
|
|
67
|
-
attr_accessor :
|
87
|
+
attr_accessor :block,
|
88
|
+
:collect_tokens, :tokens, :prev_token, :preprev_token,
|
89
|
+
:nesting, :set_nesting, :conditional_nesting, :shift
|
68
90
|
|
69
91
|
def ascend(type, token)
|
92
|
+
return unless CLOSING_TOKENS.include?(token)
|
93
|
+
|
70
94
|
case type
|
71
95
|
when :group, :assertion
|
72
|
-
self.nesting = nesting - 1
|
96
|
+
self.nesting = nesting - 1
|
73
97
|
when :set
|
74
|
-
self.set_nesting = set_nesting - 1
|
98
|
+
self.set_nesting = set_nesting - 1
|
75
99
|
when :conditional
|
76
|
-
self.conditional_nesting = conditional_nesting - 1
|
100
|
+
self.conditional_nesting = conditional_nesting - 1
|
101
|
+
else
|
102
|
+
raise "unhandled nesting type #{type}"
|
77
103
|
end
|
78
104
|
end
|
79
105
|
|
80
106
|
def descend(type, token)
|
107
|
+
return unless OPENING_TOKENS.include?(token)
|
108
|
+
|
81
109
|
case type
|
82
110
|
when :group, :assertion
|
83
|
-
self.nesting = nesting + 1
|
111
|
+
self.nesting = nesting + 1
|
84
112
|
when :set
|
85
|
-
self.set_nesting = set_nesting + 1
|
113
|
+
self.set_nesting = set_nesting + 1
|
86
114
|
when :conditional
|
87
|
-
self.conditional_nesting = conditional_nesting + 1
|
115
|
+
self.conditional_nesting = conditional_nesting + 1
|
116
|
+
else
|
117
|
+
raise "unhandled nesting type #{type}"
|
88
118
|
end
|
89
119
|
end
|
90
120
|
|
@@ -94,34 +124,46 @@ class Regexp::Lexer
|
|
94
124
|
lead, last, _ = token.text.partition(/.\z/mu)
|
95
125
|
return if lead.empty?
|
96
126
|
|
97
|
-
|
98
|
-
|
99
|
-
token.ts, (token.te - last.bytesize),
|
127
|
+
token_1 = Regexp::Token.new(:literal, :literal, lead,
|
128
|
+
token.ts, (token.te - last.length),
|
100
129
|
nesting, set_nesting, conditional_nesting)
|
101
|
-
|
102
|
-
(token.ts + lead.
|
130
|
+
token_2 = Regexp::Token.new(:literal, :literal, last,
|
131
|
+
(token.ts + lead.length), token.te,
|
103
132
|
nesting, set_nesting, conditional_nesting)
|
133
|
+
|
134
|
+
token_1.previous = preprev_token
|
135
|
+
token_1.next = token_2
|
136
|
+
token_2.previous = token_1 # .next will be set by #lex
|
137
|
+
[token_1, token_2]
|
104
138
|
end
|
105
139
|
|
140
|
+
# if a codepoint list is followed by a quantifier, that quantifier applies
|
141
|
+
# to the last codepoint, e.g. /\u{61 62 63}{3}/ =~ 'abccc'
|
142
|
+
# c.f. #break_literal.
|
106
143
|
def break_codepoint_list(token)
|
107
144
|
lead, _, tail = token.text.rpartition(' ')
|
108
145
|
return if lead.empty?
|
109
146
|
|
110
|
-
|
111
|
-
tokens << Regexp::Token.new(:escape, :codepoint_list, lead + '}',
|
147
|
+
token_1 = Regexp::Token.new(:escape, :codepoint_list, lead + '}',
|
112
148
|
token.ts, (token.te - tail.length),
|
113
149
|
nesting, set_nesting, conditional_nesting)
|
114
|
-
|
150
|
+
token_2 = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
|
115
151
|
(token.ts + lead.length + 1), (token.te + 3),
|
116
152
|
nesting, set_nesting, conditional_nesting)
|
117
153
|
|
118
154
|
self.shift = shift + 3 # one space less, but extra \, u, {, and }
|
155
|
+
|
156
|
+
token_1.previous = preprev_token
|
157
|
+
token_1.next = token_2
|
158
|
+
token_2.previous = token_1 # .next will be set by #lex
|
159
|
+
[token_1, token_2]
|
119
160
|
end
|
120
161
|
|
121
|
-
def merge_condition(current)
|
122
|
-
|
123
|
-
Regexp::Token.new(:conditional, :condition, last.text + current.text,
|
162
|
+
def merge_condition(current, last)
|
163
|
+
token = Regexp::Token.new(:conditional, :condition, last.text + current.text,
|
124
164
|
last.ts, current.te, nesting, set_nesting, conditional_nesting)
|
165
|
+
token.previous = preprev_token # .next will be set by #lex
|
166
|
+
token
|
125
167
|
end
|
126
168
|
|
127
169
|
end # module Regexp::Lexer
|