regexp_parser 2.1.1 → 2.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +6 -5
- data/LICENSE +1 -1
- data/Rakefile +6 -70
- data/lib/regexp_parser/error.rb +1 -1
- data/lib/regexp_parser/expression/base.rb +76 -0
- data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
- data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
- data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +18 -3
- data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -7
- data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +4 -8
- data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
- data/lib/regexp_parser/expression/classes/conditional.rb +2 -6
- data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
- data/lib/regexp_parser/expression/classes/free_space.rb +4 -4
- data/lib/regexp_parser/expression/classes/group.rb +10 -22
- data/lib/regexp_parser/expression/classes/keep.rb +2 -0
- data/lib/regexp_parser/expression/classes/literal.rb +1 -5
- data/lib/regexp_parser/expression/classes/posix_class.rb +5 -5
- data/lib/regexp_parser/expression/classes/root.rb +3 -6
- data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +10 -11
- data/lib/regexp_parser/expression/methods/construct.rb +41 -0
- data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +9 -5
- data/lib/regexp_parser/expression/methods/negative.rb +20 -0
- data/lib/regexp_parser/expression/methods/parts.rb +23 -0
- data/lib/regexp_parser/expression/methods/printing.rb +26 -0
- data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
- data/lib/regexp_parser/expression/methods/tests.rb +47 -1
- data/lib/regexp_parser/expression/methods/traverse.rb +35 -19
- data/lib/regexp_parser/expression/quantifier.rb +55 -24
- data/lib/regexp_parser/expression/sequence.rb +11 -31
- data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
- data/lib/regexp_parser/expression/shared.rb +111 -0
- data/lib/regexp_parser/expression/subexpression.rb +26 -18
- data/lib/regexp_parser/expression.rb +37 -155
- data/lib/regexp_parser/lexer.rb +81 -39
- data/lib/regexp_parser/parser.rb +135 -173
- data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
- data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
- data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
- data/lib/regexp_parser/scanner/properties/long.csv +651 -0
- data/lib/regexp_parser/scanner/properties/short.csv +249 -0
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +127 -185
- data/lib/regexp_parser/scanner.rb +1185 -1402
- data/lib/regexp_parser/syntax/any.rb +2 -7
- data/lib/regexp_parser/syntax/base.rb +91 -66
- data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
- data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
- data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
- data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
- data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
- data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
- data/lib/regexp_parser/syntax/token/escape.rb +33 -0
- data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
- data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
- data/lib/regexp_parser/syntax/token/meta.rb +20 -0
- data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
- data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
- data/lib/regexp_parser/syntax/token/unicode_property.rb +751 -0
- data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
- data/lib/regexp_parser/syntax/token.rb +45 -0
- data/lib/regexp_parser/syntax/version_lookup.rb +17 -34
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
- data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
- data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
- data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions.rb +4 -2
- data/lib/regexp_parser/syntax.rb +2 -2
- data/lib/regexp_parser/token.rb +9 -20
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +6 -8
- data/regexp_parser.gemspec +20 -22
- metadata +49 -171
- data/CHANGELOG.md +0 -494
- data/README.md +0 -479
- data/lib/regexp_parser/scanner/properties/long.yml +0 -594
- data/lib/regexp_parser/scanner/properties/short.yml +0 -237
- data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
- data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
- data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
- data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
- data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
- data/lib/regexp_parser/syntax/tokens.rb +0 -45
- data/spec/expression/base_spec.rb +0 -104
- data/spec/expression/clone_spec.rb +0 -152
- data/spec/expression/conditional_spec.rb +0 -89
- data/spec/expression/free_space_spec.rb +0 -27
- data/spec/expression/methods/match_length_spec.rb +0 -161
- data/spec/expression/methods/match_spec.rb +0 -25
- data/spec/expression/methods/strfregexp_spec.rb +0 -224
- data/spec/expression/methods/tests_spec.rb +0 -99
- data/spec/expression/methods/traverse_spec.rb +0 -161
- data/spec/expression/options_spec.rb +0 -128
- data/spec/expression/subexpression_spec.rb +0 -50
- data/spec/expression/to_h_spec.rb +0 -26
- data/spec/expression/to_s_spec.rb +0 -108
- data/spec/lexer/all_spec.rb +0 -22
- data/spec/lexer/conditionals_spec.rb +0 -53
- data/spec/lexer/delimiters_spec.rb +0 -68
- data/spec/lexer/escapes_spec.rb +0 -14
- data/spec/lexer/keep_spec.rb +0 -10
- data/spec/lexer/literals_spec.rb +0 -64
- data/spec/lexer/nesting_spec.rb +0 -99
- data/spec/lexer/refcalls_spec.rb +0 -60
- data/spec/parser/all_spec.rb +0 -43
- data/spec/parser/alternation_spec.rb +0 -88
- data/spec/parser/anchors_spec.rb +0 -17
- data/spec/parser/conditionals_spec.rb +0 -179
- data/spec/parser/errors_spec.rb +0 -30
- data/spec/parser/escapes_spec.rb +0 -121
- data/spec/parser/free_space_spec.rb +0 -130
- data/spec/parser/groups_spec.rb +0 -108
- data/spec/parser/keep_spec.rb +0 -6
- data/spec/parser/options_spec.rb +0 -28
- data/spec/parser/posix_classes_spec.rb +0 -8
- data/spec/parser/properties_spec.rb +0 -115
- data/spec/parser/quantifiers_spec.rb +0 -68
- data/spec/parser/refcalls_spec.rb +0 -117
- data/spec/parser/set/intersections_spec.rb +0 -127
- data/spec/parser/set/ranges_spec.rb +0 -111
- data/spec/parser/sets_spec.rb +0 -178
- data/spec/parser/types_spec.rb +0 -18
- data/spec/scanner/all_spec.rb +0 -18
- data/spec/scanner/anchors_spec.rb +0 -21
- data/spec/scanner/conditionals_spec.rb +0 -128
- data/spec/scanner/delimiters_spec.rb +0 -52
- data/spec/scanner/errors_spec.rb +0 -67
- data/spec/scanner/escapes_spec.rb +0 -64
- data/spec/scanner/free_space_spec.rb +0 -165
- data/spec/scanner/groups_spec.rb +0 -61
- data/spec/scanner/keep_spec.rb +0 -10
- data/spec/scanner/literals_spec.rb +0 -39
- data/spec/scanner/meta_spec.rb +0 -18
- data/spec/scanner/options_spec.rb +0 -36
- data/spec/scanner/properties_spec.rb +0 -64
- data/spec/scanner/quantifiers_spec.rb +0 -25
- data/spec/scanner/refcalls_spec.rb +0 -55
- data/spec/scanner/sets_spec.rb +0 -151
- data/spec/scanner/types_spec.rb +0 -14
- data/spec/spec_helper.rb +0 -16
- data/spec/support/runner.rb +0 -42
- data/spec/support/shared_examples.rb +0 -77
- data/spec/support/warning_extractor.rb +0 -60
- data/spec/syntax/syntax_spec.rb +0 -48
- data/spec/syntax/syntax_token_map_spec.rb +0 -23
- data/spec/syntax/versions/1.8.6_spec.rb +0 -17
- data/spec/syntax/versions/1.9.1_spec.rb +0 -10
- data/spec/syntax/versions/1.9.3_spec.rb +0 -9
- data/spec/syntax/versions/2.0.0_spec.rb +0 -13
- data/spec/syntax/versions/2.2.0_spec.rb +0 -9
- data/spec/syntax/versions/aliases_spec.rb +0 -37
- data/spec/token/token_spec.rb +0 -85
- /data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
|
@@ -28,18 +28,7 @@
|
|
|
28
28
|
|
|
29
29
|
comment = ('#' . [^\n]* . '\n'?);
|
|
30
30
|
|
|
31
|
-
|
|
32
|
-
'cntrl' | 'digit' | 'graph' |
|
|
33
|
-
'lower' | 'print' | 'punct' |
|
|
34
|
-
'space' | 'upper' | 'xdigit' |
|
|
35
|
-
'word' | 'ascii';
|
|
36
|
-
|
|
37
|
-
class_posix = ('[:' . '^'? . class_name_posix . ':]');
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
# these are not supported in ruby at the moment
|
|
41
|
-
collating_sequence = '[.' . (alpha | [\-])+ . '.]';
|
|
42
|
-
character_equivalent = '[=' . alpha . '=]';
|
|
31
|
+
class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
|
|
43
32
|
|
|
44
33
|
line_anchor = beginning_of_line | end_of_line;
|
|
45
34
|
anchor_char = [AbBzZG];
|
|
@@ -65,20 +54,13 @@
|
|
|
65
54
|
one_or_more = '+' | '+?' | '++';
|
|
66
55
|
|
|
67
56
|
quantifier_greedy = '?' | '*' | '+';
|
|
68
|
-
quantifier_reluctant = '??' | '*?' | '+?';
|
|
69
|
-
quantifier_possessive = '?+' | '*+' | '++';
|
|
70
|
-
quantifier_mode = '?' | '+';
|
|
71
57
|
|
|
72
58
|
quantity_exact = (digit+);
|
|
73
59
|
quantity_minimum = (digit+) . ',';
|
|
74
60
|
quantity_maximum = ',' . (digit+);
|
|
75
61
|
quantity_range = (digit+) . ',' . (digit+);
|
|
76
62
|
quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
|
|
77
|
-
quantity_maximum | quantity_range ) . range_close
|
|
78
|
-
quantifier_mode?;
|
|
79
|
-
|
|
80
|
-
quantifiers = quantifier_greedy | quantifier_reluctant |
|
|
81
|
-
quantifier_possessive | quantifier_interval;
|
|
63
|
+
quantity_maximum | quantity_range ) . range_close;
|
|
82
64
|
|
|
83
65
|
conditional = '(?(';
|
|
84
66
|
|
|
@@ -96,10 +78,9 @@
|
|
|
96
78
|
# try to treat every other group head as options group, like Ruby
|
|
97
79
|
group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
|
|
98
80
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
group_number = '-'? . [1-9] . [0-9]*;
|
|
81
|
+
group_name_id_ab = ([^!0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
|
|
82
|
+
group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
|
|
83
|
+
group_number = '-'? . [0-9]+;
|
|
103
84
|
group_level = [+\-] . [0-9]+;
|
|
104
85
|
|
|
105
86
|
group_name = ('<' . group_name_id_ab? . '>') |
|
|
@@ -108,15 +89,11 @@
|
|
|
108
89
|
|
|
109
90
|
group_named = ('?' . group_name );
|
|
110
91
|
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
group_name_call = 'g' . (('<' . group_name_id_ab? . group_level? '>') |
|
|
114
|
-
("'" . group_name_id_sq? . group_level? "'"));
|
|
92
|
+
group_ref_body = (('<' . (group_name_id_ab? | group_number) . group_level? '>') |
|
|
93
|
+
("'" . (group_name_id_sq? | group_number) . group_level? "'"));
|
|
115
94
|
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
group_number_call = 'g' . (('<' . ((group_number . group_level?) | '0') '>') |
|
|
119
|
-
("'" . ((group_number . group_level?) | '0') "'"));
|
|
95
|
+
group_ref = 'k' . group_ref_body;
|
|
96
|
+
group_call = 'g' . group_ref_body;
|
|
120
97
|
|
|
121
98
|
group_type = group_atomic | group_passive | group_absence | group_named;
|
|
122
99
|
|
|
@@ -139,20 +116,21 @@
|
|
|
139
116
|
keep_mark | sequence_char;
|
|
140
117
|
|
|
141
118
|
# escapes that also work within a character set
|
|
142
|
-
set_escape = backslash | brackets | escaped_ascii |
|
|
119
|
+
set_escape = backslash | brackets | escaped_ascii |
|
|
120
|
+
octal_sequence | property_char |
|
|
143
121
|
sequence_char | single_codepoint_char_type;
|
|
144
122
|
|
|
145
123
|
|
|
146
124
|
# EOF error, used where it can be detected
|
|
147
125
|
action premature_end_error {
|
|
148
126
|
text = copy(data, ts ? ts-1 : 0, -1)
|
|
149
|
-
raise PrematureEndError.new(
|
|
127
|
+
raise PrematureEndError.new(text)
|
|
150
128
|
}
|
|
151
129
|
|
|
152
130
|
# Invalid sequence error, used from sequences, like escapes and sets
|
|
153
131
|
action invalid_sequence_error {
|
|
154
132
|
text = copy(data, ts ? ts-1 : 0, -1)
|
|
155
|
-
|
|
133
|
+
raise ValidationError.for(:sequence, 'sequence', text)
|
|
156
134
|
}
|
|
157
135
|
|
|
158
136
|
# group (nesting) and set open/close actions
|
|
@@ -175,8 +153,8 @@
|
|
|
175
153
|
};
|
|
176
154
|
|
|
177
155
|
'-]' @set_closed { # special case, emits two tokens
|
|
178
|
-
emit(:literal, :literal,
|
|
179
|
-
emit(:set, :close,
|
|
156
|
+
emit(:literal, :literal, '-')
|
|
157
|
+
emit(:set, :close, ']')
|
|
180
158
|
if in_set?
|
|
181
159
|
fret;
|
|
182
160
|
else
|
|
@@ -190,28 +168,27 @@
|
|
|
190
168
|
};
|
|
191
169
|
|
|
192
170
|
'^' {
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
emit(:set, :negate, text)
|
|
171
|
+
if prev_token[1] == :open
|
|
172
|
+
emit(:set, :negate, '^')
|
|
196
173
|
else
|
|
197
|
-
emit(:literal, :literal,
|
|
174
|
+
emit(:literal, :literal, '^')
|
|
198
175
|
end
|
|
199
176
|
};
|
|
200
177
|
|
|
201
178
|
'-' {
|
|
202
|
-
|
|
203
|
-
#
|
|
204
|
-
if
|
|
205
|
-
emit(:literal, :literal,
|
|
179
|
+
# ranges cant start with the opening bracket, a subset, or
|
|
180
|
+
# intersection/negation/range operators
|
|
181
|
+
if prev_token[0] == :set
|
|
182
|
+
emit(:literal, :literal, '-')
|
|
206
183
|
else
|
|
207
|
-
emit(:set, :range,
|
|
184
|
+
emit(:set, :range, '-')
|
|
208
185
|
end
|
|
209
186
|
};
|
|
210
187
|
|
|
211
188
|
# Unlike ranges, intersections can start or end at set boundaries, whereupon
|
|
212
189
|
# they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
|
|
213
190
|
'&&' {
|
|
214
|
-
emit(:set, :intersection,
|
|
191
|
+
emit(:set, :intersection, '&&')
|
|
215
192
|
};
|
|
216
193
|
|
|
217
194
|
backslash {
|
|
@@ -219,31 +196,27 @@
|
|
|
219
196
|
};
|
|
220
197
|
|
|
221
198
|
set_open >(open_bracket, 1) >set_opened {
|
|
222
|
-
emit(:set, :open,
|
|
199
|
+
emit(:set, :open, '[')
|
|
223
200
|
fcall character_set;
|
|
224
201
|
};
|
|
225
202
|
|
|
226
|
-
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error)
|
|
203
|
+
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
|
227
204
|
text = copy(data, ts, te)
|
|
228
205
|
|
|
229
206
|
type = :posixclass
|
|
230
207
|
class_name = text[2..-3]
|
|
231
|
-
if class_name[0]
|
|
208
|
+
if class_name[0] == '^'
|
|
232
209
|
class_name = class_name[1..-1]
|
|
233
210
|
type = :nonposixclass
|
|
234
211
|
end
|
|
235
212
|
|
|
213
|
+
unless self.class.posix_classes.include?(class_name)
|
|
214
|
+
raise ValidationError.for(:posix_class, text)
|
|
215
|
+
end
|
|
216
|
+
|
|
236
217
|
emit(type, class_name.to_sym, text)
|
|
237
218
|
};
|
|
238
219
|
|
|
239
|
-
# These are not supported in ruby at the moment. Enable them if they are.
|
|
240
|
-
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
|
241
|
-
# emit(:set, :collation, copy(data, ts, te))
|
|
242
|
-
# };
|
|
243
|
-
# character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
|
244
|
-
# emit(:set, :equivalent, copy(data, ts, te))
|
|
245
|
-
# };
|
|
246
|
-
|
|
247
220
|
meta_char > (set_meta, 1) {
|
|
248
221
|
emit(:literal, :literal, copy(data, ts, te))
|
|
249
222
|
};
|
|
@@ -257,12 +230,22 @@
|
|
|
257
230
|
# set escapes scanner
|
|
258
231
|
# --------------------------------------------------------------------------
|
|
259
232
|
set_escape_sequence := |*
|
|
233
|
+
# Special case: in sets, octal sequences have higher priority than backrefs
|
|
234
|
+
octal_sequence {
|
|
235
|
+
emit(:escape, :octal, copy(data, ts-1, te))
|
|
236
|
+
fret;
|
|
237
|
+
};
|
|
238
|
+
|
|
239
|
+
# Scan all other escapes that work in sets with the generic escape scanner
|
|
260
240
|
set_escape > (escaped_set_alpha, 2) {
|
|
261
241
|
fhold;
|
|
262
242
|
fnext character_set;
|
|
263
243
|
fcall escape_sequence;
|
|
264
244
|
};
|
|
265
245
|
|
|
246
|
+
# Treat all remaining escapes - those not supported in sets - as literal.
|
|
247
|
+
# (This currently includes \^, \-, \&, \:, although these could potentially
|
|
248
|
+
# be meta chars when not escaped, depending on their position in the set.)
|
|
266
249
|
any > (escaped_set_alpha, 1) {
|
|
267
250
|
emit(:escape, :literal, copy(data, ts-1, te))
|
|
268
251
|
fret;
|
|
@@ -284,6 +267,13 @@
|
|
|
284
267
|
fret;
|
|
285
268
|
};
|
|
286
269
|
|
|
270
|
+
[8-9] . [0-9] { # special case, emits two tokens
|
|
271
|
+
text = copy(data, ts-1, te)
|
|
272
|
+
emit(:escape, :literal, text[0, 2])
|
|
273
|
+
emit(:literal, :literal, text[2])
|
|
274
|
+
fret;
|
|
275
|
+
};
|
|
276
|
+
|
|
287
277
|
meta_char {
|
|
288
278
|
case text = copy(data, ts-1, te)
|
|
289
279
|
when '\.'; emit(:escape, :dot, text)
|
|
@@ -323,7 +313,7 @@
|
|
|
323
313
|
|
|
324
314
|
codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
|
|
325
315
|
text = copy(data, ts-1, te)
|
|
326
|
-
if text[2]
|
|
316
|
+
if text[2] == '{'
|
|
327
317
|
emit(:escape, :codepoint_list, text)
|
|
328
318
|
else
|
|
329
319
|
emit(:escape, :codepoint, text)
|
|
@@ -374,6 +364,7 @@
|
|
|
374
364
|
conditional_expression := |*
|
|
375
365
|
group_lookup . ')' {
|
|
376
366
|
text = copy(data, ts, te-1)
|
|
367
|
+
text =~ /[^0]/ or raise ValidationError.for(:backref, 'condition', 'invalid ref ID')
|
|
377
368
|
emit(:conditional, :condition, text)
|
|
378
369
|
emit(:conditional, :condition_close, ')')
|
|
379
370
|
};
|
|
@@ -419,12 +410,12 @@
|
|
|
419
410
|
|
|
420
411
|
backslash . anchor_char > (backslashed, 3) {
|
|
421
412
|
case text = copy(data, ts, te)
|
|
422
|
-
when '
|
|
423
|
-
when '
|
|
424
|
-
when '
|
|
425
|
-
when '
|
|
426
|
-
when '
|
|
427
|
-
when '
|
|
413
|
+
when '\A'; emit(:anchor, :bos, text)
|
|
414
|
+
when '\z'; emit(:anchor, :eos, text)
|
|
415
|
+
when '\Z'; emit(:anchor, :eos_ob_eol, text)
|
|
416
|
+
when '\b'; emit(:anchor, :word_boundary, text)
|
|
417
|
+
when '\B'; emit(:anchor, :nonword_boundary, text)
|
|
418
|
+
when '\G'; emit(:anchor, :match_start, text)
|
|
428
419
|
end
|
|
429
420
|
};
|
|
430
421
|
|
|
@@ -456,10 +447,9 @@
|
|
|
456
447
|
|
|
457
448
|
# (?#...) comments: parsed as a single expression, without introducing a
|
|
458
449
|
# new nesting level. Comments may not include parentheses, escaped or not.
|
|
459
|
-
# special case for close
|
|
460
|
-
# correct closing count.
|
|
450
|
+
# special case for close to get the correct closing count.
|
|
461
451
|
# ------------------------------------------------------------------------
|
|
462
|
-
group_open . group_comment
|
|
452
|
+
(group_open . group_comment) @group_closed {
|
|
463
453
|
emit(:group, :comment, copy(data, ts, te))
|
|
464
454
|
};
|
|
465
455
|
|
|
@@ -474,10 +464,10 @@
|
|
|
474
464
|
#
|
|
475
465
|
# (?imxdau-imx:subexp) option on/off for subexp
|
|
476
466
|
# ------------------------------------------------------------------------
|
|
477
|
-
group_open . group_options >group_opened {
|
|
467
|
+
(group_open . group_options) >group_opened {
|
|
478
468
|
text = copy(data, ts, te)
|
|
479
469
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
|
480
|
-
raise
|
|
470
|
+
raise ValidationError.for(:group_option, $1 || "-#{$2}", text)
|
|
481
471
|
end
|
|
482
472
|
emit_options(text)
|
|
483
473
|
};
|
|
@@ -488,7 +478,7 @@
|
|
|
488
478
|
# (?<=subexp) look-behind
|
|
489
479
|
# (?<!subexp) negative look-behind
|
|
490
480
|
# ------------------------------------------------------------------------
|
|
491
|
-
group_open . assertion_type >group_opened {
|
|
481
|
+
(group_open . assertion_type) >group_opened {
|
|
492
482
|
case text = copy(data, ts, te)
|
|
493
483
|
when '(?='; emit(:assertion, :lookahead, text)
|
|
494
484
|
when '(?!'; emit(:assertion, :nlookahead, text)
|
|
@@ -505,14 +495,14 @@
|
|
|
505
495
|
# (?'name'subexp) named group (single quoted version)
|
|
506
496
|
# (subexp) captured group
|
|
507
497
|
# ------------------------------------------------------------------------
|
|
508
|
-
group_open . group_type >group_opened {
|
|
498
|
+
(group_open . group_type) >group_opened {
|
|
509
499
|
case text = copy(data, ts, te)
|
|
510
500
|
when '(?:'; emit(:group, :passive, text)
|
|
511
501
|
when '(?>'; emit(:group, :atomic, text)
|
|
512
502
|
when '(?~'; emit(:group, :absence, text)
|
|
513
503
|
|
|
514
504
|
when /^\(\?(?:<>|'')/
|
|
515
|
-
|
|
505
|
+
raise ValidationError.for(:group, 'named group', 'name is empty')
|
|
516
506
|
|
|
517
507
|
when /^\(\?<[^>]+>/
|
|
518
508
|
emit(:group, :named_ab, text)
|
|
@@ -531,50 +521,52 @@
|
|
|
531
521
|
group_close @group_closed {
|
|
532
522
|
if conditional_stack.last == group_depth + 1
|
|
533
523
|
conditional_stack.pop
|
|
534
|
-
emit(:conditional, :close,
|
|
535
|
-
|
|
524
|
+
emit(:conditional, :close, ')')
|
|
525
|
+
elsif group_depth >= 0
|
|
536
526
|
if spacing_stack.length > 1 &&
|
|
537
527
|
spacing_stack.last[:depth] == group_depth + 1
|
|
538
528
|
spacing_stack.pop
|
|
539
529
|
self.free_spacing = spacing_stack.last[:free_spacing]
|
|
540
530
|
end
|
|
541
531
|
|
|
542
|
-
emit(:group, :close,
|
|
532
|
+
emit(:group, :close, ')')
|
|
533
|
+
else
|
|
534
|
+
raise ValidationError.for(:group, 'group', 'unmatched close parenthesis')
|
|
543
535
|
end
|
|
544
536
|
};
|
|
545
537
|
|
|
546
538
|
|
|
547
539
|
# Group backreference, named and numbered
|
|
548
540
|
# ------------------------------------------------------------------------
|
|
549
|
-
backslash . (
|
|
541
|
+
backslash . (group_ref) > (backslashed, 4) {
|
|
550
542
|
case text = copy(data, ts, te)
|
|
551
|
-
when /^\\k(
|
|
552
|
-
validation_error(:backref, 'backreference', 'ref ID is empty')
|
|
553
|
-
when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/
|
|
543
|
+
when /^\\k(.)[^0-9\-][^+\-]*['>]$/
|
|
554
544
|
emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
|
|
555
|
-
when /^\\k(.)\d
|
|
545
|
+
when /^\\k(.)0*[1-9]\d*['>]$/
|
|
556
546
|
emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
|
|
557
|
-
when /^\\k(.)
|
|
547
|
+
when /^\\k(.)-0*[1-9]\d*['>]$/
|
|
558
548
|
emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
|
|
559
|
-
when /^\\k(.)[
|
|
549
|
+
when /^\\k(.)[^0-9\-].*[+\-]\d+['>]$/
|
|
560
550
|
emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
|
|
561
|
-
when /^\\k(.)
|
|
551
|
+
when /^\\k(.)-?0*[1-9]\d*[+\-]\d+['>]$/
|
|
562
552
|
emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
|
|
553
|
+
else
|
|
554
|
+
raise ValidationError.for(:backref, 'backreference', 'invalid ref ID')
|
|
563
555
|
end
|
|
564
556
|
};
|
|
565
557
|
|
|
566
558
|
# Group call, named and numbered
|
|
567
559
|
# ------------------------------------------------------------------------
|
|
568
|
-
backslash . (
|
|
560
|
+
backslash . (group_call) > (backslashed, 4) {
|
|
569
561
|
case text = copy(data, ts, te)
|
|
570
|
-
when /^\\g(
|
|
571
|
-
validation_error(:backref, 'subexpression call', 'ref ID is empty')
|
|
572
|
-
when /^\\g(.)[^\p{digit}+\->][^+\-]*/
|
|
562
|
+
when /^\\g(.)[^0-9+\-].*['>]$/
|
|
573
563
|
emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
|
|
574
|
-
when /^\\g(.)\d
|
|
564
|
+
when /^\\g(.)(?:0|0*[1-9]\d*)['>]$/
|
|
575
565
|
emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
|
|
576
|
-
when /^\\g(.)[+-]\d
|
|
566
|
+
when /^\\g(.)[+-]0*[1-9]\d*/
|
|
577
567
|
emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
|
|
568
|
+
else
|
|
569
|
+
raise ValidationError.for(:backref, 'subexpression call', 'invalid ref ID')
|
|
578
570
|
end
|
|
579
571
|
};
|
|
580
572
|
|
|
@@ -605,7 +597,7 @@
|
|
|
605
597
|
end
|
|
606
598
|
};
|
|
607
599
|
|
|
608
|
-
quantifier_interval
|
|
600
|
+
quantifier_interval {
|
|
609
601
|
emit(:quantifier, :interval, copy(data, ts, te))
|
|
610
602
|
};
|
|
611
603
|
|
|
@@ -648,87 +640,35 @@
|
|
|
648
640
|
*|;
|
|
649
641
|
}%%
|
|
650
642
|
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
require 'regexp_parser/error'
|
|
643
|
+
require_relative 'scanner/errors/scanner_error'
|
|
644
|
+
require_relative 'scanner/errors/premature_end_error'
|
|
645
|
+
require_relative 'scanner/errors/validation_error'
|
|
655
646
|
|
|
656
647
|
class Regexp::Scanner
|
|
657
|
-
# General scanner error (catch all)
|
|
658
|
-
class ScannerError < Regexp::Parser::Error; end
|
|
659
|
-
|
|
660
|
-
# Base for all scanner validation errors
|
|
661
|
-
class ValidationError < Regexp::Parser::Error
|
|
662
|
-
def initialize(reason)
|
|
663
|
-
super reason
|
|
664
|
-
end
|
|
665
|
-
end
|
|
666
|
-
|
|
667
|
-
# Unexpected end of pattern
|
|
668
|
-
class PrematureEndError < ScannerError
|
|
669
|
-
def initialize(where = '')
|
|
670
|
-
super "Premature end of pattern at #{where}"
|
|
671
|
-
end
|
|
672
|
-
end
|
|
673
|
-
|
|
674
|
-
# Invalid sequence format. Used for escape sequences, mainly.
|
|
675
|
-
class InvalidSequenceError < ValidationError
|
|
676
|
-
def initialize(what = 'sequence', where = '')
|
|
677
|
-
super "Invalid #{what} at #{where}"
|
|
678
|
-
end
|
|
679
|
-
end
|
|
680
|
-
|
|
681
|
-
# Invalid group. Used for named groups.
|
|
682
|
-
class InvalidGroupError < ValidationError
|
|
683
|
-
def initialize(what, reason)
|
|
684
|
-
super "Invalid #{what}, #{reason}."
|
|
685
|
-
end
|
|
686
|
-
end
|
|
687
|
-
|
|
688
|
-
# Invalid groupOption. Used for inline options.
|
|
689
|
-
class InvalidGroupOption < ValidationError
|
|
690
|
-
def initialize(option, text)
|
|
691
|
-
super "Invalid group option #{option} in #{text}"
|
|
692
|
-
end
|
|
693
|
-
end
|
|
694
|
-
|
|
695
|
-
# Invalid back reference. Used for name a number refs/calls.
|
|
696
|
-
class InvalidBackrefError < ValidationError
|
|
697
|
-
def initialize(what, reason)
|
|
698
|
-
super "Invalid back reference #{what}, #{reason}"
|
|
699
|
-
end
|
|
700
|
-
end
|
|
701
|
-
|
|
702
|
-
# The property name was not recognized by the scanner.
|
|
703
|
-
class UnknownUnicodePropertyError < ValidationError
|
|
704
|
-
def initialize(name)
|
|
705
|
-
super "Unknown unicode character property name #{name}"
|
|
706
|
-
end
|
|
707
|
-
end
|
|
708
|
-
|
|
709
648
|
# Scans the given regular expression text, or Regexp object and collects the
|
|
710
649
|
# emitted token into an array that gets returned at the end. If a block is
|
|
711
650
|
# given, it gets called for each emitted token.
|
|
712
651
|
#
|
|
713
652
|
# This method may raise errors if a syntax error is encountered.
|
|
714
653
|
# --------------------------------------------------------------------------
|
|
715
|
-
def self.scan(input_object, options: nil, &block)
|
|
716
|
-
new.scan(input_object, options: options, &block)
|
|
654
|
+
def self.scan(input_object, options: nil, collect_tokens: true, &block)
|
|
655
|
+
new.scan(input_object, options: options, collect_tokens: collect_tokens, &block)
|
|
717
656
|
end
|
|
718
657
|
|
|
719
|
-
def scan(input_object, options: nil, &block)
|
|
720
|
-
self.
|
|
658
|
+
def scan(input_object, options: nil, collect_tokens: true, &block)
|
|
659
|
+
self.collect_tokens = collect_tokens
|
|
660
|
+
self.literal_run = nil
|
|
721
661
|
stack = []
|
|
722
662
|
|
|
723
663
|
input = input_object.is_a?(Regexp) ? input_object.source : input_object
|
|
724
664
|
self.free_spacing = free_spacing?(input_object, options)
|
|
725
665
|
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
|
726
666
|
|
|
727
|
-
data = input.unpack("c*")
|
|
667
|
+
data = input.unpack("c*")
|
|
728
668
|
eof = data.length
|
|
729
669
|
|
|
730
670
|
self.tokens = []
|
|
731
|
-
self.block =
|
|
671
|
+
self.block = block
|
|
732
672
|
|
|
733
673
|
self.set_depth = 0
|
|
734
674
|
self.group_depth = 0
|
|
@@ -753,46 +693,64 @@ class Regexp::Scanner
|
|
|
753
693
|
"[#{set_depth}]") if in_set?
|
|
754
694
|
|
|
755
695
|
# when the entire expression is a literal run
|
|
756
|
-
emit_literal if
|
|
696
|
+
emit_literal if literal_run
|
|
757
697
|
|
|
758
698
|
tokens
|
|
759
699
|
end
|
|
760
700
|
|
|
761
701
|
# lazy-load property maps when first needed
|
|
762
|
-
require 'yaml'
|
|
763
|
-
|
|
764
702
|
def self.short_prop_map
|
|
765
|
-
@short_prop_map ||=
|
|
703
|
+
@short_prop_map ||= parse_prop_map('short')
|
|
766
704
|
end
|
|
767
705
|
|
|
768
706
|
def self.long_prop_map
|
|
769
|
-
@long_prop_map ||=
|
|
707
|
+
@long_prop_map ||= parse_prop_map('long')
|
|
708
|
+
end
|
|
709
|
+
|
|
710
|
+
def self.parse_prop_map(name)
|
|
711
|
+
File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
|
|
712
|
+
end
|
|
713
|
+
|
|
714
|
+
def self.posix_classes
|
|
715
|
+
%w[alnum alpha ascii blank cntrl digit graph
|
|
716
|
+
lower print punct space upper word xdigit]
|
|
770
717
|
end
|
|
771
718
|
|
|
772
719
|
# Emits an array with the details of the scanned pattern
|
|
773
720
|
def emit(type, token, text)
|
|
774
721
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
|
775
722
|
|
|
776
|
-
emit_literal if
|
|
723
|
+
emit_literal if literal_run
|
|
777
724
|
|
|
778
725
|
# Ragel runs with byte-based indices (ts, te). These are of little value to
|
|
779
726
|
# end-users, so we keep track of char-based indices and emit those instead.
|
|
780
727
|
ts_char_pos = char_pos
|
|
781
728
|
te_char_pos = char_pos + text.length
|
|
782
729
|
|
|
783
|
-
|
|
784
|
-
block.call type, token, text, ts_char_pos, te_char_pos
|
|
785
|
-
end
|
|
730
|
+
tok = [type, token, text, ts_char_pos, te_char_pos]
|
|
786
731
|
|
|
787
|
-
|
|
732
|
+
self.prev_token = tok
|
|
788
733
|
|
|
789
734
|
self.char_pos = te_char_pos
|
|
735
|
+
|
|
736
|
+
if block
|
|
737
|
+
block.call type, token, text, ts_char_pos, te_char_pos
|
|
738
|
+
# TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given
|
|
739
|
+
tokens << tok if collect_tokens
|
|
740
|
+
elsif collect_tokens
|
|
741
|
+
tokens << tok
|
|
742
|
+
end
|
|
790
743
|
end
|
|
791
744
|
|
|
745
|
+
attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
|
|
746
|
+
|
|
792
747
|
private
|
|
793
748
|
|
|
794
|
-
attr_accessor :
|
|
795
|
-
:
|
|
749
|
+
attr_accessor :block,
|
|
750
|
+
:collect_tokens, :tokens, :prev_token,
|
|
751
|
+
:free_spacing, :spacing_stack,
|
|
752
|
+
:group_depth, :set_depth, :conditional_stack,
|
|
753
|
+
:char_pos
|
|
796
754
|
|
|
797
755
|
def free_spacing?(input_object, options)
|
|
798
756
|
if options && !input_object.is_a?(String)
|
|
@@ -822,14 +780,13 @@ class Regexp::Scanner
|
|
|
822
780
|
# Appends one or more characters to the literal buffer, to be emitted later
|
|
823
781
|
# by a call to emit_literal.
|
|
824
782
|
def append_literal(data, ts, te)
|
|
825
|
-
self.
|
|
826
|
-
literal << copy(data, ts, te)
|
|
783
|
+
(self.literal_run ||= []) << copy(data, ts, te)
|
|
827
784
|
end
|
|
828
785
|
|
|
829
786
|
# Emits the literal run collected by calls to the append_literal method.
|
|
830
787
|
def emit_literal
|
|
831
|
-
text =
|
|
832
|
-
self.
|
|
788
|
+
text = literal_run.join
|
|
789
|
+
self.literal_run = nil
|
|
833
790
|
emit(:literal, :literal, text)
|
|
834
791
|
end
|
|
835
792
|
|
|
@@ -864,23 +821,8 @@ class Regexp::Scanner
|
|
|
864
821
|
|
|
865
822
|
def emit_meta_control_sequence(data, ts, te, token)
|
|
866
823
|
if data.last < 0x00 || data.last > 0x7F
|
|
867
|
-
|
|
824
|
+
raise ValidationError.for(:sequence, 'escape', token.to_s)
|
|
868
825
|
end
|
|
869
826
|
emit(:escape, token, copy(data, ts-1, te))
|
|
870
827
|
end
|
|
871
|
-
|
|
872
|
-
# Centralizes and unifies the handling of validation related
|
|
873
|
-
# errors.
|
|
874
|
-
def validation_error(type, what, reason)
|
|
875
|
-
case type
|
|
876
|
-
when :group
|
|
877
|
-
error = InvalidGroupError.new(what, reason)
|
|
878
|
-
when :backref
|
|
879
|
-
error = InvalidBackrefError.new(what, reason)
|
|
880
|
-
when :sequence
|
|
881
|
-
error = InvalidSequenceError.new(what, reason)
|
|
882
|
-
end
|
|
883
|
-
|
|
884
|
-
raise error # unless @@config.validation_ignore
|
|
885
|
-
end
|
|
886
828
|
end # module Regexp::Scanner
|