regexp_parser 2.6.0 → 2.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +5 -5
- data/LICENSE +1 -1
- data/lib/regexp_parser/expression/base.rb +0 -7
- data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
- data/lib/regexp_parser/expression/classes/backreference.rb +5 -10
- data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -7
- data/lib/regexp_parser/expression/classes/character_set.rb +4 -8
- data/lib/regexp_parser/expression/classes/conditional.rb +2 -20
- data/lib/regexp_parser/expression/classes/escape_sequence.rb +21 -91
- data/lib/regexp_parser/expression/classes/free_space.rb +3 -1
- data/lib/regexp_parser/expression/classes/group.rb +0 -22
- data/lib/regexp_parser/expression/classes/keep.rb +1 -1
- data/lib/regexp_parser/expression/classes/posix_class.rb +5 -5
- data/lib/regexp_parser/expression/classes/unicode_property.rb +11 -11
- data/lib/regexp_parser/expression/methods/construct.rb +2 -4
- data/lib/regexp_parser/expression/methods/escape_sequence_char.rb +5 -0
- data/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb +68 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +8 -4
- data/lib/regexp_parser/expression/methods/negative.rb +20 -0
- data/lib/regexp_parser/expression/methods/parts.rb +23 -0
- data/lib/regexp_parser/expression/methods/printing.rb +26 -0
- data/lib/regexp_parser/expression/methods/referenced_expressions.rb +28 -0
- data/lib/regexp_parser/expression/methods/tests.rb +40 -3
- data/lib/regexp_parser/expression/methods/traverse.rb +35 -19
- data/lib/regexp_parser/expression/quantifier.rb +30 -17
- data/lib/regexp_parser/expression/sequence.rb +5 -10
- data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
- data/lib/regexp_parser/expression/shared.rb +37 -20
- data/lib/regexp_parser/expression/subexpression.rb +20 -15
- data/lib/regexp_parser/expression.rb +37 -31
- data/lib/regexp_parser/lexer.rb +76 -36
- data/lib/regexp_parser/parser.rb +107 -103
- data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
- data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
- data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
- data/lib/regexp_parser/scanner/properties/long.csv +29 -0
- data/lib/regexp_parser/scanner/properties/short.csv +3 -0
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +101 -172
- data/lib/regexp_parser/scanner.rb +1171 -1365
- data/lib/regexp_parser/syntax/token/backreference.rb +3 -0
- data/lib/regexp_parser/syntax/token/character_set.rb +3 -0
- data/lib/regexp_parser/syntax/token/escape.rb +3 -1
- data/lib/regexp_parser/syntax/token/meta.rb +9 -2
- data/lib/regexp_parser/syntax/token/unicode_property.rb +35 -1
- data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
- data/lib/regexp_parser/syntax/token.rb +13 -13
- data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
- data/lib/regexp_parser/syntax/versions.rb +3 -1
- data/lib/regexp_parser/syntax.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +6 -6
- data/regexp_parser.gemspec +5 -5
- metadata +17 -8
- data/CHANGELOG.md +0 -601
- data/README.md +0 -503
@@ -7,6 +7,8 @@ age=12.0,age=12.0
|
|
7
7
|
age=12.1,age=12.1
|
8
8
|
age=13.0,age=13.0
|
9
9
|
age=14.0,age=14.0
|
10
|
+
age=15.0,age=15.0
|
11
|
+
age=15.1,age=15.1
|
10
12
|
age=2.0,age=2.0
|
11
13
|
age=2.1,age=2.1
|
12
14
|
age=3.0,age=3.0
|
@@ -97,6 +99,7 @@ emojimodifierbase,emoji_modifier_base
|
|
97
99
|
emojipresentation,emoji_presentation
|
98
100
|
enclosingmark,enclosing_mark
|
99
101
|
ethiopic,ethiopic
|
102
|
+
extendedpictographic,extended_pictographic
|
100
103
|
extender,extender
|
101
104
|
finalpunctuation,final_punctuation
|
102
105
|
format,format
|
@@ -106,6 +109,19 @@ gothic,gothic
|
|
106
109
|
grantha,grantha
|
107
110
|
graph,graph
|
108
111
|
graphemebase,grapheme_base
|
112
|
+
graphemeclusterbreak=control,grapheme_cluster_break=control
|
113
|
+
graphemeclusterbreak=cr,grapheme_cluster_break=cr
|
114
|
+
graphemeclusterbreak=extend,grapheme_cluster_break=extend
|
115
|
+
graphemeclusterbreak=l,grapheme_cluster_break=l
|
116
|
+
graphemeclusterbreak=lf,grapheme_cluster_break=lf
|
117
|
+
graphemeclusterbreak=lv,grapheme_cluster_break=lv
|
118
|
+
graphemeclusterbreak=lvt,grapheme_cluster_break=lvt
|
119
|
+
graphemeclusterbreak=prepend,grapheme_cluster_break=prepend
|
120
|
+
graphemeclusterbreak=regionalindicator,grapheme_cluster_break=regional_indicator
|
121
|
+
graphemeclusterbreak=spacingmark,grapheme_cluster_break=spacingmark
|
122
|
+
graphemeclusterbreak=t,grapheme_cluster_break=t
|
123
|
+
graphemeclusterbreak=v,grapheme_cluster_break=v
|
124
|
+
graphemeclusterbreak=zwj,grapheme_cluster_break=zwj
|
109
125
|
graphemeextend,grapheme_extend
|
110
126
|
graphemelink,grapheme_link
|
111
127
|
greek,greek
|
@@ -121,11 +137,14 @@ hebrew,hebrew
|
|
121
137
|
hexdigit,hex_digit
|
122
138
|
hiragana,hiragana
|
123
139
|
hyphen,hyphen
|
140
|
+
idcompatmathcontinue,id_compat_math_continue
|
141
|
+
idcompatmathstart,id_compat_math_start
|
124
142
|
idcontinue,id_continue
|
125
143
|
ideographic,ideographic
|
126
144
|
idsbinaryoperator,ids_binary_operator
|
127
145
|
idstart,id_start
|
128
146
|
idstrinaryoperator,ids_trinary_operator
|
147
|
+
idsunaryoperator,ids_unary_operator
|
129
148
|
imperialaramaic,imperial_aramaic
|
130
149
|
inadlam,in_adlam
|
131
150
|
inaegeannumbers,in_aegean_numbers
|
@@ -139,6 +158,7 @@ inancientsymbols,in_ancient_symbols
|
|
139
158
|
inarabic,in_arabic
|
140
159
|
inarabicextendeda,in_arabic_extended_a
|
141
160
|
inarabicextendedb,in_arabic_extended_b
|
161
|
+
inarabicextendedc,in_arabic_extended_c
|
142
162
|
inarabicmathematicalalphabeticsymbols,in_arabic_mathematical_alphabetic_symbols
|
143
163
|
inarabicpresentationformsa,in_arabic_presentation_forms_a
|
144
164
|
inarabicpresentationformsb,in_arabic_presentation_forms_b
|
@@ -186,6 +206,8 @@ incjkunifiedideographsextensiond,in_cjk_unified_ideographs_extension_d
|
|
186
206
|
incjkunifiedideographsextensione,in_cjk_unified_ideographs_extension_e
|
187
207
|
incjkunifiedideographsextensionf,in_cjk_unified_ideographs_extension_f
|
188
208
|
incjkunifiedideographsextensiong,in_cjk_unified_ideographs_extension_g
|
209
|
+
incjkunifiedideographsextensionh,in_cjk_unified_ideographs_extension_h
|
210
|
+
incjkunifiedideographsextensioni,in_cjk_unified_ideographs_extension_i
|
189
211
|
incombiningdiacriticalmarks,in_combining_diacritical_marks
|
190
212
|
incombiningdiacriticalmarksextended,in_combining_diacritical_marks_extended
|
191
213
|
incombiningdiacriticalmarksforsymbols,in_combining_diacritical_marks_for_symbols
|
@@ -205,10 +227,12 @@ incyrillic,in_cyrillic
|
|
205
227
|
incyrillicextendeda,in_cyrillic_extended_a
|
206
228
|
incyrillicextendedb,in_cyrillic_extended_b
|
207
229
|
incyrillicextendedc,in_cyrillic_extended_c
|
230
|
+
incyrillicextendedd,in_cyrillic_extended_d
|
208
231
|
incyrillicsupplement,in_cyrillic_supplement
|
209
232
|
indeseret,in_deseret
|
210
233
|
indevanagari,in_devanagari
|
211
234
|
indevanagariextended,in_devanagari_extended
|
235
|
+
indevanagariextendeda,in_devanagari_extended_a
|
212
236
|
indingbats,in_dingbats
|
213
237
|
indivesakuru,in_dives_akuru
|
214
238
|
indogra,in_dogra
|
@@ -268,6 +292,7 @@ inipaextensions,in_ipa_extensions
|
|
268
292
|
initialpunctuation,initial_punctuation
|
269
293
|
injavanese,in_javanese
|
270
294
|
inkaithi,in_kaithi
|
295
|
+
inkaktoviknumerals,in_kaktovik_numerals
|
271
296
|
inkanaextendeda,in_kana_extended_a
|
272
297
|
inkanaextendedb,in_kana_extended_b
|
273
298
|
inkanasupplement,in_kana_supplement
|
@@ -276,6 +301,7 @@ inkangxiradicals,in_kangxi_radicals
|
|
276
301
|
inkannada,in_kannada
|
277
302
|
inkatakana,in_katakana
|
278
303
|
inkatakanaphoneticextensions,in_katakana_phonetic_extensions
|
304
|
+
inkawi,in_kawi
|
279
305
|
inkayahli,in_kayah_li
|
280
306
|
inkharoshthi,in_kharoshthi
|
281
307
|
inkhitansmallscript,in_khitan_small_script
|
@@ -339,6 +365,7 @@ inmyanmar,in_myanmar
|
|
339
365
|
inmyanmarextendeda,in_myanmar_extended_a
|
340
366
|
inmyanmarextendedb,in_myanmar_extended_b
|
341
367
|
innabataean,in_nabataean
|
368
|
+
innagmundari,in_nag_mundari
|
342
369
|
innandinagari,in_nandinagari
|
343
370
|
innewa,in_newa
|
344
371
|
innewtailue,in_new_tai_lue
|
@@ -457,6 +484,7 @@ joincontrol,join_control
|
|
457
484
|
kaithi,kaithi
|
458
485
|
kannada,kannada
|
459
486
|
katakana,katakana
|
487
|
+
kawi,kawi
|
460
488
|
kayahli,kayah_li
|
461
489
|
kharoshthi,kharoshthi
|
462
490
|
khitansmallscript,khitan_small_script
|
@@ -503,6 +531,7 @@ mro,mro
|
|
503
531
|
multani,multani
|
504
532
|
myanmar,myanmar
|
505
533
|
nabataean,nabataean
|
534
|
+
nagmundari,nag_mundari
|
506
535
|
nandinagari,nandinagari
|
507
536
|
newa,newa
|
508
537
|
newline,newline
|
@@ -57,6 +57,7 @@ emod,emoji_modifier
|
|
57
57
|
epres,emoji_presentation
|
58
58
|
ethi,ethiopic
|
59
59
|
ext,extender
|
60
|
+
extpict,extended_pictographic
|
60
61
|
geor,georgian
|
61
62
|
glag,glagolitic
|
62
63
|
gong,gunjala_gondi
|
@@ -85,6 +86,7 @@ ideo,ideographic
|
|
85
86
|
ids,id_start
|
86
87
|
idsb,ids_binary_operator
|
87
88
|
idst,ids_trinary_operator
|
89
|
+
idsu,ids_unary_operator
|
88
90
|
ital,old_italic
|
89
91
|
java,javanese
|
90
92
|
joinc,join_control
|
@@ -133,6 +135,7 @@ mtei,meetei_mayek
|
|
133
135
|
mult,multani
|
134
136
|
mymr,myanmar
|
135
137
|
n,number
|
138
|
+
nagm,nag_mundari
|
136
139
|
nand,nandinagari
|
137
140
|
narb,old_north_arabian
|
138
141
|
nbat,nabataean
|
@@ -17,10 +17,10 @@
|
|
17
17
|
text = copy(data, ts-1, te)
|
18
18
|
type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
|
19
19
|
|
20
|
-
name =
|
20
|
+
name = text[3..-2].gsub(/[\^\s_\-]/, '').downcase
|
21
21
|
|
22
22
|
token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
|
23
|
-
|
23
|
+
raise ValidationError.for(:property, name) unless token
|
24
24
|
|
25
25
|
self.emit(type, token.to_sym, text)
|
26
26
|
|
@@ -30,11 +30,6 @@
|
|
30
30
|
|
31
31
|
class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
|
32
32
|
|
33
|
-
|
34
|
-
# these are not supported in ruby at the moment
|
35
|
-
collating_sequence = '[.' . (alpha | [\-])+ . '.]';
|
36
|
-
character_equivalent = '[=' . alpha . '=]';
|
37
|
-
|
38
33
|
line_anchor = beginning_of_line | end_of_line;
|
39
34
|
anchor_char = [AbBzZG];
|
40
35
|
|
@@ -59,9 +54,6 @@
|
|
59
54
|
one_or_more = '+' | '+?' | '++';
|
60
55
|
|
61
56
|
quantifier_greedy = '?' | '*' | '+';
|
62
|
-
quantifier_reluctant = '??' | '*?' | '+?';
|
63
|
-
quantifier_possessive = '?+' | '*+' | '++';
|
64
|
-
quantifier_mode = '?' | '+';
|
65
57
|
|
66
58
|
quantity_exact = (digit+);
|
67
59
|
quantity_minimum = (digit+) . ',';
|
@@ -70,9 +62,6 @@
|
|
70
62
|
quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
|
71
63
|
quantity_maximum | quantity_range ) . range_close;
|
72
64
|
|
73
|
-
quantifiers = quantifier_greedy | quantifier_reluctant |
|
74
|
-
quantifier_possessive | quantifier_interval;
|
75
|
-
|
76
65
|
conditional = '(?(';
|
77
66
|
|
78
67
|
group_comment = '?#' . [^)]* . group_close;
|
@@ -89,10 +78,9 @@
|
|
89
78
|
# try to treat every other group head as options group, like Ruby
|
90
79
|
group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
|
91
80
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
group_number = '-'? . [1-9] . [0-9]*;
|
81
|
+
group_name_id_ab = ([^!=0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
|
82
|
+
group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
|
83
|
+
group_number = '-'? . [0-9]+;
|
96
84
|
group_level = [+\-] . [0-9]+;
|
97
85
|
|
98
86
|
group_name = ('<' . group_name_id_ab? . '>') |
|
@@ -101,15 +89,11 @@
|
|
101
89
|
|
102
90
|
group_named = ('?' . group_name );
|
103
91
|
|
104
|
-
|
105
|
-
|
106
|
-
group_name_call = 'g' . (('<' . group_name_id_ab? . group_level? '>') |
|
107
|
-
("'" . group_name_id_sq? . group_level? "'"));
|
92
|
+
group_ref_body = (('<' . (group_name_id_ab? | group_number) . group_level? '>') |
|
93
|
+
("'" . (group_name_id_sq? | group_number) . group_level? "'"));
|
108
94
|
|
109
|
-
|
110
|
-
|
111
|
-
group_number_call = 'g' . (('<' . ((group_number . group_level?) | '0') '>') |
|
112
|
-
("'" . ((group_number . group_level?) | '0') "'"));
|
95
|
+
group_ref = 'k' . group_ref_body;
|
96
|
+
group_call = 'g' . group_ref_body;
|
113
97
|
|
114
98
|
group_type = group_atomic | group_passive | group_absence | group_named;
|
115
99
|
|
@@ -132,20 +116,21 @@
|
|
132
116
|
keep_mark | sequence_char;
|
133
117
|
|
134
118
|
# escapes that also work within a character set
|
135
|
-
set_escape = backslash | brackets | escaped_ascii |
|
119
|
+
set_escape = backslash | brackets | escaped_ascii |
|
120
|
+
octal_sequence | property_char |
|
136
121
|
sequence_char | single_codepoint_char_type;
|
137
122
|
|
138
123
|
|
139
124
|
# EOF error, used where it can be detected
|
140
125
|
action premature_end_error {
|
141
126
|
text = copy(data, ts ? ts-1 : 0, -1)
|
142
|
-
raise PrematureEndError.new(
|
127
|
+
raise PrematureEndError.new(text)
|
143
128
|
}
|
144
129
|
|
145
130
|
# Invalid sequence error, used from sequences, like escapes and sets
|
146
131
|
action invalid_sequence_error {
|
147
132
|
text = copy(data, ts ? ts-1 : 0, -1)
|
148
|
-
|
133
|
+
raise ValidationError.for(:sequence, 'sequence', text)
|
149
134
|
}
|
150
135
|
|
151
136
|
# group (nesting) and set open/close actions
|
@@ -168,8 +153,8 @@
|
|
168
153
|
};
|
169
154
|
|
170
155
|
'-]' @set_closed { # special case, emits two tokens
|
171
|
-
emit(:literal, :literal,
|
172
|
-
emit(:set, :close,
|
156
|
+
emit(:literal, :literal, '-')
|
157
|
+
emit(:set, :close, ']')
|
173
158
|
if in_set?
|
174
159
|
fret;
|
175
160
|
else
|
@@ -183,28 +168,27 @@
|
|
183
168
|
};
|
184
169
|
|
185
170
|
'^' {
|
186
|
-
|
187
|
-
|
188
|
-
emit(:set, :negate, text)
|
171
|
+
if prev_token[1] == :open
|
172
|
+
emit(:set, :negate, '^')
|
189
173
|
else
|
190
|
-
emit(:literal, :literal,
|
174
|
+
emit(:literal, :literal, '^')
|
191
175
|
end
|
192
176
|
};
|
193
177
|
|
194
178
|
'-' {
|
195
|
-
|
196
|
-
#
|
197
|
-
if
|
198
|
-
emit(:literal, :literal,
|
179
|
+
# ranges cant start with the opening bracket, a subset, or
|
180
|
+
# intersection/negation/range operators
|
181
|
+
if prev_token[0] == :set
|
182
|
+
emit(:literal, :literal, '-')
|
199
183
|
else
|
200
|
-
emit(:set, :range,
|
184
|
+
emit(:set, :range, '-')
|
201
185
|
end
|
202
186
|
};
|
203
187
|
|
204
188
|
# Unlike ranges, intersections can start or end at set boundaries, whereupon
|
205
189
|
# they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
|
206
190
|
'&&' {
|
207
|
-
emit(:set, :intersection,
|
191
|
+
emit(:set, :intersection, '&&')
|
208
192
|
};
|
209
193
|
|
210
194
|
backslash {
|
@@ -212,7 +196,7 @@
|
|
212
196
|
};
|
213
197
|
|
214
198
|
set_open >(open_bracket, 1) >set_opened {
|
215
|
-
emit(:set, :open,
|
199
|
+
emit(:set, :open, '[')
|
216
200
|
fcall character_set;
|
217
201
|
};
|
218
202
|
|
@@ -227,20 +211,12 @@
|
|
227
211
|
end
|
228
212
|
|
229
213
|
unless self.class.posix_classes.include?(class_name)
|
230
|
-
|
214
|
+
raise ValidationError.for(:posix_class, text)
|
231
215
|
end
|
232
216
|
|
233
217
|
emit(type, class_name.to_sym, text)
|
234
218
|
};
|
235
219
|
|
236
|
-
# These are not supported in ruby at the moment. Enable them if they are.
|
237
|
-
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
238
|
-
# emit(:set, :collation, copy(data, ts, te))
|
239
|
-
# };
|
240
|
-
# character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
241
|
-
# emit(:set, :equivalent, copy(data, ts, te))
|
242
|
-
# };
|
243
|
-
|
244
220
|
meta_char > (set_meta, 1) {
|
245
221
|
emit(:literal, :literal, copy(data, ts, te))
|
246
222
|
};
|
@@ -254,12 +230,22 @@
|
|
254
230
|
# set escapes scanner
|
255
231
|
# --------------------------------------------------------------------------
|
256
232
|
set_escape_sequence := |*
|
233
|
+
# Special case: in sets, octal sequences have higher priority than backrefs
|
234
|
+
octal_sequence {
|
235
|
+
emit(:escape, :octal, copy(data, ts-1, te))
|
236
|
+
fret;
|
237
|
+
};
|
238
|
+
|
239
|
+
# Scan all other escapes that work in sets with the generic escape scanner
|
257
240
|
set_escape > (escaped_set_alpha, 2) {
|
258
241
|
fhold;
|
259
242
|
fnext character_set;
|
260
243
|
fcall escape_sequence;
|
261
244
|
};
|
262
245
|
|
246
|
+
# Treat all remaining escapes - those not supported in sets - as literal.
|
247
|
+
# (This currently includes \^, \-, \&, \:, although these could potentially
|
248
|
+
# be meta chars when not escaped, depending on their position in the set.)
|
263
249
|
any > (escaped_set_alpha, 1) {
|
264
250
|
emit(:escape, :literal, copy(data, ts-1, te))
|
265
251
|
fret;
|
@@ -281,6 +267,13 @@
|
|
281
267
|
fret;
|
282
268
|
};
|
283
269
|
|
270
|
+
[8-9] . [0-9] { # special case, emits two tokens
|
271
|
+
text = copy(data, ts-1, te)
|
272
|
+
emit(:escape, :literal, text[0, 2])
|
273
|
+
emit(:literal, :literal, text[2])
|
274
|
+
fret;
|
275
|
+
};
|
276
|
+
|
284
277
|
meta_char {
|
285
278
|
case text = copy(data, ts-1, te)
|
286
279
|
when '\.'; emit(:escape, :dot, text)
|
@@ -371,6 +364,7 @@
|
|
371
364
|
conditional_expression := |*
|
372
365
|
group_lookup . ')' {
|
373
366
|
text = copy(data, ts, te-1)
|
367
|
+
text =~ /[^0]/ or raise ValidationError.for(:backref, 'condition', 'invalid ref ID')
|
374
368
|
emit(:conditional, :condition, text)
|
375
369
|
emit(:conditional, :condition_close, ')')
|
376
370
|
};
|
@@ -453,10 +447,9 @@
|
|
453
447
|
|
454
448
|
# (?#...) comments: parsed as a single expression, without introducing a
|
455
449
|
# new nesting level. Comments may not include parentheses, escaped or not.
|
456
|
-
# special case for close
|
457
|
-
# correct closing count.
|
450
|
+
# special case for close to get the correct closing count.
|
458
451
|
# ------------------------------------------------------------------------
|
459
|
-
group_open . group_comment
|
452
|
+
(group_open . group_comment) @group_closed {
|
460
453
|
emit(:group, :comment, copy(data, ts, te))
|
461
454
|
};
|
462
455
|
|
@@ -471,10 +464,10 @@
|
|
471
464
|
#
|
472
465
|
# (?imxdau-imx:subexp) option on/off for subexp
|
473
466
|
# ------------------------------------------------------------------------
|
474
|
-
group_open . group_options >group_opened {
|
467
|
+
(group_open . group_options) >group_opened {
|
475
468
|
text = copy(data, ts, te)
|
476
469
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
477
|
-
|
470
|
+
raise ValidationError.for(:group_option, $1 || "-#{$2}", text)
|
478
471
|
end
|
479
472
|
emit_options(text)
|
480
473
|
};
|
@@ -485,7 +478,7 @@
|
|
485
478
|
# (?<=subexp) look-behind
|
486
479
|
# (?<!subexp) negative look-behind
|
487
480
|
# ------------------------------------------------------------------------
|
488
|
-
group_open . assertion_type >group_opened {
|
481
|
+
(group_open . assertion_type) >group_opened {
|
489
482
|
case text = copy(data, ts, te)
|
490
483
|
when '(?='; emit(:assertion, :lookahead, text)
|
491
484
|
when '(?!'; emit(:assertion, :nlookahead, text)
|
@@ -502,14 +495,14 @@
|
|
502
495
|
# (?'name'subexp) named group (single quoted version)
|
503
496
|
# (subexp) captured group
|
504
497
|
# ------------------------------------------------------------------------
|
505
|
-
group_open . group_type >group_opened {
|
498
|
+
(group_open . group_type) >group_opened {
|
506
499
|
case text = copy(data, ts, te)
|
507
500
|
when '(?:'; emit(:group, :passive, text)
|
508
501
|
when '(?>'; emit(:group, :atomic, text)
|
509
502
|
when '(?~'; emit(:group, :absence, text)
|
510
503
|
|
511
504
|
when /^\(\?(?:<>|'')/
|
512
|
-
|
505
|
+
raise ValidationError.for(:group, 'named group', 'name is empty')
|
513
506
|
|
514
507
|
when /^\(\?<[^>]+>/
|
515
508
|
emit(:group, :named_ab, text)
|
@@ -528,50 +521,52 @@
|
|
528
521
|
group_close @group_closed {
|
529
522
|
if conditional_stack.last == group_depth + 1
|
530
523
|
conditional_stack.pop
|
531
|
-
emit(:conditional, :close,
|
532
|
-
|
524
|
+
emit(:conditional, :close, ')')
|
525
|
+
elsif group_depth >= 0
|
533
526
|
if spacing_stack.length > 1 &&
|
534
527
|
spacing_stack.last[:depth] == group_depth + 1
|
535
528
|
spacing_stack.pop
|
536
529
|
self.free_spacing = spacing_stack.last[:free_spacing]
|
537
530
|
end
|
538
531
|
|
539
|
-
emit(:group, :close,
|
532
|
+
emit(:group, :close, ')')
|
533
|
+
else
|
534
|
+
raise ValidationError.for(:group, 'group', 'unmatched close parenthesis')
|
540
535
|
end
|
541
536
|
};
|
542
537
|
|
543
538
|
|
544
539
|
# Group backreference, named and numbered
|
545
540
|
# ------------------------------------------------------------------------
|
546
|
-
backslash . (
|
541
|
+
backslash . (group_ref) > (backslashed, 4) {
|
547
542
|
case text = copy(data, ts, te)
|
548
|
-
when /^\\k(
|
549
|
-
validation_error(:backref, 'backreference', 'ref ID is empty')
|
550
|
-
when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/
|
543
|
+
when /^\\k(.)[^0-9\-][^+\-]*['>]$/
|
551
544
|
emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
|
552
|
-
when /^\\k(.)\d
|
545
|
+
when /^\\k(.)0*[1-9]\d*['>]$/
|
553
546
|
emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
|
554
|
-
when /^\\k(.)
|
547
|
+
when /^\\k(.)-0*[1-9]\d*['>]$/
|
555
548
|
emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
|
556
|
-
when /^\\k(.)[
|
549
|
+
when /^\\k(.)[^0-9\-].*[+\-]\d+['>]$/
|
557
550
|
emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
|
558
|
-
when /^\\k(.)
|
551
|
+
when /^\\k(.)-?0*[1-9]\d*[+\-]\d+['>]$/
|
559
552
|
emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
|
553
|
+
else
|
554
|
+
raise ValidationError.for(:backref, 'backreference', 'invalid ref ID')
|
560
555
|
end
|
561
556
|
};
|
562
557
|
|
563
558
|
# Group call, named and numbered
|
564
559
|
# ------------------------------------------------------------------------
|
565
|
-
backslash . (
|
560
|
+
backslash . (group_call) > (backslashed, 4) {
|
566
561
|
case text = copy(data, ts, te)
|
567
|
-
when /^\\g(
|
568
|
-
validation_error(:backref, 'subexpression call', 'ref ID is empty')
|
569
|
-
when /^\\g(.)[^\p{digit}+\->][^+\-]*/
|
562
|
+
when /^\\g(.)[^0-9+\-].*['>]$/
|
570
563
|
emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
|
571
|
-
when /^\\g(.)\d
|
564
|
+
when /^\\g(.)(?:0|0*[1-9]\d*)['>]$/
|
572
565
|
emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
|
573
|
-
when /^\\g(.)[+-]\d
|
566
|
+
when /^\\g(.)[+-]0*[1-9]\d*/
|
574
567
|
emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
|
568
|
+
else
|
569
|
+
raise ValidationError.for(:backref, 'subexpression call', 'invalid ref ID')
|
575
570
|
end
|
576
571
|
};
|
577
572
|
|
@@ -645,95 +640,35 @@
|
|
645
640
|
*|;
|
646
641
|
}%%
|
647
642
|
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
require 'regexp_parser/error'
|
643
|
+
require_relative 'scanner/errors/scanner_error'
|
644
|
+
require_relative 'scanner/errors/premature_end_error'
|
645
|
+
require_relative 'scanner/errors/validation_error'
|
652
646
|
|
653
647
|
class Regexp::Scanner
|
654
|
-
# General scanner error (catch all)
|
655
|
-
class ScannerError < Regexp::Parser::Error; end
|
656
|
-
|
657
|
-
# Base for all scanner validation errors
|
658
|
-
class ValidationError < Regexp::Parser::Error
|
659
|
-
def initialize(reason)
|
660
|
-
super reason
|
661
|
-
end
|
662
|
-
end
|
663
|
-
|
664
|
-
# Unexpected end of pattern
|
665
|
-
class PrematureEndError < ScannerError
|
666
|
-
def initialize(where = '')
|
667
|
-
super "Premature end of pattern at #{where}"
|
668
|
-
end
|
669
|
-
end
|
670
|
-
|
671
|
-
# Invalid sequence format. Used for escape sequences, mainly.
|
672
|
-
class InvalidSequenceError < ValidationError
|
673
|
-
def initialize(what = 'sequence', where = '')
|
674
|
-
super "Invalid #{what} at #{where}"
|
675
|
-
end
|
676
|
-
end
|
677
|
-
|
678
|
-
# Invalid group. Used for named groups.
|
679
|
-
class InvalidGroupError < ValidationError
|
680
|
-
def initialize(what, reason)
|
681
|
-
super "Invalid #{what}, #{reason}."
|
682
|
-
end
|
683
|
-
end
|
684
|
-
|
685
|
-
# Invalid groupOption. Used for inline options.
|
686
|
-
# TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
|
687
|
-
class InvalidGroupOption < ValidationError
|
688
|
-
def initialize(option, text)
|
689
|
-
super "Invalid group option #{option} in #{text}"
|
690
|
-
end
|
691
|
-
end
|
692
|
-
|
693
|
-
# Invalid back reference. Used for name a number refs/calls.
|
694
|
-
class InvalidBackrefError < ValidationError
|
695
|
-
def initialize(what, reason)
|
696
|
-
super "Invalid back reference #{what}, #{reason}"
|
697
|
-
end
|
698
|
-
end
|
699
|
-
|
700
|
-
# The property name was not recognized by the scanner.
|
701
|
-
class UnknownUnicodePropertyError < ValidationError
|
702
|
-
def initialize(name)
|
703
|
-
super "Unknown unicode character property name #{name}"
|
704
|
-
end
|
705
|
-
end
|
706
|
-
|
707
|
-
# The POSIX class name was not recognized by the scanner.
|
708
|
-
class UnknownPosixClassError < ValidationError
|
709
|
-
def initialize(text)
|
710
|
-
super "Unknown POSIX class #{text}"
|
711
|
-
end
|
712
|
-
end
|
713
|
-
|
714
648
|
# Scans the given regular expression text, or Regexp object and collects the
|
715
649
|
# emitted token into an array that gets returned at the end. If a block is
|
716
650
|
# given, it gets called for each emitted token.
|
717
651
|
#
|
718
652
|
# This method may raise errors if a syntax error is encountered.
|
719
653
|
# --------------------------------------------------------------------------
|
720
|
-
def self.scan(input_object, options: nil, &block)
|
721
|
-
new.scan(input_object, options: options, &block)
|
654
|
+
def self.scan(input_object, options: nil, collect_tokens: true, &block)
|
655
|
+
new.scan(input_object, options: options, collect_tokens: collect_tokens, &block)
|
722
656
|
end
|
723
657
|
|
724
|
-
def scan(input_object, options: nil, &block)
|
725
|
-
self.
|
658
|
+
def scan(input_object, options: nil, collect_tokens: true, &block)
|
659
|
+
self.collect_tokens = collect_tokens
|
660
|
+
self.literal_run = nil
|
726
661
|
stack = []
|
727
662
|
|
728
663
|
input = input_object.is_a?(Regexp) ? input_object.source : input_object
|
729
664
|
self.free_spacing = free_spacing?(input_object, options)
|
730
665
|
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
731
666
|
|
732
|
-
data = input.unpack("c*")
|
667
|
+
data = input.unpack("c*")
|
733
668
|
eof = data.length
|
734
669
|
|
735
670
|
self.tokens = []
|
736
|
-
self.block =
|
671
|
+
self.block = block
|
737
672
|
|
738
673
|
self.set_depth = 0
|
739
674
|
self.group_depth = 0
|
@@ -758,7 +693,7 @@ class Regexp::Scanner
|
|
758
693
|
"[#{set_depth}]") if in_set?
|
759
694
|
|
760
695
|
# when the entire expression is a literal run
|
761
|
-
emit_literal if
|
696
|
+
emit_literal if literal_run
|
762
697
|
|
763
698
|
tokens
|
764
699
|
end
|
@@ -785,26 +720,37 @@ class Regexp::Scanner
|
|
785
720
|
def emit(type, token, text)
|
786
721
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
787
722
|
|
788
|
-
emit_literal if
|
723
|
+
emit_literal if literal_run
|
789
724
|
|
790
725
|
# Ragel runs with byte-based indices (ts, te). These are of little value to
|
791
726
|
# end-users, so we keep track of char-based indices and emit those instead.
|
792
727
|
ts_char_pos = char_pos
|
793
728
|
te_char_pos = char_pos + text.length
|
794
729
|
|
795
|
-
|
796
|
-
block.call type, token, text, ts_char_pos, te_char_pos
|
797
|
-
end
|
730
|
+
tok = [type, token, text, ts_char_pos, te_char_pos]
|
798
731
|
|
799
|
-
|
732
|
+
self.prev_token = tok
|
800
733
|
|
801
734
|
self.char_pos = te_char_pos
|
735
|
+
|
736
|
+
if block
|
737
|
+
block.call type, token, text, ts_char_pos, te_char_pos
|
738
|
+
# TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given
|
739
|
+
tokens << tok if collect_tokens
|
740
|
+
elsif collect_tokens
|
741
|
+
tokens << tok
|
742
|
+
end
|
802
743
|
end
|
803
744
|
|
745
|
+
attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
|
746
|
+
|
804
747
|
private
|
805
748
|
|
806
|
-
attr_accessor :
|
807
|
-
:
|
749
|
+
attr_accessor :block,
|
750
|
+
:collect_tokens, :tokens, :prev_token,
|
751
|
+
:free_spacing, :spacing_stack,
|
752
|
+
:group_depth, :set_depth, :conditional_stack,
|
753
|
+
:char_pos
|
808
754
|
|
809
755
|
def free_spacing?(input_object, options)
|
810
756
|
if options && !input_object.is_a?(String)
|
@@ -834,14 +780,13 @@ class Regexp::Scanner
|
|
834
780
|
# Appends one or more characters to the literal buffer, to be emitted later
|
835
781
|
# by a call to emit_literal.
|
836
782
|
def append_literal(data, ts, te)
|
837
|
-
self.
|
838
|
-
literal << copy(data, ts, te)
|
783
|
+
(self.literal_run ||= []) << copy(data, ts, te)
|
839
784
|
end
|
840
785
|
|
841
786
|
# Emits the literal run collected by calls to the append_literal method.
|
842
787
|
def emit_literal
|
843
|
-
text =
|
844
|
-
self.
|
788
|
+
text = literal_run.join
|
789
|
+
self.literal_run = nil
|
845
790
|
emit(:literal, :literal, text)
|
846
791
|
end
|
847
792
|
|
@@ -876,24 +821,8 @@ class Regexp::Scanner
|
|
876
821
|
|
877
822
|
def emit_meta_control_sequence(data, ts, te, token)
|
878
823
|
if data.last < 0x00 || data.last > 0x7F
|
879
|
-
|
824
|
+
raise ValidationError.for(:sequence, 'escape', token.to_s)
|
880
825
|
end
|
881
826
|
emit(:escape, token, copy(data, ts-1, te))
|
882
827
|
end
|
883
|
-
|
884
|
-
# Centralizes and unifies the handling of validation related
|
885
|
-
# errors.
|
886
|
-
def validation_error(type, what, reason = nil)
|
887
|
-
error =
|
888
|
-
case type
|
889
|
-
when :backref then InvalidBackrefError.new(what, reason)
|
890
|
-
when :group then InvalidGroupError.new(what, reason)
|
891
|
-
when :group_option then InvalidGroupOption.new(what, reason)
|
892
|
-
when :posix_class then UnknownPosixClassError.new(what)
|
893
|
-
when :property then UnknownUnicodePropertyError.new(what)
|
894
|
-
when :sequence then InvalidSequenceError.new(what, reason)
|
895
|
-
end
|
896
|
-
|
897
|
-
raise error # unless @@config.validation_ignore
|
898
|
-
end
|
899
828
|
end # module Regexp::Scanner
|