regexp_parser 2.7.0 → 2.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +5 -5
- data/LICENSE +1 -1
- data/lib/regexp_parser/expression/base.rb +0 -7
- data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
- data/lib/regexp_parser/expression/classes/backreference.rb +4 -6
- data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -7
- data/lib/regexp_parser/expression/classes/character_set.rb +4 -8
- data/lib/regexp_parser/expression/classes/conditional.rb +2 -14
- data/lib/regexp_parser/expression/classes/escape_sequence.rb +3 -1
- data/lib/regexp_parser/expression/classes/free_space.rb +3 -1
- data/lib/regexp_parser/expression/classes/group.rb +0 -22
- data/lib/regexp_parser/expression/classes/keep.rb +1 -1
- data/lib/regexp_parser/expression/classes/posix_class.rb +5 -5
- data/lib/regexp_parser/expression/classes/unicode_property.rb +11 -11
- data/lib/regexp_parser/expression/methods/construct.rb +2 -4
- data/lib/regexp_parser/expression/methods/negative.rb +20 -0
- data/lib/regexp_parser/expression/methods/parts.rb +23 -0
- data/lib/regexp_parser/expression/methods/printing.rb +26 -0
- data/lib/regexp_parser/expression/methods/tests.rb +40 -3
- data/lib/regexp_parser/expression/methods/traverse.rb +33 -20
- data/lib/regexp_parser/expression/quantifier.rb +30 -17
- data/lib/regexp_parser/expression/sequence.rb +5 -9
- data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
- data/lib/regexp_parser/expression/shared.rb +37 -24
- data/lib/regexp_parser/expression/subexpression.rb +20 -18
- data/lib/regexp_parser/expression.rb +34 -31
- data/lib/regexp_parser/lexer.rb +15 -7
- data/lib/regexp_parser/parser.rb +91 -91
- data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
- data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
- data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
- data/lib/regexp_parser/scanner/properties/long.csv +29 -0
- data/lib/regexp_parser/scanner/properties/short.csv +3 -0
- data/lib/regexp_parser/scanner/property.rl +1 -1
- data/lib/regexp_parser/scanner/scanner.rl +44 -130
- data/lib/regexp_parser/scanner.rb +1096 -1297
- data/lib/regexp_parser/syntax/token/backreference.rb +3 -0
- data/lib/regexp_parser/syntax/token/character_set.rb +3 -0
- data/lib/regexp_parser/syntax/token/escape.rb +3 -1
- data/lib/regexp_parser/syntax/token/meta.rb +9 -2
- data/lib/regexp_parser/syntax/token/unicode_property.rb +35 -1
- data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
- data/lib/regexp_parser/syntax/token.rb +13 -13
- data/lib/regexp_parser/syntax/versions.rb +1 -1
- data/lib/regexp_parser/syntax.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +6 -6
- data/regexp_parser.gemspec +5 -5
- metadata +14 -8
- data/CHANGELOG.md +0 -632
- data/README.md +0 -503
@@ -0,0 +1,63 @@
|
|
1
|
+
class Regexp::Scanner
|
2
|
+
# Base for all scanner validation errors
|
3
|
+
class ValidationError < ScannerError
|
4
|
+
# Centralizes and unifies the handling of validation-related errors.
|
5
|
+
def self.for(type, problem, reason = nil)
|
6
|
+
types.fetch(type).new(problem, reason)
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.types
|
10
|
+
@types ||= {
|
11
|
+
backref: InvalidBackrefError,
|
12
|
+
group: InvalidGroupError,
|
13
|
+
group_option: InvalidGroupOption,
|
14
|
+
posix_class: UnknownPosixClassError,
|
15
|
+
property: UnknownUnicodePropertyError,
|
16
|
+
sequence: InvalidSequenceError,
|
17
|
+
}
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
# Invalid sequence format. Used for escape sequences, mainly.
|
22
|
+
class InvalidSequenceError < ValidationError
|
23
|
+
def initialize(what = 'sequence', where = '')
|
24
|
+
super "Invalid #{what} at #{where}"
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
# Invalid group. Used for named groups.
|
29
|
+
class InvalidGroupError < ValidationError
|
30
|
+
def initialize(what, reason)
|
31
|
+
super "Invalid #{what}, #{reason}."
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
# Invalid group option. Used for inline options.
|
36
|
+
# TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
|
37
|
+
class InvalidGroupOption < ValidationError
|
38
|
+
def initialize(option, text)
|
39
|
+
super "Invalid group option #{option} in #{text}"
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
# Invalid back reference. Used for named and numbered refs/calls.
|
44
|
+
class InvalidBackrefError < ValidationError
|
45
|
+
def initialize(what, reason)
|
46
|
+
super "Invalid back reference #{what}, #{reason}"
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
# The property name was not recognized by the scanner.
|
51
|
+
class UnknownUnicodePropertyError < ValidationError
|
52
|
+
def initialize(name, _)
|
53
|
+
super "Unknown unicode character property name #{name}"
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
# The POSIX class name was not recognized by the scanner.
|
58
|
+
class UnknownPosixClassError < ValidationError
|
59
|
+
def initialize(text, _)
|
60
|
+
super "Unknown POSIX class #{text}"
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
@@ -7,6 +7,8 @@ age=12.0,age=12.0
|
|
7
7
|
age=12.1,age=12.1
|
8
8
|
age=13.0,age=13.0
|
9
9
|
age=14.0,age=14.0
|
10
|
+
age=15.0,age=15.0
|
11
|
+
age=15.1,age=15.1
|
10
12
|
age=2.0,age=2.0
|
11
13
|
age=2.1,age=2.1
|
12
14
|
age=3.0,age=3.0
|
@@ -97,6 +99,7 @@ emojimodifierbase,emoji_modifier_base
|
|
97
99
|
emojipresentation,emoji_presentation
|
98
100
|
enclosingmark,enclosing_mark
|
99
101
|
ethiopic,ethiopic
|
102
|
+
extendedpictographic,extended_pictographic
|
100
103
|
extender,extender
|
101
104
|
finalpunctuation,final_punctuation
|
102
105
|
format,format
|
@@ -106,6 +109,19 @@ gothic,gothic
|
|
106
109
|
grantha,grantha
|
107
110
|
graph,graph
|
108
111
|
graphemebase,grapheme_base
|
112
|
+
graphemeclusterbreak=control,grapheme_cluster_break=control
|
113
|
+
graphemeclusterbreak=cr,grapheme_cluster_break=cr
|
114
|
+
graphemeclusterbreak=extend,grapheme_cluster_break=extend
|
115
|
+
graphemeclusterbreak=l,grapheme_cluster_break=l
|
116
|
+
graphemeclusterbreak=lf,grapheme_cluster_break=lf
|
117
|
+
graphemeclusterbreak=lv,grapheme_cluster_break=lv
|
118
|
+
graphemeclusterbreak=lvt,grapheme_cluster_break=lvt
|
119
|
+
graphemeclusterbreak=prepend,grapheme_cluster_break=prepend
|
120
|
+
graphemeclusterbreak=regionalindicator,grapheme_cluster_break=regional_indicator
|
121
|
+
graphemeclusterbreak=spacingmark,grapheme_cluster_break=spacingmark
|
122
|
+
graphemeclusterbreak=t,grapheme_cluster_break=t
|
123
|
+
graphemeclusterbreak=v,grapheme_cluster_break=v
|
124
|
+
graphemeclusterbreak=zwj,grapheme_cluster_break=zwj
|
109
125
|
graphemeextend,grapheme_extend
|
110
126
|
graphemelink,grapheme_link
|
111
127
|
greek,greek
|
@@ -121,11 +137,14 @@ hebrew,hebrew
|
|
121
137
|
hexdigit,hex_digit
|
122
138
|
hiragana,hiragana
|
123
139
|
hyphen,hyphen
|
140
|
+
idcompatmathcontinue,id_compat_math_continue
|
141
|
+
idcompatmathstart,id_compat_math_start
|
124
142
|
idcontinue,id_continue
|
125
143
|
ideographic,ideographic
|
126
144
|
idsbinaryoperator,ids_binary_operator
|
127
145
|
idstart,id_start
|
128
146
|
idstrinaryoperator,ids_trinary_operator
|
147
|
+
idsunaryoperator,ids_unary_operator
|
129
148
|
imperialaramaic,imperial_aramaic
|
130
149
|
inadlam,in_adlam
|
131
150
|
inaegeannumbers,in_aegean_numbers
|
@@ -139,6 +158,7 @@ inancientsymbols,in_ancient_symbols
|
|
139
158
|
inarabic,in_arabic
|
140
159
|
inarabicextendeda,in_arabic_extended_a
|
141
160
|
inarabicextendedb,in_arabic_extended_b
|
161
|
+
inarabicextendedc,in_arabic_extended_c
|
142
162
|
inarabicmathematicalalphabeticsymbols,in_arabic_mathematical_alphabetic_symbols
|
143
163
|
inarabicpresentationformsa,in_arabic_presentation_forms_a
|
144
164
|
inarabicpresentationformsb,in_arabic_presentation_forms_b
|
@@ -186,6 +206,8 @@ incjkunifiedideographsextensiond,in_cjk_unified_ideographs_extension_d
|
|
186
206
|
incjkunifiedideographsextensione,in_cjk_unified_ideographs_extension_e
|
187
207
|
incjkunifiedideographsextensionf,in_cjk_unified_ideographs_extension_f
|
188
208
|
incjkunifiedideographsextensiong,in_cjk_unified_ideographs_extension_g
|
209
|
+
incjkunifiedideographsextensionh,in_cjk_unified_ideographs_extension_h
|
210
|
+
incjkunifiedideographsextensioni,in_cjk_unified_ideographs_extension_i
|
189
211
|
incombiningdiacriticalmarks,in_combining_diacritical_marks
|
190
212
|
incombiningdiacriticalmarksextended,in_combining_diacritical_marks_extended
|
191
213
|
incombiningdiacriticalmarksforsymbols,in_combining_diacritical_marks_for_symbols
|
@@ -205,10 +227,12 @@ incyrillic,in_cyrillic
|
|
205
227
|
incyrillicextendeda,in_cyrillic_extended_a
|
206
228
|
incyrillicextendedb,in_cyrillic_extended_b
|
207
229
|
incyrillicextendedc,in_cyrillic_extended_c
|
230
|
+
incyrillicextendedd,in_cyrillic_extended_d
|
208
231
|
incyrillicsupplement,in_cyrillic_supplement
|
209
232
|
indeseret,in_deseret
|
210
233
|
indevanagari,in_devanagari
|
211
234
|
indevanagariextended,in_devanagari_extended
|
235
|
+
indevanagariextendeda,in_devanagari_extended_a
|
212
236
|
indingbats,in_dingbats
|
213
237
|
indivesakuru,in_dives_akuru
|
214
238
|
indogra,in_dogra
|
@@ -268,6 +292,7 @@ inipaextensions,in_ipa_extensions
|
|
268
292
|
initialpunctuation,initial_punctuation
|
269
293
|
injavanese,in_javanese
|
270
294
|
inkaithi,in_kaithi
|
295
|
+
inkaktoviknumerals,in_kaktovik_numerals
|
271
296
|
inkanaextendeda,in_kana_extended_a
|
272
297
|
inkanaextendedb,in_kana_extended_b
|
273
298
|
inkanasupplement,in_kana_supplement
|
@@ -276,6 +301,7 @@ inkangxiradicals,in_kangxi_radicals
|
|
276
301
|
inkannada,in_kannada
|
277
302
|
inkatakana,in_katakana
|
278
303
|
inkatakanaphoneticextensions,in_katakana_phonetic_extensions
|
304
|
+
inkawi,in_kawi
|
279
305
|
inkayahli,in_kayah_li
|
280
306
|
inkharoshthi,in_kharoshthi
|
281
307
|
inkhitansmallscript,in_khitan_small_script
|
@@ -339,6 +365,7 @@ inmyanmar,in_myanmar
|
|
339
365
|
inmyanmarextendeda,in_myanmar_extended_a
|
340
366
|
inmyanmarextendedb,in_myanmar_extended_b
|
341
367
|
innabataean,in_nabataean
|
368
|
+
innagmundari,in_nag_mundari
|
342
369
|
innandinagari,in_nandinagari
|
343
370
|
innewa,in_newa
|
344
371
|
innewtailue,in_new_tai_lue
|
@@ -457,6 +484,7 @@ joincontrol,join_control
|
|
457
484
|
kaithi,kaithi
|
458
485
|
kannada,kannada
|
459
486
|
katakana,katakana
|
487
|
+
kawi,kawi
|
460
488
|
kayahli,kayah_li
|
461
489
|
kharoshthi,kharoshthi
|
462
490
|
khitansmallscript,khitan_small_script
|
@@ -503,6 +531,7 @@ mro,mro
|
|
503
531
|
multani,multani
|
504
532
|
myanmar,myanmar
|
505
533
|
nabataean,nabataean
|
534
|
+
nagmundari,nag_mundari
|
506
535
|
nandinagari,nandinagari
|
507
536
|
newa,newa
|
508
537
|
newline,newline
|
@@ -57,6 +57,7 @@ emod,emoji_modifier
|
|
57
57
|
epres,emoji_presentation
|
58
58
|
ethi,ethiopic
|
59
59
|
ext,extender
|
60
|
+
extpict,extended_pictographic
|
60
61
|
geor,georgian
|
61
62
|
glag,glagolitic
|
62
63
|
gong,gunjala_gondi
|
@@ -85,6 +86,7 @@ ideo,ideographic
|
|
85
86
|
ids,id_start
|
86
87
|
idsb,ids_binary_operator
|
87
88
|
idst,ids_trinary_operator
|
89
|
+
idsu,ids_unary_operator
|
88
90
|
ital,old_italic
|
89
91
|
java,javanese
|
90
92
|
joinc,join_control
|
@@ -133,6 +135,7 @@ mtei,meetei_mayek
|
|
133
135
|
mult,multani
|
134
136
|
mymr,myanmar
|
135
137
|
n,number
|
138
|
+
nagm,nag_mundari
|
136
139
|
nand,nandinagari
|
137
140
|
narb,old_north_arabian
|
138
141
|
nbat,nabataean
|
@@ -20,7 +20,7 @@
|
|
20
20
|
name = text[3..-2].gsub(/[\^\s_\-]/, '').downcase
|
21
21
|
|
22
22
|
token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
|
23
|
-
|
23
|
+
raise ValidationError.for(:property, name) unless token
|
24
24
|
|
25
25
|
self.emit(type, token.to_sym, text)
|
26
26
|
|
@@ -30,11 +30,6 @@
|
|
30
30
|
|
31
31
|
class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
|
32
32
|
|
33
|
-
|
34
|
-
# these are not supported in ruby at the moment
|
35
|
-
collating_sequence = '[.' . (alpha | [\-])+ . '.]';
|
36
|
-
character_equivalent = '[=' . alpha . '=]';
|
37
|
-
|
38
33
|
line_anchor = beginning_of_line | end_of_line;
|
39
34
|
anchor_char = [AbBzZG];
|
40
35
|
|
@@ -83,10 +78,9 @@
|
|
83
78
|
# try to treat every other group head as options group, like Ruby
|
84
79
|
group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
|
85
80
|
|
86
|
-
group_ref = [gk];
|
87
81
|
group_name_id_ab = ([^!0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
|
88
82
|
group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
|
89
|
-
group_number = '-'? . [
|
83
|
+
group_number = '-'? . [0-9]+;
|
90
84
|
group_level = [+\-] . [0-9]+;
|
91
85
|
|
92
86
|
group_name = ('<' . group_name_id_ab? . '>') |
|
@@ -95,15 +89,11 @@
|
|
95
89
|
|
96
90
|
group_named = ('?' . group_name );
|
97
91
|
|
98
|
-
|
99
|
-
|
100
|
-
group_name_call = 'g' . (('<' . group_name_id_ab? . group_level? '>') |
|
101
|
-
("'" . group_name_id_sq? . group_level? "'"));
|
92
|
+
group_ref_body = (('<' . (group_name_id_ab? | group_number) . group_level? '>') |
|
93
|
+
("'" . (group_name_id_sq? | group_number) . group_level? "'"));
|
102
94
|
|
103
|
-
|
104
|
-
|
105
|
-
group_number_call = 'g' . (('<' . ((group_number . group_level?) | '0') '>') |
|
106
|
-
("'" . ((group_number . group_level?) | '0') "'"));
|
95
|
+
group_ref = 'k' . group_ref_body;
|
96
|
+
group_call = 'g' . group_ref_body;
|
107
97
|
|
108
98
|
group_type = group_atomic | group_passive | group_absence | group_named;
|
109
99
|
|
@@ -134,13 +124,13 @@
|
|
134
124
|
# EOF error, used where it can be detected
|
135
125
|
action premature_end_error {
|
136
126
|
text = copy(data, ts ? ts-1 : 0, -1)
|
137
|
-
raise PrematureEndError.new(
|
127
|
+
raise PrematureEndError.new(text)
|
138
128
|
}
|
139
129
|
|
140
130
|
# Invalid sequence error, used from sequences, like escapes and sets
|
141
131
|
action invalid_sequence_error {
|
142
132
|
text = copy(data, ts ? ts-1 : 0, -1)
|
143
|
-
|
133
|
+
raise ValidationError.for(:sequence, 'sequence', text)
|
144
134
|
}
|
145
135
|
|
146
136
|
# group (nesting) and set open/close actions
|
@@ -221,20 +211,12 @@
|
|
221
211
|
end
|
222
212
|
|
223
213
|
unless self.class.posix_classes.include?(class_name)
|
224
|
-
|
214
|
+
raise ValidationError.for(:posix_class, text)
|
225
215
|
end
|
226
216
|
|
227
217
|
emit(type, class_name.to_sym, text)
|
228
218
|
};
|
229
219
|
|
230
|
-
# These are not supported in ruby at the moment. Enable them if they are.
|
231
|
-
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
232
|
-
# emit(:set, :collation, copy(data, ts, te))
|
233
|
-
# };
|
234
|
-
# character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
235
|
-
# emit(:set, :equivalent, copy(data, ts, te))
|
236
|
-
# };
|
237
|
-
|
238
220
|
meta_char > (set_meta, 1) {
|
239
221
|
emit(:literal, :literal, copy(data, ts, te))
|
240
222
|
};
|
@@ -285,6 +267,13 @@
|
|
285
267
|
fret;
|
286
268
|
};
|
287
269
|
|
270
|
+
[8-9] . [0-9] { # special case, emits two tokens
|
271
|
+
text = copy(data, ts-1, te)
|
272
|
+
emit(:escape, :literal, text[0, 2])
|
273
|
+
emit(:literal, :literal, text[2])
|
274
|
+
fret;
|
275
|
+
};
|
276
|
+
|
288
277
|
meta_char {
|
289
278
|
case text = copy(data, ts-1, te)
|
290
279
|
when '\.'; emit(:escape, :dot, text)
|
@@ -375,6 +364,7 @@
|
|
375
364
|
conditional_expression := |*
|
376
365
|
group_lookup . ')' {
|
377
366
|
text = copy(data, ts, te-1)
|
367
|
+
text =~ /[^0]/ or raise ValidationError.for(:backref, 'condition', 'invalid ref ID')
|
378
368
|
emit(:conditional, :condition, text)
|
379
369
|
emit(:conditional, :condition_close, ')')
|
380
370
|
};
|
@@ -457,10 +447,9 @@
|
|
457
447
|
|
458
448
|
# (?#...) comments: parsed as a single expression, without introducing a
|
459
449
|
# new nesting level. Comments may not include parentheses, escaped or not.
|
460
|
-
# special case for close
|
461
|
-
# correct closing count.
|
450
|
+
# special case for close to get the correct closing count.
|
462
451
|
# ------------------------------------------------------------------------
|
463
|
-
group_open . group_comment
|
452
|
+
(group_open . group_comment) @group_closed {
|
464
453
|
emit(:group, :comment, copy(data, ts, te))
|
465
454
|
};
|
466
455
|
|
@@ -475,10 +464,10 @@
|
|
475
464
|
#
|
476
465
|
# (?imxdau-imx:subexp) option on/off for subexp
|
477
466
|
# ------------------------------------------------------------------------
|
478
|
-
group_open . group_options >group_opened {
|
467
|
+
(group_open . group_options) >group_opened {
|
479
468
|
text = copy(data, ts, te)
|
480
469
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
481
|
-
|
470
|
+
raise ValidationError.for(:group_option, $1 || "-#{$2}", text)
|
482
471
|
end
|
483
472
|
emit_options(text)
|
484
473
|
};
|
@@ -489,7 +478,7 @@
|
|
489
478
|
# (?<=subexp) look-behind
|
490
479
|
# (?<!subexp) negative look-behind
|
491
480
|
# ------------------------------------------------------------------------
|
492
|
-
group_open . assertion_type >group_opened {
|
481
|
+
(group_open . assertion_type) >group_opened {
|
493
482
|
case text = copy(data, ts, te)
|
494
483
|
when '(?='; emit(:assertion, :lookahead, text)
|
495
484
|
when '(?!'; emit(:assertion, :nlookahead, text)
|
@@ -506,14 +495,14 @@
|
|
506
495
|
# (?'name'subexp) named group (single quoted version)
|
507
496
|
# (subexp) captured group
|
508
497
|
# ------------------------------------------------------------------------
|
509
|
-
group_open . group_type >group_opened {
|
498
|
+
(group_open . group_type) >group_opened {
|
510
499
|
case text = copy(data, ts, te)
|
511
500
|
when '(?:'; emit(:group, :passive, text)
|
512
501
|
when '(?>'; emit(:group, :atomic, text)
|
513
502
|
when '(?~'; emit(:group, :absence, text)
|
514
503
|
|
515
504
|
when /^\(\?(?:<>|'')/
|
516
|
-
|
505
|
+
raise ValidationError.for(:group, 'named group', 'name is empty')
|
517
506
|
|
518
507
|
when /^\(\?<[^>]+>/
|
519
508
|
emit(:group, :named_ab, text)
|
@@ -533,7 +522,7 @@
|
|
533
522
|
if conditional_stack.last == group_depth + 1
|
534
523
|
conditional_stack.pop
|
535
524
|
emit(:conditional, :close, ')')
|
536
|
-
|
525
|
+
elsif group_depth >= 0
|
537
526
|
if spacing_stack.length > 1 &&
|
538
527
|
spacing_stack.last[:depth] == group_depth + 1
|
539
528
|
spacing_stack.pop
|
@@ -541,41 +530,43 @@
|
|
541
530
|
end
|
542
531
|
|
543
532
|
emit(:group, :close, ')')
|
533
|
+
else
|
534
|
+
raise ValidationError.for(:group, 'group', 'unmatched close parenthesis')
|
544
535
|
end
|
545
536
|
};
|
546
537
|
|
547
538
|
|
548
539
|
# Group backreference, named and numbered
|
549
540
|
# ------------------------------------------------------------------------
|
550
|
-
backslash . (
|
541
|
+
backslash . (group_ref) > (backslashed, 4) {
|
551
542
|
case text = copy(data, ts, te)
|
552
|
-
when /^\\k(
|
553
|
-
validation_error(:backref, 'backreference', 'ref ID is empty')
|
554
|
-
when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/
|
543
|
+
when /^\\k(.)[^0-9\-][^+\-]*['>]$/
|
555
544
|
emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
|
556
|
-
when /^\\k(.)\d
|
545
|
+
when /^\\k(.)0*[1-9]\d*['>]$/
|
557
546
|
emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
|
558
|
-
when /^\\k(.)
|
547
|
+
when /^\\k(.)-0*[1-9]\d*['>]$/
|
559
548
|
emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
|
560
|
-
when /^\\k(.)[
|
549
|
+
when /^\\k(.)[^0-9\-].*[+\-]\d+['>]$/
|
561
550
|
emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
|
562
|
-
when /^\\k(.)
|
551
|
+
when /^\\k(.)-?0*[1-9]\d*[+\-]\d+['>]$/
|
563
552
|
emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
|
553
|
+
else
|
554
|
+
raise ValidationError.for(:backref, 'backreference', 'invalid ref ID')
|
564
555
|
end
|
565
556
|
};
|
566
557
|
|
567
558
|
# Group call, named and numbered
|
568
559
|
# ------------------------------------------------------------------------
|
569
|
-
backslash . (
|
560
|
+
backslash . (group_call) > (backslashed, 4) {
|
570
561
|
case text = copy(data, ts, te)
|
571
|
-
when /^\\g(
|
572
|
-
validation_error(:backref, 'subexpression call', 'ref ID is empty')
|
573
|
-
when /^\\g(.)[^\p{digit}+\->][^+\-]*/
|
562
|
+
when /^\\g(.)[^0-9+\-].*['>]$/
|
574
563
|
emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
|
575
|
-
when /^\\g(.)\d
|
564
|
+
when /^\\g(.)(?:0|0*[1-9]\d*)['>]$/
|
576
565
|
emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
|
577
|
-
when /^\\g(.)[+-]\d
|
566
|
+
when /^\\g(.)[+-]0*[1-9]\d*/
|
578
567
|
emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
|
568
|
+
else
|
569
|
+
raise ValidationError.for(:backref, 'subexpression call', 'invalid ref ID')
|
579
570
|
end
|
580
571
|
};
|
581
572
|
|
@@ -649,72 +640,11 @@
|
|
649
640
|
*|;
|
650
641
|
}%%
|
651
642
|
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
require 'regexp_parser/error'
|
643
|
+
require_relative 'scanner/errors/scanner_error'
|
644
|
+
require_relative 'scanner/errors/premature_end_error'
|
645
|
+
require_relative 'scanner/errors/validation_error'
|
656
646
|
|
657
647
|
class Regexp::Scanner
|
658
|
-
# General scanner error (catch all)
|
659
|
-
class ScannerError < Regexp::Parser::Error; end
|
660
|
-
|
661
|
-
# Base for all scanner validation errors
|
662
|
-
class ValidationError < Regexp::Parser::Error
|
663
|
-
def initialize(reason)
|
664
|
-
super reason
|
665
|
-
end
|
666
|
-
end
|
667
|
-
|
668
|
-
# Unexpected end of pattern
|
669
|
-
class PrematureEndError < ScannerError
|
670
|
-
def initialize(where = '')
|
671
|
-
super "Premature end of pattern at #{where}"
|
672
|
-
end
|
673
|
-
end
|
674
|
-
|
675
|
-
# Invalid sequence format. Used for escape sequences, mainly.
|
676
|
-
class InvalidSequenceError < ValidationError
|
677
|
-
def initialize(what = 'sequence', where = '')
|
678
|
-
super "Invalid #{what} at #{where}"
|
679
|
-
end
|
680
|
-
end
|
681
|
-
|
682
|
-
# Invalid group. Used for named groups.
|
683
|
-
class InvalidGroupError < ValidationError
|
684
|
-
def initialize(what, reason)
|
685
|
-
super "Invalid #{what}, #{reason}."
|
686
|
-
end
|
687
|
-
end
|
688
|
-
|
689
|
-
# Invalid groupOption. Used for inline options.
|
690
|
-
# TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
|
691
|
-
class InvalidGroupOption < ValidationError
|
692
|
-
def initialize(option, text)
|
693
|
-
super "Invalid group option #{option} in #{text}"
|
694
|
-
end
|
695
|
-
end
|
696
|
-
|
697
|
-
# Invalid back reference. Used for name a number refs/calls.
|
698
|
-
class InvalidBackrefError < ValidationError
|
699
|
-
def initialize(what, reason)
|
700
|
-
super "Invalid back reference #{what}, #{reason}"
|
701
|
-
end
|
702
|
-
end
|
703
|
-
|
704
|
-
# The property name was not recognized by the scanner.
|
705
|
-
class UnknownUnicodePropertyError < ValidationError
|
706
|
-
def initialize(name)
|
707
|
-
super "Unknown unicode character property name #{name}"
|
708
|
-
end
|
709
|
-
end
|
710
|
-
|
711
|
-
# The POSIX class name was not recognized by the scanner.
|
712
|
-
class UnknownPosixClassError < ValidationError
|
713
|
-
def initialize(text)
|
714
|
-
super "Unknown POSIX class #{text}"
|
715
|
-
end
|
716
|
-
end
|
717
|
-
|
718
648
|
# Scans the given regular expression text, or Regexp object and collects the
|
719
649
|
# emitted token into an array that gets returned at the end. If a block is
|
720
650
|
# given, it gets called for each emitted token.
|
@@ -891,24 +821,8 @@ class Regexp::Scanner
|
|
891
821
|
|
892
822
|
def emit_meta_control_sequence(data, ts, te, token)
|
893
823
|
if data.last < 0x00 || data.last > 0x7F
|
894
|
-
|
824
|
+
raise ValidationError.for(:sequence, 'escape', token.to_s)
|
895
825
|
end
|
896
826
|
emit(:escape, token, copy(data, ts-1, te))
|
897
827
|
end
|
898
|
-
|
899
|
-
# Centralizes and unifies the handling of validation related
|
900
|
-
# errors.
|
901
|
-
def validation_error(type, what, reason = nil)
|
902
|
-
error =
|
903
|
-
case type
|
904
|
-
when :backref then InvalidBackrefError.new(what, reason)
|
905
|
-
when :group then InvalidGroupError.new(what, reason)
|
906
|
-
when :group_option then InvalidGroupOption.new(what, reason)
|
907
|
-
when :posix_class then UnknownPosixClassError.new(what)
|
908
|
-
when :property then UnknownUnicodePropertyError.new(what)
|
909
|
-
when :sequence then InvalidSequenceError.new(what, reason)
|
910
|
-
end
|
911
|
-
|
912
|
-
raise error # unless @@config.validation_ignore
|
913
|
-
end
|
914
828
|
end # module Regexp::Scanner
|