regexp_parser 1.5.0 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +59 -0
- data/Gemfile +3 -3
- data/README.md +14 -6
- data/Rakefile +3 -4
- data/lib/regexp_parser/expression.rb +6 -43
- data/lib/regexp_parser/expression/classes/conditional.rb +3 -2
- data/lib/regexp_parser/expression/classes/escape.rb +0 -4
- data/lib/regexp_parser/expression/methods/match.rb +13 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +1 -1
- data/lib/regexp_parser/expression/methods/options.rb +35 -0
- data/lib/regexp_parser/expression/methods/strfregexp.rb +0 -1
- data/lib/regexp_parser/expression/methods/tests.rb +6 -15
- data/lib/regexp_parser/expression/methods/traverse.rb +3 -1
- data/lib/regexp_parser/expression/sequence.rb +3 -2
- data/lib/regexp_parser/expression/sequence_operation.rb +2 -6
- data/lib/regexp_parser/lexer.rb +4 -25
- data/lib/regexp_parser/parser.rb +40 -33
- data/lib/regexp_parser/scanner.rb +1208 -1353
- data/lib/regexp_parser/scanner/char_type.rl +0 -3
- data/lib/regexp_parser/scanner/properties/long.yml +15 -1
- data/lib/regexp_parser/scanner/properties/short.yml +5 -0
- data/lib/regexp_parser/scanner/scanner.rl +116 -202
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +30 -0
- data/lib/regexp_parser/syntax/versions/2.6.2.rb +10 -0
- data/lib/regexp_parser/syntax/versions/2.6.3.rb +10 -0
- data/lib/regexp_parser/version.rb +1 -1
- data/spec/expression/base_spec.rb +14 -0
- data/spec/expression/methods/match_length_spec.rb +20 -0
- data/spec/expression/methods/match_spec.rb +25 -0
- data/spec/expression/methods/tests_spec.rb +2 -0
- data/spec/expression/methods/traverse_spec.rb +21 -0
- data/spec/expression/options_spec.rb +128 -0
- data/spec/expression/root_spec.rb +9 -0
- data/spec/expression/sequence_spec.rb +9 -0
- data/spec/lexer/conditionals_spec.rb +49 -119
- data/spec/lexer/delimiters_spec.rb +68 -0
- data/spec/lexer/escapes_spec.rb +8 -32
- data/spec/lexer/keep_spec.rb +5 -17
- data/spec/lexer/literals_spec.rb +73 -110
- data/spec/lexer/nesting_spec.rb +86 -117
- data/spec/lexer/refcalls_spec.rb +51 -50
- data/spec/parser/all_spec.rb +13 -1
- data/spec/parser/anchors_spec.rb +9 -23
- data/spec/parser/conditionals_spec.rb +9 -9
- data/spec/parser/errors_spec.rb +22 -43
- data/spec/parser/escapes_spec.rb +33 -44
- data/spec/parser/free_space_spec.rb +25 -4
- data/spec/parser/groups_spec.rb +98 -257
- data/spec/parser/keep_spec.rb +2 -15
- data/spec/parser/options_spec.rb +28 -0
- data/spec/parser/posix_classes_spec.rb +5 -24
- data/spec/parser/properties_spec.rb +42 -54
- data/spec/parser/quantifiers_spec.rb +42 -283
- data/spec/parser/refcalls_spec.rb +60 -185
- data/spec/parser/set/intersections_spec.rb +17 -17
- data/spec/parser/set/ranges_spec.rb +17 -17
- data/spec/parser/sets_spec.rb +5 -5
- data/spec/parser/types_spec.rb +11 -36
- data/spec/scanner/anchors_spec.rb +13 -28
- data/spec/scanner/conditionals_spec.rb +121 -173
- data/spec/scanner/delimiters_spec.rb +52 -0
- data/spec/scanner/errors_spec.rb +64 -87
- data/spec/scanner/escapes_spec.rb +53 -50
- data/spec/scanner/free_space_spec.rb +102 -165
- data/spec/scanner/groups_spec.rb +45 -64
- data/spec/scanner/keep_spec.rb +5 -28
- data/spec/scanner/literals_spec.rb +45 -81
- data/spec/scanner/meta_spec.rb +13 -33
- data/spec/scanner/options_spec.rb +36 -0
- data/spec/scanner/properties_spec.rb +43 -286
- data/spec/scanner/quantifiers_spec.rb +13 -28
- data/spec/scanner/refcalls_spec.rb +32 -48
- data/spec/scanner/sets_spec.rb +88 -102
- data/spec/scanner/types_spec.rb +10 -25
- data/spec/spec_helper.rb +1 -0
- data/spec/support/shared_examples.rb +77 -0
- data/spec/syntax/syntax_spec.rb +4 -0
- data/spec/syntax/versions/1.8.6_spec.rb +12 -33
- data/spec/syntax/versions/1.9.1_spec.rb +5 -18
- data/spec/syntax/versions/1.9.3_spec.rb +4 -17
- data/spec/syntax/versions/2.0.0_spec.rb +8 -23
- data/spec/syntax/versions/2.2.0_spec.rb +4 -17
- data/spec/syntax/versions/aliases_spec.rb +27 -109
- metadata +28 -10
- data/spec/scanner/scripts_spec.rb +0 -49
- data/spec/scanner/unicode_blocks_spec.rb +0 -28
@@ -21,9 +21,6 @@
|
|
21
21
|
when '\W'; emit(:type, :nonword, text, ts - 1, te)
|
22
22
|
when '\R'; emit(:type, :linebreak, text, ts - 1, te)
|
23
23
|
when '\X'; emit(:type, :xgrapheme, text, ts - 1, te)
|
24
|
-
else
|
25
|
-
raise ScannerError.new(
|
26
|
-
"Unexpected character in type at #{text} (char #{ts})")
|
27
24
|
end
|
28
25
|
fret;
|
29
26
|
};
|
@@ -6,6 +6,8 @@ adlam: adlam
|
|
6
6
|
age=1.1: age=1.1
|
7
7
|
age=10.0: age=10.0
|
8
8
|
age=11.0: age=11.0
|
9
|
+
age=12.0: age=12.0
|
10
|
+
age=12.1: age=12.1
|
9
11
|
age=2.0: age=2.0
|
10
12
|
age=2.1: age=2.1
|
11
13
|
age=3.0: age=3.0
|
@@ -64,7 +66,6 @@ changeswhenuppercased: changes_when_uppercased
|
|
64
66
|
cherokee: cherokee
|
65
67
|
closepunctuation: close_punctuation
|
66
68
|
cntrl: cntrl
|
67
|
-
combiningmark: combining_mark
|
68
69
|
common: common
|
69
70
|
connectorpunctuation: connector_punctuation
|
70
71
|
control: control
|
@@ -86,6 +87,7 @@ dogra: dogra
|
|
86
87
|
duployan: duployan
|
87
88
|
egyptianhieroglyphs: egyptian_hieroglyphs
|
88
89
|
elbasan: elbasan
|
90
|
+
elymaic: elymaic
|
89
91
|
emoji: emoji
|
90
92
|
emojicomponent: emoji_component
|
91
93
|
emojimodifier: emoji_modifier
|
@@ -206,8 +208,10 @@ indogra: in_dogra
|
|
206
208
|
indominotiles: in_domino_tiles
|
207
209
|
induployan: in_duployan
|
208
210
|
inearlydynasticcuneiform: in_early_dynastic_cuneiform
|
211
|
+
inegyptianhieroglyphformatcontrols: in_egyptian_hieroglyph_format_controls
|
209
212
|
inegyptianhieroglyphs: in_egyptian_hieroglyphs
|
210
213
|
inelbasan: in_elbasan
|
214
|
+
inelymaic: in_elymaic
|
211
215
|
inemoticons: in_emoticons
|
212
216
|
inenclosedalphanumerics: in_enclosed_alphanumerics
|
213
217
|
inenclosedalphanumericsupplement: in_enclosed_alphanumeric_supplement
|
@@ -322,12 +326,14 @@ inmyanmar: in_myanmar
|
|
322
326
|
inmyanmarextendeda: in_myanmar_extended_a
|
323
327
|
inmyanmarextendedb: in_myanmar_extended_b
|
324
328
|
innabataean: in_nabataean
|
329
|
+
innandinagari: in_nandinagari
|
325
330
|
innewa: in_newa
|
326
331
|
innewtailue: in_new_tai_lue
|
327
332
|
innko: in_nko
|
328
333
|
innoblock: in_no_block
|
329
334
|
innumberforms: in_number_forms
|
330
335
|
innushu: in_nushu
|
336
|
+
innyiakengpuachuehmong: in_nyiakeng_puachue_hmong
|
331
337
|
inogham: in_ogham
|
332
338
|
inolchiki: in_ol_chiki
|
333
339
|
inoldhungarian: in_old_hungarian
|
@@ -343,6 +349,7 @@ inoriya: in_oriya
|
|
343
349
|
inornamentaldingbats: in_ornamental_dingbats
|
344
350
|
inosage: in_osage
|
345
351
|
inosmanya: in_osmanya
|
352
|
+
inottomansiyaqnumbers: in_ottoman_siyaq_numbers
|
346
353
|
inpahawhhmong: in_pahawh_hmong
|
347
354
|
inpalmyrene: in_palmyrene
|
348
355
|
inpaucinhau: in_pau_cin_hau
|
@@ -368,6 +375,7 @@ insiddham: in_siddham
|
|
368
375
|
insinhala: in_sinhala
|
369
376
|
insinhalaarchaicnumbers: in_sinhala_archaic_numbers
|
370
377
|
insmallformvariants: in_small_form_variants
|
378
|
+
insmallkanaextension: in_small_kana_extension
|
371
379
|
insogdian: in_sogdian
|
372
380
|
insorasompeng: in_sora_sompeng
|
373
381
|
insoyombo: in_soyombo
|
@@ -386,6 +394,7 @@ insupplementaryprivateuseareaa: in_supplementary_private_use_area_a
|
|
386
394
|
insupplementaryprivateuseareab: in_supplementary_private_use_area_b
|
387
395
|
insuttonsignwriting: in_sutton_signwriting
|
388
396
|
insylotinagri: in_syloti_nagri
|
397
|
+
insymbolsandpictographsextendeda: in_symbols_and_pictographs_extended_a
|
389
398
|
insyriac: in_syriac
|
390
399
|
insyriacsupplement: in_syriac_supplement
|
391
400
|
intagalog: in_tagalog
|
@@ -397,6 +406,7 @@ intaiviet: in_tai_viet
|
|
397
406
|
intaixuanjingsymbols: in_tai_xuan_jing_symbols
|
398
407
|
intakri: in_takri
|
399
408
|
intamil: in_tamil
|
409
|
+
intamilsupplement: in_tamil_supplement
|
400
410
|
intangut: in_tangut
|
401
411
|
intangutcomponents: in_tangut_components
|
402
412
|
intelugu: in_telugu
|
@@ -414,6 +424,7 @@ invariationselectors: in_variation_selectors
|
|
414
424
|
invariationselectorssupplement: in_variation_selectors_supplement
|
415
425
|
invedicextensions: in_vedic_extensions
|
416
426
|
inverticalforms: in_vertical_forms
|
427
|
+
inwancho: in_wancho
|
417
428
|
inwarangciti: in_warang_citi
|
418
429
|
inyijinghexagramsymbols: in_yijing_hexagram_symbols
|
419
430
|
inyiradicals: in_yi_radicals
|
@@ -469,6 +480,7 @@ mro: mro
|
|
469
480
|
multani: multani
|
470
481
|
myanmar: myanmar
|
471
482
|
nabataean: nabataean
|
483
|
+
nandinagari: nandinagari
|
472
484
|
newa: newa
|
473
485
|
newline: newline
|
474
486
|
newtailue: new_tai_lue
|
@@ -477,6 +489,7 @@ noncharactercodepoint: noncharacter_code_point
|
|
477
489
|
nonspacingmark: nonspacing_mark
|
478
490
|
number: number
|
479
491
|
nushu: nushu
|
492
|
+
nyiakengpuachuehmong: nyiakeng_puachue_hmong
|
480
493
|
ogham: ogham
|
481
494
|
olchiki: ol_chiki
|
482
495
|
oldhungarian: old_hungarian
|
@@ -569,6 +582,7 @@ uppercase: uppercase
|
|
569
582
|
uppercaseletter: uppercase_letter
|
570
583
|
vai: vai
|
571
584
|
variationselector: variation_selector
|
585
|
+
wancho: wancho
|
572
586
|
warangciti: warang_citi
|
573
587
|
whitespace: white_space
|
574
588
|
word: word
|
@@ -31,6 +31,7 @@ cher: cherokee
|
|
31
31
|
ci: case_ignorable
|
32
32
|
cn: unassigned
|
33
33
|
co: private_use
|
34
|
+
combiningmark: mark
|
34
35
|
copt: coptic
|
35
36
|
cprt: cypriot
|
36
37
|
cs: surrogate
|
@@ -49,6 +50,7 @@ dsrt: deseret
|
|
49
50
|
dupl: duployan
|
50
51
|
egyp: egyptian_hieroglyphs
|
51
52
|
elba: elbasan
|
53
|
+
elym: elymaic
|
52
54
|
ethi: ethiopic
|
53
55
|
ext: extender
|
54
56
|
geor: georgian
|
@@ -72,6 +74,7 @@ hex: hex_digit
|
|
72
74
|
hira: hiragana
|
73
75
|
hluw: anatolian_hieroglyphs
|
74
76
|
hmng: pahawh_hmong
|
77
|
+
hmnp: nyiakeng_puachue_hmong
|
75
78
|
hung: old_hungarian
|
76
79
|
idc: id_continue
|
77
80
|
ideo: ideographic
|
@@ -125,6 +128,7 @@ mtei: meetei_mayek
|
|
125
128
|
mult: multani
|
126
129
|
mymr: myanmar
|
127
130
|
n: number
|
131
|
+
nand: nandinagari
|
128
132
|
narb: old_north_arabian
|
129
133
|
nbat: nabataean
|
130
134
|
nchar: noncharacter_code_point
|
@@ -216,6 +220,7 @@ uideo: unified_ideograph
|
|
216
220
|
vaii: vai
|
217
221
|
vs: variation_selector
|
218
222
|
wara: warang_citi
|
223
|
+
wcho: wancho
|
219
224
|
wspace: white_space
|
220
225
|
xidc: xid_continue
|
221
226
|
xids: xid_start
|
@@ -21,7 +21,7 @@
|
|
21
21
|
set_close = ']';
|
22
22
|
brackets = set_open | set_close;
|
23
23
|
|
24
|
-
comment = ('#' . [^\n]* . '\n');
|
24
|
+
comment = ('#' . [^\n]* . '\n'?);
|
25
25
|
|
26
26
|
class_name_posix = 'alnum' | 'alpha' | 'blank' |
|
27
27
|
'cntrl' | 'digit' | 'graph' |
|
@@ -49,9 +49,9 @@
|
|
49
49
|
codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
|
50
50
|
codepoint_sequence = codepoint_single | codepoint_list;
|
51
51
|
|
52
|
-
control_sequence = ('c' | 'C-') . (backslash . 'M-')
|
52
|
+
control_sequence = ('c' | 'C-') . (backslash . 'M-')? . backslash? . any;
|
53
53
|
|
54
|
-
meta_sequence = 'M-' . (backslash .
|
54
|
+
meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
|
55
55
|
|
56
56
|
zero_or_one = '?' | '??' | '?+';
|
57
57
|
zero_or_more = '*' | '*?' | '*+';
|
@@ -62,13 +62,17 @@
|
|
62
62
|
quantifier_possessive = '?+' | '*+' | '++';
|
63
63
|
quantifier_mode = '?' | '+';
|
64
64
|
|
65
|
-
|
66
|
-
|
65
|
+
quantity_exact = (digit+);
|
66
|
+
quantity_minimum = (digit+) . ',';
|
67
|
+
quantity_maximum = ',' . (digit+);
|
68
|
+
quantity_range = (digit+) . ',' . (digit+);
|
69
|
+
quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
|
70
|
+
quantity_maximum | quantity_range ) . range_close .
|
71
|
+
quantifier_mode?;
|
67
72
|
|
68
73
|
quantifiers = quantifier_greedy | quantifier_reluctant |
|
69
74
|
quantifier_possessive | quantifier_interval;
|
70
75
|
|
71
|
-
|
72
76
|
conditional = '(?(';
|
73
77
|
|
74
78
|
group_comment = '?#' . [^)]* . group_close;
|
@@ -82,7 +86,8 @@
|
|
82
86
|
assertion_lookbehind = '?<=';
|
83
87
|
assertion_nlookbehind = '?<!';
|
84
88
|
|
85
|
-
|
89
|
+
# try to treat every other group head as options group, like Ruby
|
90
|
+
group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
|
86
91
|
|
87
92
|
group_ref = [gk];
|
88
93
|
group_name_char = (alnum | '_');
|
@@ -113,7 +118,9 @@
|
|
113
118
|
curlies | parantheses | brackets |
|
114
119
|
line_anchor | quantifier_greedy;
|
115
120
|
|
116
|
-
|
121
|
+
literal_delimiters = ']' | '}';
|
122
|
+
|
123
|
+
ascii_print = ((0x20..0x7e) - meta_char - '#');
|
117
124
|
ascii_nonprint = (0x01..0x1f | 0x7f);
|
118
125
|
|
119
126
|
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
@@ -121,7 +128,7 @@
|
|
121
128
|
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
122
129
|
|
123
130
|
non_literal_escape = char_type_char | anchor_char | escaped_ascii |
|
124
|
-
|
131
|
+
keep_mark | [xucCM];
|
125
132
|
|
126
133
|
non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
|
127
134
|
multi_codepoint_char_type | [0-9cCM];
|
@@ -135,41 +142,35 @@
|
|
135
142
|
# Invalid sequence error, used from sequences, like escapes and sets
|
136
143
|
action invalid_sequence_error {
|
137
144
|
text = ts ? copy(data, ts-1..-1) : data.pack('c*')
|
138
|
-
|
145
|
+
validation_error(:sequence, 'sequence', text)
|
139
146
|
}
|
140
147
|
|
141
148
|
# group (nesting) and set open/close actions
|
142
|
-
action group_opened { self.group_depth = group_depth + 1
|
143
|
-
action group_closed { self.group_depth = group_depth - 1
|
149
|
+
action group_opened { self.group_depth = group_depth + 1 }
|
150
|
+
action group_closed { self.group_depth = group_depth - 1 }
|
151
|
+
action set_opened { self.set_depth = set_depth + 1 }
|
152
|
+
action set_closed { self.set_depth = set_depth - 1 }
|
144
153
|
|
145
154
|
# Character set scanner, continues consuming characters until it meets the
|
146
155
|
# closing bracket of the set.
|
147
156
|
# --------------------------------------------------------------------------
|
148
157
|
character_set := |*
|
149
|
-
set_close > (set_meta, 2) {
|
150
|
-
set_depth -= 1
|
151
|
-
in_set = set_depth > 0 ? true : false
|
152
|
-
|
158
|
+
set_close > (set_meta, 2) @set_closed {
|
153
159
|
emit(:set, :close, *text(data, ts, te))
|
154
|
-
|
155
|
-
if set_depth == 0
|
156
|
-
fgoto main;
|
157
|
-
else
|
160
|
+
if in_set?
|
158
161
|
fret;
|
162
|
+
else
|
163
|
+
fgoto main;
|
159
164
|
end
|
160
165
|
};
|
161
166
|
|
162
|
-
'-]' { # special case, emits two tokens
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
emit(:literal, :literal, copy(data, ts..te-2), ts, te)
|
167
|
-
emit(:set, :close, copy(data, ts+1..te-1), ts, te)
|
168
|
-
|
169
|
-
if set_depth == 0
|
170
|
-
fgoto main;
|
171
|
-
else
|
167
|
+
'-]' @set_closed { # special case, emits two tokens
|
168
|
+
emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
|
169
|
+
emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
|
170
|
+
if in_set?
|
172
171
|
fret;
|
172
|
+
else
|
173
|
+
fgoto main;
|
173
174
|
end
|
174
175
|
};
|
175
176
|
|
@@ -207,14 +208,12 @@
|
|
207
208
|
fcall set_escape_sequence;
|
208
209
|
};
|
209
210
|
|
210
|
-
set_open >(open_bracket, 1) {
|
211
|
-
set_depth += 1
|
212
|
-
|
211
|
+
set_open >(open_bracket, 1) >set_opened {
|
213
212
|
emit(:set, :open, *text(data, ts, te))
|
214
213
|
fcall character_set;
|
215
214
|
};
|
216
215
|
|
217
|
-
class_posix >(open_bracket, 1) @eof(premature_end_error)
|
216
|
+
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
218
217
|
text = text(data, ts, te).first
|
219
218
|
|
220
219
|
type = :posixclass
|
@@ -227,11 +226,11 @@
|
|
227
226
|
emit(type, class_name.to_sym, text, ts, te)
|
228
227
|
};
|
229
228
|
|
230
|
-
collating_sequence >(open_bracket, 1) @eof(premature_end_error)
|
229
|
+
collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
231
230
|
emit(:set, :collation, *text(data, ts, te))
|
232
231
|
};
|
233
232
|
|
234
|
-
character_equivalent >(open_bracket, 1) @eof(premature_end_error)
|
233
|
+
character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
235
234
|
emit(:set, :equivalent, *text(data, ts, te))
|
236
235
|
};
|
237
236
|
|
@@ -337,44 +336,24 @@
|
|
337
336
|
};
|
338
337
|
|
339
338
|
control_sequence >(escaped_alpha, 4) $eof(premature_end_error) {
|
340
|
-
|
341
|
-
c = data[te].chr
|
342
|
-
if c =~ /[\x00-\x7F]/
|
343
|
-
emit(:escape, :control, copy(data, ts-1..te), ts-1, te+1)
|
344
|
-
p += 1
|
345
|
-
else
|
346
|
-
raise InvalidSequenceError.new("control sequence")
|
347
|
-
end
|
348
|
-
else
|
349
|
-
raise PrematureEndError.new("control sequence")
|
350
|
-
end
|
339
|
+
emit_meta_control_sequence(data, ts, te, :control)
|
351
340
|
fret;
|
352
341
|
};
|
353
342
|
|
354
343
|
meta_sequence >(backslashed, 3) $eof(premature_end_error) {
|
355
|
-
|
356
|
-
c = data[te].chr
|
357
|
-
if c =~ /[\x00-\x7F]/
|
358
|
-
emit(:escape, :meta_sequence, copy(data, ts-1..te), ts-1, te+1)
|
359
|
-
p += 1
|
360
|
-
else
|
361
|
-
raise InvalidSequenceError.new("meta sequence")
|
362
|
-
end
|
363
|
-
else
|
364
|
-
raise PrematureEndError.new("meta sequence")
|
365
|
-
end
|
344
|
+
emit_meta_control_sequence(data, ts, te, :meta_sequence)
|
366
345
|
fret;
|
367
346
|
};
|
368
347
|
|
369
348
|
char_type_char > (escaped_alpha, 2) {
|
370
349
|
fhold;
|
371
|
-
fnext *(in_set ? fentry(character_set) : fentry(main));
|
350
|
+
fnext *(in_set? ? fentry(character_set) : fentry(main));
|
372
351
|
fcall char_type;
|
373
352
|
};
|
374
353
|
|
375
354
|
property_char > (escaped_alpha, 2) {
|
376
355
|
fhold;
|
377
|
-
fnext *(in_set ? fentry(character_set) : fentry(main));
|
356
|
+
fnext *(in_set? ? fentry(character_set) : fentry(main));
|
378
357
|
fcall unicode_property;
|
379
358
|
};
|
380
359
|
|
@@ -412,8 +391,7 @@
|
|
412
391
|
};
|
413
392
|
|
414
393
|
alternation {
|
415
|
-
if
|
416
|
-
conditional_stack.last[1] == group_depth
|
394
|
+
if conditional_stack.last == group_depth
|
417
395
|
emit(:conditional, :separator, *text(data, ts, te))
|
418
396
|
else
|
419
397
|
emit(:meta, :alternation, *text(data, ts, te))
|
@@ -442,18 +420,16 @@
|
|
442
420
|
when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
|
443
421
|
when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
|
444
422
|
when '\\G'; emit(:anchor, :match_start, text, ts, te)
|
445
|
-
else
|
446
|
-
raise ScannerError.new(
|
447
|
-
"Unexpected character in anchor at #{text} (char #{ts})")
|
448
423
|
end
|
449
424
|
};
|
450
425
|
|
426
|
+
literal_delimiters {
|
427
|
+
append_literal(data, ts, te)
|
428
|
+
};
|
429
|
+
|
451
430
|
# Character sets
|
452
431
|
# ------------------------------------------------------------------------
|
453
|
-
set_open {
|
454
|
-
set_depth += 1
|
455
|
-
in_set = true
|
456
|
-
|
432
|
+
set_open >set_opened {
|
457
433
|
emit(:set, :open, *text(data, ts, te))
|
458
434
|
fcall character_set;
|
459
435
|
};
|
@@ -465,9 +441,7 @@
|
|
465
441
|
conditional {
|
466
442
|
text = text(data, ts, te).first
|
467
443
|
|
468
|
-
|
469
|
-
conditional_depth += 1
|
470
|
-
conditional_stack << [conditional_depth, group_depth]
|
444
|
+
conditional_stack << group_depth
|
471
445
|
|
472
446
|
emit(:conditional, :open, text[0..-2], ts, te-1)
|
473
447
|
emit(:conditional, :condition_open, '(', te-1, te)
|
@@ -496,7 +470,11 @@
|
|
496
470
|
# (?imxdau-imx:subexp) option on/off for subexp
|
497
471
|
# ------------------------------------------------------------------------
|
498
472
|
group_open . group_options >group_opened {
|
499
|
-
|
473
|
+
text = text(data, ts, te).first
|
474
|
+
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
475
|
+
raise InvalidGroupOption.new($1 || "-#{$2}", text)
|
476
|
+
end
|
477
|
+
emit_options(text, ts, te)
|
500
478
|
};
|
501
479
|
|
502
480
|
# Assertions
|
@@ -528,19 +506,15 @@
|
|
528
506
|
when '(?>'; emit(:group, :atomic, text, ts, te)
|
529
507
|
when '(?~'; emit(:group, :absence, text, ts, te)
|
530
508
|
|
531
|
-
when /^\(
|
532
|
-
|
509
|
+
when /^\(\?(?:<>|'')/
|
510
|
+
validation_error(:group, 'named group', 'name is empty')
|
533
511
|
|
512
|
+
when /^\(\?<\w*>/
|
534
513
|
emit(:group, :named_ab, text, ts, te)
|
535
514
|
|
536
|
-
when /^\(\?'
|
537
|
-
empty_name_error(:group, 'named group (sq)') if $1.empty?
|
538
|
-
|
515
|
+
when /^\(\?'\w*'/
|
539
516
|
emit(:group, :named_sq, text, ts, te)
|
540
517
|
|
541
|
-
else
|
542
|
-
raise ScannerError.new(
|
543
|
-
"Unknown subexpression group format '#{text}'")
|
544
518
|
end
|
545
519
|
};
|
546
520
|
|
@@ -550,20 +524,13 @@
|
|
550
524
|
};
|
551
525
|
|
552
526
|
group_close @group_closed {
|
553
|
-
if
|
554
|
-
conditional_stack.last[1] == (group_depth + 1)
|
555
|
-
|
556
|
-
emit(:conditional, :close, *text(data, ts, te))
|
527
|
+
if conditional_stack.last == group_depth + 1
|
557
528
|
conditional_stack.pop
|
558
|
-
|
559
|
-
if conditional_stack.length == 0
|
560
|
-
in_conditional = false
|
561
|
-
end
|
529
|
+
emit(:conditional, :close, *text(data, ts, te))
|
562
530
|
else
|
563
|
-
if spacing_stack.length > 1
|
564
|
-
|
531
|
+
if spacing_stack.length > 1 &&
|
532
|
+
spacing_stack.last[:depth] == group_depth + 1
|
565
533
|
spacing_stack.pop
|
566
|
-
|
567
534
|
self.free_spacing = spacing_stack.last[:free_spacing]
|
568
535
|
end
|
569
536
|
|
@@ -576,11 +543,8 @@
|
|
576
543
|
# ------------------------------------------------------------------------
|
577
544
|
backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
|
578
545
|
case text = text(data, ts, te).first
|
579
|
-
when /^\\([gk])
|
580
|
-
|
581
|
-
|
582
|
-
when /^\\([gk])''/ # single quotes
|
583
|
-
empty_backref_error("ref/call (sq)")
|
546
|
+
when /^\\([gk])(<>|'')/ # angle brackets
|
547
|
+
validation_error(:backref, 'ref/call', 'ref ID is empty')
|
584
548
|
|
585
549
|
when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
|
586
550
|
if $1 == 'k'
|
@@ -636,9 +600,6 @@
|
|
636
600
|
when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
|
637
601
|
emit(:backref, :number_recursion_ref_sq, text, ts, te)
|
638
602
|
|
639
|
-
else
|
640
|
-
raise ScannerError.new(
|
641
|
-
"Unknown backreference format '#{text}'")
|
642
603
|
end
|
643
604
|
};
|
644
605
|
|
@@ -669,10 +630,15 @@
|
|
669
630
|
end
|
670
631
|
};
|
671
632
|
|
672
|
-
quantifier_interval
|
633
|
+
quantifier_interval {
|
673
634
|
emit(:quantifier, :interval, *text(data, ts, te))
|
674
635
|
};
|
675
636
|
|
637
|
+
# Catch unmatched curly braces as literals
|
638
|
+
range_open {
|
639
|
+
append_literal(data, ts, te)
|
640
|
+
};
|
641
|
+
|
676
642
|
# Escaped sequences
|
677
643
|
# ------------------------------------------------------------------------
|
678
644
|
backslash > (backslashed, 1) {
|
@@ -771,22 +737,17 @@ class Regexp::Scanner
|
|
771
737
|
#
|
772
738
|
# This method may raise errors if a syntax error is encountered.
|
773
739
|
# --------------------------------------------------------------------------
|
774
|
-
def self.scan(input_object, &block)
|
775
|
-
new.scan(input_object, &block)
|
740
|
+
def self.scan(input_object, options: nil, &block)
|
741
|
+
new.scan(input_object, options: options, &block)
|
776
742
|
end
|
777
743
|
|
778
|
-
def scan(input_object, &block)
|
744
|
+
def scan(input_object, options: nil, &block)
|
779
745
|
self.literal = nil
|
780
746
|
stack = []
|
781
747
|
|
782
|
-
|
783
|
-
|
784
|
-
|
785
|
-
else
|
786
|
-
input = input_object
|
787
|
-
self.free_spacing = false
|
788
|
-
end
|
789
|
-
|
748
|
+
input = input_object.is_a?(Regexp) ? input_object.source : input_object
|
749
|
+
self.free_spacing = free_spacing?(input_object, options)
|
750
|
+
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
790
751
|
|
791
752
|
data = input.unpack("c*") if input.is_a?(String)
|
792
753
|
eof = data.length
|
@@ -794,15 +755,9 @@ class Regexp::Scanner
|
|
794
755
|
self.tokens = []
|
795
756
|
self.block = block_given? ? block : nil
|
796
757
|
|
797
|
-
self.
|
758
|
+
self.set_depth = 0
|
798
759
|
self.group_depth = 0
|
799
|
-
self.
|
800
|
-
|
801
|
-
in_set = false
|
802
|
-
set_depth = 0
|
803
|
-
in_conditional = false
|
804
|
-
conditional_depth = 0
|
805
|
-
conditional_stack = []
|
760
|
+
self.conditional_stack = []
|
806
761
|
|
807
762
|
%% write data;
|
808
763
|
%% write init;
|
@@ -817,9 +772,9 @@ class Regexp::Scanner
|
|
817
772
|
end
|
818
773
|
|
819
774
|
raise PrematureEndError.new("(missing group closing paranthesis) "+
|
820
|
-
"[#{
|
775
|
+
"[#{group_depth}]") if in_group?
|
821
776
|
raise PrematureEndError.new("(missing set closing bracket) "+
|
822
|
-
"[#{
|
777
|
+
"[#{set_depth}]") if in_set?
|
823
778
|
|
824
779
|
# when the entire expression is a literal run
|
825
780
|
emit_literal if literal
|
@@ -854,62 +809,27 @@ class Regexp::Scanner
|
|
854
809
|
|
855
810
|
private
|
856
811
|
|
857
|
-
attr_accessor :tokens, :literal, :block,
|
858
|
-
:
|
859
|
-
:free_spacing, :spacing_stack
|
860
|
-
|
861
|
-
# Ragel's regex-based scan of the group options introduced a lot of
|
862
|
-
# ambiguity, so we just ask it to find the beginning of what looks
|
863
|
-
# like an options run and handle the rest in here.
|
864
|
-
def scan_options(p, data, ts, te)
|
865
|
-
text = text(data, ts, te).first
|
866
|
-
|
867
|
-
options_char, options_length = true, 0
|
812
|
+
attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
|
813
|
+
:group_depth, :set_depth, :conditional_stack
|
868
814
|
|
869
|
-
|
870
|
-
|
871
|
-
|
872
|
-
while options_char
|
873
|
-
if data[te + options_length]
|
874
|
-
c = data[te + options_length].chr
|
875
|
-
|
876
|
-
if c =~ /[-mixdau]/
|
877
|
-
negative_options = true if c == '-'
|
878
|
-
|
879
|
-
raise InvalidGroupOption.new(c, text) if negative_options and
|
880
|
-
c =~ /[dau]/
|
881
|
-
|
882
|
-
text << c ; p += 1 ; options_length += 1
|
883
|
-
else
|
884
|
-
options_char = false
|
885
|
-
end
|
886
|
-
else
|
887
|
-
raise PrematureEndError.new("expression options `#{text}'")
|
888
|
-
end
|
815
|
+
def free_spacing?(input_object, options)
|
816
|
+
if options && !input_object.is_a?(String)
|
817
|
+
raise ArgumentError, 'options cannot be supplied unless scanning a String'
|
889
818
|
end
|
890
819
|
|
891
|
-
|
892
|
-
c = data[te + options_length].chr
|
820
|
+
options = input_object.options if input_object.is_a?(::Regexp)
|
893
821
|
|
894
|
-
|
895
|
-
# Include the ':' in the options text
|
896
|
-
text << c ; p += 1 ; options_length += 1
|
897
|
-
emit_options(text, ts, te + options_length)
|
822
|
+
return false unless options
|
898
823
|
|
899
|
-
|
900
|
-
|
901
|
-
emit_options(text, ts, te + options_length)
|
824
|
+
options & Regexp::EXTENDED != 0
|
825
|
+
end
|
902
826
|
|
903
|
-
|
904
|
-
|
905
|
-
|
906
|
-
"Unexpected `#{c}' in options sequence, ':' or ')' expected")
|
907
|
-
end
|
908
|
-
else
|
909
|
-
raise PrematureEndError.new("expression options `#{text}'")
|
910
|
-
end
|
827
|
+
def in_group?
|
828
|
+
group_depth > 0
|
829
|
+
end
|
911
830
|
|
912
|
-
|
831
|
+
def in_set?
|
832
|
+
set_depth > 0
|
913
833
|
end
|
914
834
|
|
915
835
|
# Copy from ts to te from data as text
|
@@ -945,32 +865,39 @@ class Regexp::Scanner
|
|
945
865
|
def emit_options(text, ts, te)
|
946
866
|
token = nil
|
947
867
|
|
948
|
-
|
949
|
-
|
868
|
+
# Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
|
869
|
+
text =~ /\(\?([mixdau]*)(-(?:[mix]*))*(:)?/
|
870
|
+
positive, negative, group_local = $1, $2, $3
|
950
871
|
|
951
|
-
|
952
|
-
|
953
|
-
|
872
|
+
if positive.include?('x')
|
873
|
+
self.free_spacing = true
|
874
|
+
end
|
954
875
|
|
955
|
-
|
956
|
-
|
957
|
-
|
958
|
-
|
959
|
-
|
876
|
+
# If the x appears in both, treat it like ruby does, the second cancels
|
877
|
+
# the first.
|
878
|
+
if negative && negative.include?('x')
|
879
|
+
self.free_spacing = false
|
880
|
+
end
|
960
881
|
|
961
|
-
|
962
|
-
|
963
|
-
|
964
|
-
|
965
|
-
|
966
|
-
|
967
|
-
|
968
|
-
end
|
882
|
+
if group_local
|
883
|
+
spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
|
884
|
+
token = :options
|
885
|
+
else
|
886
|
+
# switch for parent group level
|
887
|
+
spacing_stack.last[:free_spacing] = free_spacing
|
888
|
+
token = :options_switch
|
969
889
|
end
|
970
890
|
|
971
891
|
emit(:group, token, text, ts, te)
|
972
892
|
end
|
973
893
|
|
894
|
+
def emit_meta_control_sequence(data, ts, te, token)
|
895
|
+
if data.last < 0x00 || data.last > 0x7F
|
896
|
+
validation_error(:sequence, 'escape', token.to_s)
|
897
|
+
end
|
898
|
+
emit(:escape, token, *text(data, ts, te, 1))
|
899
|
+
end
|
900
|
+
|
974
901
|
# Centralizes and unifies the handling of validation related
|
975
902
|
# errors.
|
976
903
|
def validation_error(type, what, reason)
|
@@ -981,21 +908,8 @@ class Regexp::Scanner
|
|
981
908
|
error = InvalidBackrefError.new(what, reason)
|
982
909
|
when :sequence
|
983
910
|
error = InvalidSequenceError.new(what, reason)
|
984
|
-
else
|
985
|
-
error = ValidationError.new('expression')
|
986
911
|
end
|
987
912
|
|
988
913
|
raise error # unless @@config.validation_ignore
|
989
914
|
end
|
990
|
-
|
991
|
-
# Used for references with an empty name or number
|
992
|
-
def empty_backref_error(type, what)
|
993
|
-
validation_error(:backref, what, 'ref ID is empty')
|
994
|
-
end
|
995
|
-
|
996
|
-
# Used for named expressions with an empty name
|
997
|
-
def empty_name_error(type, what)
|
998
|
-
validation_error(type, what, 'name is empty')
|
999
|
-
end
|
1000
|
-
|
1001
915
|
end # module Regexp::Scanner
|