regexp_parser 1.4.0 → 1.7.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +66 -1
- data/Gemfile +3 -3
- data/README.md +11 -18
- data/Rakefile +3 -4
- data/lib/regexp_parser/expression.rb +28 -53
- data/lib/regexp_parser/expression/classes/backref.rb +18 -10
- data/lib/regexp_parser/expression/classes/conditional.rb +7 -2
- data/lib/regexp_parser/expression/classes/escape.rb +0 -4
- data/lib/regexp_parser/expression/classes/group.rb +4 -2
- data/lib/regexp_parser/expression/classes/keep.rb +1 -3
- data/lib/regexp_parser/expression/methods/match.rb +13 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +172 -0
- data/lib/regexp_parser/expression/methods/options.rb +35 -0
- data/lib/regexp_parser/expression/methods/strfregexp.rb +0 -1
- data/lib/regexp_parser/expression/methods/tests.rb +6 -15
- data/lib/regexp_parser/expression/methods/traverse.rb +3 -1
- data/lib/regexp_parser/expression/quantifier.rb +2 -2
- data/lib/regexp_parser/expression/sequence.rb +3 -6
- data/lib/regexp_parser/expression/sequence_operation.rb +2 -6
- data/lib/regexp_parser/expression/subexpression.rb +3 -5
- data/lib/regexp_parser/lexer.rb +30 -44
- data/lib/regexp_parser/parser.rb +47 -24
- data/lib/regexp_parser/scanner.rb +1228 -1367
- data/lib/regexp_parser/scanner/char_type.rl +0 -3
- data/lib/regexp_parser/scanner/properties/long.yml +15 -1
- data/lib/regexp_parser/scanner/properties/short.yml +5 -0
- data/lib/regexp_parser/scanner/scanner.rl +101 -194
- data/lib/regexp_parser/syntax/tokens.rb +2 -10
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +30 -0
- data/lib/regexp_parser/syntax/versions/2.6.2.rb +10 -0
- data/lib/regexp_parser/syntax/versions/2.6.3.rb +10 -0
- data/lib/regexp_parser/version.rb +1 -1
- data/regexp_parser.gemspec +2 -2
- data/spec/expression/base_spec.rb +94 -0
- data/spec/expression/clone_spec.rb +120 -0
- data/spec/expression/conditional_spec.rb +89 -0
- data/spec/expression/free_space_spec.rb +27 -0
- data/spec/expression/methods/match_length_spec.rb +161 -0
- data/spec/expression/methods/match_spec.rb +25 -0
- data/spec/expression/methods/strfregexp_spec.rb +224 -0
- data/spec/expression/methods/tests_spec.rb +99 -0
- data/spec/expression/methods/traverse_spec.rb +161 -0
- data/spec/expression/options_spec.rb +128 -0
- data/spec/expression/root_spec.rb +9 -0
- data/spec/expression/sequence_spec.rb +9 -0
- data/spec/expression/subexpression_spec.rb +50 -0
- data/spec/expression/to_h_spec.rb +26 -0
- data/spec/expression/to_s_spec.rb +100 -0
- data/spec/lexer/all_spec.rb +22 -0
- data/spec/lexer/conditionals_spec.rb +53 -0
- data/spec/lexer/delimiters_spec.rb +68 -0
- data/spec/lexer/escapes_spec.rb +14 -0
- data/spec/lexer/keep_spec.rb +10 -0
- data/spec/lexer/literals_spec.rb +89 -0
- data/spec/lexer/nesting_spec.rb +99 -0
- data/spec/lexer/refcalls_spec.rb +55 -0
- data/spec/parser/all_spec.rb +43 -0
- data/spec/parser/alternation_spec.rb +88 -0
- data/spec/parser/anchors_spec.rb +17 -0
- data/spec/parser/conditionals_spec.rb +179 -0
- data/spec/parser/errors_spec.rb +30 -0
- data/spec/parser/escapes_spec.rb +121 -0
- data/spec/parser/free_space_spec.rb +130 -0
- data/spec/parser/groups_spec.rb +108 -0
- data/spec/parser/keep_spec.rb +6 -0
- data/spec/parser/posix_classes_spec.rb +8 -0
- data/spec/parser/properties_spec.rb +115 -0
- data/spec/parser/quantifiers_spec.rb +52 -0
- data/spec/parser/refcalls_spec.rb +112 -0
- data/spec/parser/set/intersections_spec.rb +127 -0
- data/spec/parser/set/ranges_spec.rb +111 -0
- data/spec/parser/sets_spec.rb +178 -0
- data/spec/parser/types_spec.rb +18 -0
- data/spec/scanner/all_spec.rb +18 -0
- data/spec/scanner/anchors_spec.rb +21 -0
- data/spec/scanner/conditionals_spec.rb +128 -0
- data/spec/scanner/delimiters_spec.rb +52 -0
- data/spec/scanner/errors_spec.rb +67 -0
- data/spec/scanner/escapes_spec.rb +53 -0
- data/spec/scanner/free_space_spec.rb +133 -0
- data/spec/scanner/groups_spec.rb +52 -0
- data/spec/scanner/keep_spec.rb +10 -0
- data/spec/scanner/literals_spec.rb +49 -0
- data/spec/scanner/meta_spec.rb +18 -0
- data/spec/scanner/properties_spec.rb +64 -0
- data/spec/scanner/quantifiers_spec.rb +20 -0
- data/spec/scanner/refcalls_spec.rb +36 -0
- data/spec/scanner/sets_spec.rb +102 -0
- data/spec/scanner/types_spec.rb +14 -0
- data/spec/spec_helper.rb +15 -0
- data/{test → spec}/support/runner.rb +9 -8
- data/spec/support/shared_examples.rb +77 -0
- data/{test → spec}/support/warning_extractor.rb +5 -7
- data/spec/syntax/syntax_spec.rb +48 -0
- data/spec/syntax/syntax_token_map_spec.rb +23 -0
- data/spec/syntax/versions/1.8.6_spec.rb +17 -0
- data/spec/syntax/versions/1.9.1_spec.rb +10 -0
- data/spec/syntax/versions/1.9.3_spec.rb +9 -0
- data/spec/syntax/versions/2.0.0_spec.rb +13 -0
- data/spec/syntax/versions/2.2.0_spec.rb +9 -0
- data/spec/syntax/versions/aliases_spec.rb +37 -0
- data/spec/token/token_spec.rb +85 -0
- metadata +149 -144
- data/test/expression/test_all.rb +0 -12
- data/test/expression/test_base.rb +0 -90
- data/test/expression/test_clone.rb +0 -89
- data/test/expression/test_conditionals.rb +0 -113
- data/test/expression/test_free_space.rb +0 -35
- data/test/expression/test_set.rb +0 -84
- data/test/expression/test_strfregexp.rb +0 -230
- data/test/expression/test_subexpression.rb +0 -58
- data/test/expression/test_tests.rb +0 -99
- data/test/expression/test_to_h.rb +0 -59
- data/test/expression/test_to_s.rb +0 -104
- data/test/expression/test_traverse.rb +0 -161
- data/test/helpers.rb +0 -10
- data/test/lexer/test_all.rb +0 -41
- data/test/lexer/test_conditionals.rb +0 -127
- data/test/lexer/test_keep.rb +0 -24
- data/test/lexer/test_literals.rb +0 -130
- data/test/lexer/test_nesting.rb +0 -132
- data/test/lexer/test_refcalls.rb +0 -56
- data/test/parser/set/test_intersections.rb +0 -127
- data/test/parser/set/test_ranges.rb +0 -111
- data/test/parser/test_all.rb +0 -64
- data/test/parser/test_alternation.rb +0 -92
- data/test/parser/test_anchors.rb +0 -34
- data/test/parser/test_conditionals.rb +0 -187
- data/test/parser/test_errors.rb +0 -63
- data/test/parser/test_escapes.rb +0 -134
- data/test/parser/test_free_space.rb +0 -139
- data/test/parser/test_groups.rb +0 -289
- data/test/parser/test_keep.rb +0 -21
- data/test/parser/test_posix_classes.rb +0 -27
- data/test/parser/test_properties.rb +0 -134
- data/test/parser/test_quantifiers.rb +0 -301
- data/test/parser/test_refcalls.rb +0 -186
- data/test/parser/test_sets.rb +0 -179
- data/test/parser/test_types.rb +0 -50
- data/test/scanner/test_all.rb +0 -38
- data/test/scanner/test_anchors.rb +0 -38
- data/test/scanner/test_conditionals.rb +0 -184
- data/test/scanner/test_errors.rb +0 -91
- data/test/scanner/test_escapes.rb +0 -56
- data/test/scanner/test_free_space.rb +0 -200
- data/test/scanner/test_groups.rb +0 -79
- data/test/scanner/test_keep.rb +0 -35
- data/test/scanner/test_literals.rb +0 -89
- data/test/scanner/test_meta.rb +0 -40
- data/test/scanner/test_properties.rb +0 -312
- data/test/scanner/test_quantifiers.rb +0 -37
- data/test/scanner/test_refcalls.rb +0 -52
- data/test/scanner/test_scripts.rb +0 -53
- data/test/scanner/test_sets.rb +0 -119
- data/test/scanner/test_types.rb +0 -35
- data/test/scanner/test_unicode_blocks.rb +0 -30
- data/test/support/disable_autotest.rb +0 -8
- data/test/syntax/test_all.rb +0 -6
- data/test/syntax/test_syntax.rb +0 -61
- data/test/syntax/test_syntax_token_map.rb +0 -25
- data/test/syntax/versions/test_1.8.rb +0 -55
- data/test/syntax/versions/test_1.9.1.rb +0 -36
- data/test/syntax/versions/test_1.9.3.rb +0 -32
- data/test/syntax/versions/test_2.0.0.rb +0 -37
- data/test/syntax/versions/test_2.2.0.rb +0 -32
- data/test/syntax/versions/test_aliases.rb +0 -129
- data/test/syntax/versions/test_all.rb +0 -5
- data/test/test_all.rb +0 -5
- data/test/token/test_all.rb +0 -2
- data/test/token/test_token.rb +0 -107
@@ -21,9 +21,6 @@
|
|
21
21
|
when '\W'; emit(:type, :nonword, text, ts - 1, te)
|
22
22
|
when '\R'; emit(:type, :linebreak, text, ts - 1, te)
|
23
23
|
when '\X'; emit(:type, :xgrapheme, text, ts - 1, te)
|
24
|
-
else
|
25
|
-
raise ScannerError.new(
|
26
|
-
"Unexpected character in type at #{text} (char #{ts})")
|
27
24
|
end
|
28
25
|
fret;
|
29
26
|
};
|
@@ -6,6 +6,8 @@ adlam: adlam
|
|
6
6
|
age=1.1: age=1.1
|
7
7
|
age=10.0: age=10.0
|
8
8
|
age=11.0: age=11.0
|
9
|
+
age=12.0: age=12.0
|
10
|
+
age=12.1: age=12.1
|
9
11
|
age=2.0: age=2.0
|
10
12
|
age=2.1: age=2.1
|
11
13
|
age=3.0: age=3.0
|
@@ -64,7 +66,6 @@ changeswhenuppercased: changes_when_uppercased
|
|
64
66
|
cherokee: cherokee
|
65
67
|
closepunctuation: close_punctuation
|
66
68
|
cntrl: cntrl
|
67
|
-
combiningmark: combining_mark
|
68
69
|
common: common
|
69
70
|
connectorpunctuation: connector_punctuation
|
70
71
|
control: control
|
@@ -86,6 +87,7 @@ dogra: dogra
|
|
86
87
|
duployan: duployan
|
87
88
|
egyptianhieroglyphs: egyptian_hieroglyphs
|
88
89
|
elbasan: elbasan
|
90
|
+
elymaic: elymaic
|
89
91
|
emoji: emoji
|
90
92
|
emojicomponent: emoji_component
|
91
93
|
emojimodifier: emoji_modifier
|
@@ -206,8 +208,10 @@ indogra: in_dogra
|
|
206
208
|
indominotiles: in_domino_tiles
|
207
209
|
induployan: in_duployan
|
208
210
|
inearlydynasticcuneiform: in_early_dynastic_cuneiform
|
211
|
+
inegyptianhieroglyphformatcontrols: in_egyptian_hieroglyph_format_controls
|
209
212
|
inegyptianhieroglyphs: in_egyptian_hieroglyphs
|
210
213
|
inelbasan: in_elbasan
|
214
|
+
inelymaic: in_elymaic
|
211
215
|
inemoticons: in_emoticons
|
212
216
|
inenclosedalphanumerics: in_enclosed_alphanumerics
|
213
217
|
inenclosedalphanumericsupplement: in_enclosed_alphanumeric_supplement
|
@@ -322,12 +326,14 @@ inmyanmar: in_myanmar
|
|
322
326
|
inmyanmarextendeda: in_myanmar_extended_a
|
323
327
|
inmyanmarextendedb: in_myanmar_extended_b
|
324
328
|
innabataean: in_nabataean
|
329
|
+
innandinagari: in_nandinagari
|
325
330
|
innewa: in_newa
|
326
331
|
innewtailue: in_new_tai_lue
|
327
332
|
innko: in_nko
|
328
333
|
innoblock: in_no_block
|
329
334
|
innumberforms: in_number_forms
|
330
335
|
innushu: in_nushu
|
336
|
+
innyiakengpuachuehmong: in_nyiakeng_puachue_hmong
|
331
337
|
inogham: in_ogham
|
332
338
|
inolchiki: in_ol_chiki
|
333
339
|
inoldhungarian: in_old_hungarian
|
@@ -343,6 +349,7 @@ inoriya: in_oriya
|
|
343
349
|
inornamentaldingbats: in_ornamental_dingbats
|
344
350
|
inosage: in_osage
|
345
351
|
inosmanya: in_osmanya
|
352
|
+
inottomansiyaqnumbers: in_ottoman_siyaq_numbers
|
346
353
|
inpahawhhmong: in_pahawh_hmong
|
347
354
|
inpalmyrene: in_palmyrene
|
348
355
|
inpaucinhau: in_pau_cin_hau
|
@@ -368,6 +375,7 @@ insiddham: in_siddham
|
|
368
375
|
insinhala: in_sinhala
|
369
376
|
insinhalaarchaicnumbers: in_sinhala_archaic_numbers
|
370
377
|
insmallformvariants: in_small_form_variants
|
378
|
+
insmallkanaextension: in_small_kana_extension
|
371
379
|
insogdian: in_sogdian
|
372
380
|
insorasompeng: in_sora_sompeng
|
373
381
|
insoyombo: in_soyombo
|
@@ -386,6 +394,7 @@ insupplementaryprivateuseareaa: in_supplementary_private_use_area_a
|
|
386
394
|
insupplementaryprivateuseareab: in_supplementary_private_use_area_b
|
387
395
|
insuttonsignwriting: in_sutton_signwriting
|
388
396
|
insylotinagri: in_syloti_nagri
|
397
|
+
insymbolsandpictographsextendeda: in_symbols_and_pictographs_extended_a
|
389
398
|
insyriac: in_syriac
|
390
399
|
insyriacsupplement: in_syriac_supplement
|
391
400
|
intagalog: in_tagalog
|
@@ -397,6 +406,7 @@ intaiviet: in_tai_viet
|
|
397
406
|
intaixuanjingsymbols: in_tai_xuan_jing_symbols
|
398
407
|
intakri: in_takri
|
399
408
|
intamil: in_tamil
|
409
|
+
intamilsupplement: in_tamil_supplement
|
400
410
|
intangut: in_tangut
|
401
411
|
intangutcomponents: in_tangut_components
|
402
412
|
intelugu: in_telugu
|
@@ -414,6 +424,7 @@ invariationselectors: in_variation_selectors
|
|
414
424
|
invariationselectorssupplement: in_variation_selectors_supplement
|
415
425
|
invedicextensions: in_vedic_extensions
|
416
426
|
inverticalforms: in_vertical_forms
|
427
|
+
inwancho: in_wancho
|
417
428
|
inwarangciti: in_warang_citi
|
418
429
|
inyijinghexagramsymbols: in_yijing_hexagram_symbols
|
419
430
|
inyiradicals: in_yi_radicals
|
@@ -469,6 +480,7 @@ mro: mro
|
|
469
480
|
multani: multani
|
470
481
|
myanmar: myanmar
|
471
482
|
nabataean: nabataean
|
483
|
+
nandinagari: nandinagari
|
472
484
|
newa: newa
|
473
485
|
newline: newline
|
474
486
|
newtailue: new_tai_lue
|
@@ -477,6 +489,7 @@ noncharactercodepoint: noncharacter_code_point
|
|
477
489
|
nonspacingmark: nonspacing_mark
|
478
490
|
number: number
|
479
491
|
nushu: nushu
|
492
|
+
nyiakengpuachuehmong: nyiakeng_puachue_hmong
|
480
493
|
ogham: ogham
|
481
494
|
olchiki: ol_chiki
|
482
495
|
oldhungarian: old_hungarian
|
@@ -569,6 +582,7 @@ uppercase: uppercase
|
|
569
582
|
uppercaseletter: uppercase_letter
|
570
583
|
vai: vai
|
571
584
|
variationselector: variation_selector
|
585
|
+
wancho: wancho
|
572
586
|
warangciti: warang_citi
|
573
587
|
whitespace: white_space
|
574
588
|
word: word
|
@@ -31,6 +31,7 @@ cher: cherokee
|
|
31
31
|
ci: case_ignorable
|
32
32
|
cn: unassigned
|
33
33
|
co: private_use
|
34
|
+
combiningmark: mark
|
34
35
|
copt: coptic
|
35
36
|
cprt: cypriot
|
36
37
|
cs: surrogate
|
@@ -49,6 +50,7 @@ dsrt: deseret
|
|
49
50
|
dupl: duployan
|
50
51
|
egyp: egyptian_hieroglyphs
|
51
52
|
elba: elbasan
|
53
|
+
elym: elymaic
|
52
54
|
ethi: ethiopic
|
53
55
|
ext: extender
|
54
56
|
geor: georgian
|
@@ -72,6 +74,7 @@ hex: hex_digit
|
|
72
74
|
hira: hiragana
|
73
75
|
hluw: anatolian_hieroglyphs
|
74
76
|
hmng: pahawh_hmong
|
77
|
+
hmnp: nyiakeng_puachue_hmong
|
75
78
|
hung: old_hungarian
|
76
79
|
idc: id_continue
|
77
80
|
ideo: ideographic
|
@@ -125,6 +128,7 @@ mtei: meetei_mayek
|
|
125
128
|
mult: multani
|
126
129
|
mymr: myanmar
|
127
130
|
n: number
|
131
|
+
nand: nandinagari
|
128
132
|
narb: old_north_arabian
|
129
133
|
nbat: nabataean
|
130
134
|
nchar: noncharacter_code_point
|
@@ -216,6 +220,7 @@ uideo: unified_ideograph
|
|
216
220
|
vaii: vai
|
217
221
|
vs: variation_selector
|
218
222
|
wara: warang_citi
|
223
|
+
wcho: wancho
|
219
224
|
wspace: white_space
|
220
225
|
xidc: xid_continue
|
221
226
|
xids: xid_start
|
@@ -49,9 +49,9 @@
|
|
49
49
|
codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
|
50
50
|
codepoint_sequence = codepoint_single | codepoint_list;
|
51
51
|
|
52
|
-
control_sequence = ('c' | 'C-') . (backslash . 'M-')
|
52
|
+
control_sequence = ('c' | 'C-') . (backslash . 'M-')? . backslash? . any;
|
53
53
|
|
54
|
-
meta_sequence = 'M-' . (backslash .
|
54
|
+
meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
|
55
55
|
|
56
56
|
zero_or_one = '?' | '??' | '?+';
|
57
57
|
zero_or_more = '*' | '*?' | '*+';
|
@@ -62,13 +62,17 @@
|
|
62
62
|
quantifier_possessive = '?+' | '*+' | '++';
|
63
63
|
quantifier_mode = '?' | '+';
|
64
64
|
|
65
|
-
|
66
|
-
|
65
|
+
quantity_exact = (digit+);
|
66
|
+
quantity_minimum = (digit+) . ',';
|
67
|
+
quantity_maximum = ',' . (digit+);
|
68
|
+
quantity_range = (digit+) . ',' . (digit+);
|
69
|
+
quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
|
70
|
+
quantity_maximum | quantity_range ) . range_close .
|
71
|
+
quantifier_mode?;
|
67
72
|
|
68
73
|
quantifiers = quantifier_greedy | quantifier_reluctant |
|
69
74
|
quantifier_possessive | quantifier_interval;
|
70
75
|
|
71
|
-
|
72
76
|
conditional = '(?(';
|
73
77
|
|
74
78
|
group_comment = '?#' . [^)]* . group_close;
|
@@ -82,7 +86,8 @@
|
|
82
86
|
assertion_lookbehind = '?<=';
|
83
87
|
assertion_nlookbehind = '?<!';
|
84
88
|
|
85
|
-
|
89
|
+
# try to treat every other group head as options group, like Ruby
|
90
|
+
group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
|
86
91
|
|
87
92
|
group_ref = [gk];
|
88
93
|
group_name_char = (alnum | '_');
|
@@ -113,6 +118,8 @@
|
|
113
118
|
curlies | parantheses | brackets |
|
114
119
|
line_anchor | quantifier_greedy;
|
115
120
|
|
121
|
+
literal_delimiters = ']' | '}';
|
122
|
+
|
116
123
|
ascii_print = ((0x20..0x7e) - meta_char);
|
117
124
|
ascii_nonprint = (0x01..0x1f | 0x7f);
|
118
125
|
|
@@ -135,41 +142,35 @@
|
|
135
142
|
# Invalid sequence error, used from sequences, like escapes and sets
|
136
143
|
action invalid_sequence_error {
|
137
144
|
text = ts ? copy(data, ts-1..-1) : data.pack('c*')
|
138
|
-
|
145
|
+
validation_error(:sequence, 'sequence', text)
|
139
146
|
}
|
140
147
|
|
141
148
|
# group (nesting) and set open/close actions
|
142
|
-
action group_opened { self.group_depth = group_depth + 1
|
143
|
-
action group_closed { self.group_depth = group_depth - 1
|
149
|
+
action group_opened { self.group_depth = group_depth + 1 }
|
150
|
+
action group_closed { self.group_depth = group_depth - 1 }
|
151
|
+
action set_opened { self.set_depth = set_depth + 1 }
|
152
|
+
action set_closed { self.set_depth = set_depth - 1 }
|
144
153
|
|
145
154
|
# Character set scanner, continues consuming characters until it meets the
|
146
155
|
# closing bracket of the set.
|
147
156
|
# --------------------------------------------------------------------------
|
148
157
|
character_set := |*
|
149
|
-
set_close > (set_meta, 2) {
|
150
|
-
set_depth -= 1
|
151
|
-
in_set = set_depth > 0 ? true : false
|
152
|
-
|
158
|
+
set_close > (set_meta, 2) @set_closed {
|
153
159
|
emit(:set, :close, *text(data, ts, te))
|
154
|
-
|
155
|
-
if set_depth == 0
|
156
|
-
fgoto main;
|
157
|
-
else
|
160
|
+
if in_set?
|
158
161
|
fret;
|
162
|
+
else
|
163
|
+
fgoto main;
|
159
164
|
end
|
160
165
|
};
|
161
166
|
|
162
|
-
'-]' { # special case, emits two tokens
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
emit(:literal, :literal, copy(data, ts..te-2), ts, te)
|
167
|
-
emit(:set, :close, copy(data, ts+1..te-1), ts, te)
|
168
|
-
|
169
|
-
if set_depth == 0
|
170
|
-
fgoto main;
|
171
|
-
else
|
167
|
+
'-]' @set_closed { # special case, emits two tokens
|
168
|
+
emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
|
169
|
+
emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
|
170
|
+
if in_set?
|
172
171
|
fret;
|
172
|
+
else
|
173
|
+
fgoto main;
|
173
174
|
end
|
174
175
|
};
|
175
176
|
|
@@ -207,14 +208,12 @@
|
|
207
208
|
fcall set_escape_sequence;
|
208
209
|
};
|
209
210
|
|
210
|
-
set_open >(open_bracket, 1) {
|
211
|
-
set_depth += 1
|
212
|
-
|
211
|
+
set_open >(open_bracket, 1) >set_opened {
|
213
212
|
emit(:set, :open, *text(data, ts, te))
|
214
213
|
fcall character_set;
|
215
214
|
};
|
216
215
|
|
217
|
-
class_posix >(open_bracket, 1) @eof(premature_end_error)
|
216
|
+
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
218
217
|
text = text(data, ts, te).first
|
219
218
|
|
220
219
|
type = :posixclass
|
@@ -227,11 +226,11 @@
|
|
227
226
|
emit(type, class_name.to_sym, text, ts, te)
|
228
227
|
};
|
229
228
|
|
230
|
-
collating_sequence >(open_bracket, 1) @eof(premature_end_error)
|
229
|
+
collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
231
230
|
emit(:set, :collation, *text(data, ts, te))
|
232
231
|
};
|
233
232
|
|
234
|
-
character_equivalent >(open_bracket, 1) @eof(premature_end_error)
|
233
|
+
character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
235
234
|
emit(:set, :equivalent, *text(data, ts, te))
|
236
235
|
};
|
237
236
|
|
@@ -337,44 +336,24 @@
|
|
337
336
|
};
|
338
337
|
|
339
338
|
control_sequence >(escaped_alpha, 4) $eof(premature_end_error) {
|
340
|
-
|
341
|
-
c = data[te].chr
|
342
|
-
if c =~ /[\x00-\x7F]/
|
343
|
-
emit(:escape, :control, copy(data, ts-1..te), ts-1, te+1)
|
344
|
-
p += 1
|
345
|
-
else
|
346
|
-
raise InvalidSequenceError.new("control sequence")
|
347
|
-
end
|
348
|
-
else
|
349
|
-
raise PrematureEndError.new("control sequence")
|
350
|
-
end
|
339
|
+
emit_meta_control_sequence(data, ts, te, :control)
|
351
340
|
fret;
|
352
341
|
};
|
353
342
|
|
354
343
|
meta_sequence >(backslashed, 3) $eof(premature_end_error) {
|
355
|
-
|
356
|
-
c = data[te].chr
|
357
|
-
if c =~ /[\x00-\x7F]/
|
358
|
-
emit(:escape, :meta_sequence, copy(data, ts-1..te), ts-1, te+1)
|
359
|
-
p += 1
|
360
|
-
else
|
361
|
-
raise InvalidSequenceError.new("meta sequence")
|
362
|
-
end
|
363
|
-
else
|
364
|
-
raise PrematureEndError.new("meta sequence")
|
365
|
-
end
|
344
|
+
emit_meta_control_sequence(data, ts, te, :meta_sequence)
|
366
345
|
fret;
|
367
346
|
};
|
368
347
|
|
369
348
|
char_type_char > (escaped_alpha, 2) {
|
370
349
|
fhold;
|
371
|
-
fnext *(in_set ? fentry(character_set) : fentry(main));
|
350
|
+
fnext *(in_set? ? fentry(character_set) : fentry(main));
|
372
351
|
fcall char_type;
|
373
352
|
};
|
374
353
|
|
375
354
|
property_char > (escaped_alpha, 2) {
|
376
355
|
fhold;
|
377
|
-
fnext *(in_set ? fentry(character_set) : fentry(main));
|
356
|
+
fnext *(in_set? ? fentry(character_set) : fentry(main));
|
378
357
|
fcall unicode_property;
|
379
358
|
};
|
380
359
|
|
@@ -412,8 +391,7 @@
|
|
412
391
|
};
|
413
392
|
|
414
393
|
alternation {
|
415
|
-
if
|
416
|
-
conditional_stack.last[1] == group_depth
|
394
|
+
if conditional_stack.last == group_depth
|
417
395
|
emit(:conditional, :separator, *text(data, ts, te))
|
418
396
|
else
|
419
397
|
emit(:meta, :alternation, *text(data, ts, te))
|
@@ -442,18 +420,16 @@
|
|
442
420
|
when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
|
443
421
|
when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
|
444
422
|
when '\\G'; emit(:anchor, :match_start, text, ts, te)
|
445
|
-
else
|
446
|
-
raise ScannerError.new(
|
447
|
-
"Unexpected character in anchor at #{text} (char #{ts})")
|
448
423
|
end
|
449
424
|
};
|
450
425
|
|
426
|
+
literal_delimiters {
|
427
|
+
append_literal(data, ts, te)
|
428
|
+
};
|
429
|
+
|
451
430
|
# Character sets
|
452
431
|
# ------------------------------------------------------------------------
|
453
|
-
set_open {
|
454
|
-
set_depth += 1
|
455
|
-
in_set = true
|
456
|
-
|
432
|
+
set_open >set_opened {
|
457
433
|
emit(:set, :open, *text(data, ts, te))
|
458
434
|
fcall character_set;
|
459
435
|
};
|
@@ -465,9 +441,7 @@
|
|
465
441
|
conditional {
|
466
442
|
text = text(data, ts, te).first
|
467
443
|
|
468
|
-
|
469
|
-
conditional_depth += 1
|
470
|
-
conditional_stack << [conditional_depth, group_depth]
|
444
|
+
conditional_stack << group_depth
|
471
445
|
|
472
446
|
emit(:conditional, :open, text[0..-2], ts, te-1)
|
473
447
|
emit(:conditional, :condition_open, '(', te-1, te)
|
@@ -496,7 +470,11 @@
|
|
496
470
|
# (?imxdau-imx:subexp) option on/off for subexp
|
497
471
|
# ------------------------------------------------------------------------
|
498
472
|
group_open . group_options >group_opened {
|
499
|
-
|
473
|
+
text = text(data, ts, te).first
|
474
|
+
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
475
|
+
raise InvalidGroupOption.new($1 || "-#{$2}", text)
|
476
|
+
end
|
477
|
+
emit_options(text, ts, te)
|
500
478
|
};
|
501
479
|
|
502
480
|
# Assertions
|
@@ -528,19 +506,15 @@
|
|
528
506
|
when '(?>'; emit(:group, :atomic, text, ts, te)
|
529
507
|
when '(?~'; emit(:group, :absence, text, ts, te)
|
530
508
|
|
531
|
-
when /^\(
|
532
|
-
|
509
|
+
when /^\(\?(?:<>|'')/
|
510
|
+
validation_error(:group, 'named group', 'name is empty')
|
533
511
|
|
512
|
+
when /^\(\?<\w*>/
|
534
513
|
emit(:group, :named_ab, text, ts, te)
|
535
514
|
|
536
|
-
when /^\(\?'
|
537
|
-
empty_name_error(:group, 'named group (sq)') if $1.empty?
|
538
|
-
|
515
|
+
when /^\(\?'\w*'/
|
539
516
|
emit(:group, :named_sq, text, ts, te)
|
540
517
|
|
541
|
-
else
|
542
|
-
raise ScannerError.new(
|
543
|
-
"Unknown subexpression group format '#{text}'")
|
544
518
|
end
|
545
519
|
};
|
546
520
|
|
@@ -550,20 +524,13 @@
|
|
550
524
|
};
|
551
525
|
|
552
526
|
group_close @group_closed {
|
553
|
-
if
|
554
|
-
conditional_stack.last[1] == (group_depth + 1)
|
555
|
-
|
556
|
-
emit(:conditional, :close, *text(data, ts, te))
|
527
|
+
if conditional_stack.last == group_depth + 1
|
557
528
|
conditional_stack.pop
|
558
|
-
|
559
|
-
if conditional_stack.length == 0
|
560
|
-
in_conditional = false
|
561
|
-
end
|
529
|
+
emit(:conditional, :close, *text(data, ts, te))
|
562
530
|
else
|
563
|
-
if spacing_stack.length > 1
|
564
|
-
|
531
|
+
if spacing_stack.length > 1 &&
|
532
|
+
spacing_stack.last[:depth] == group_depth + 1
|
565
533
|
spacing_stack.pop
|
566
|
-
|
567
534
|
self.free_spacing = spacing_stack.last[:free_spacing]
|
568
535
|
end
|
569
536
|
|
@@ -576,11 +543,8 @@
|
|
576
543
|
# ------------------------------------------------------------------------
|
577
544
|
backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
|
578
545
|
case text = text(data, ts, te).first
|
579
|
-
when /^\\([gk])
|
580
|
-
|
581
|
-
|
582
|
-
when /^\\([gk])''/ # single quotes
|
583
|
-
empty_backref_error("ref/call (sq)")
|
546
|
+
when /^\\([gk])(<>|'')/ # angle brackets
|
547
|
+
validation_error(:backref, 'ref/call', 'ref ID is empty')
|
584
548
|
|
585
549
|
when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
|
586
550
|
if $1 == 'k'
|
@@ -636,9 +600,6 @@
|
|
636
600
|
when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
|
637
601
|
emit(:backref, :number_recursion_ref_sq, text, ts, te)
|
638
602
|
|
639
|
-
else
|
640
|
-
raise ScannerError.new(
|
641
|
-
"Unknown backreference format '#{text}'")
|
642
603
|
end
|
643
604
|
};
|
644
605
|
|
@@ -669,10 +630,15 @@
|
|
669
630
|
end
|
670
631
|
};
|
671
632
|
|
672
|
-
quantifier_interval
|
633
|
+
quantifier_interval {
|
673
634
|
emit(:quantifier, :interval, *text(data, ts, te))
|
674
635
|
};
|
675
636
|
|
637
|
+
# Catch unmatched curly braces as literals
|
638
|
+
range_open {
|
639
|
+
append_literal(data, ts, te)
|
640
|
+
};
|
641
|
+
|
676
642
|
# Escaped sequences
|
677
643
|
# ------------------------------------------------------------------------
|
678
644
|
backslash > (backslashed, 1) {
|
@@ -786,7 +752,7 @@ class Regexp::Scanner
|
|
786
752
|
input = input_object
|
787
753
|
self.free_spacing = false
|
788
754
|
end
|
789
|
-
|
755
|
+
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
790
756
|
|
791
757
|
data = input.unpack("c*") if input.is_a?(String)
|
792
758
|
eof = data.length
|
@@ -794,15 +760,9 @@ class Regexp::Scanner
|
|
794
760
|
self.tokens = []
|
795
761
|
self.block = block_given? ? block : nil
|
796
762
|
|
797
|
-
self.
|
763
|
+
self.set_depth = 0
|
798
764
|
self.group_depth = 0
|
799
|
-
self.
|
800
|
-
|
801
|
-
in_set = false
|
802
|
-
set_depth = 0
|
803
|
-
in_conditional = false
|
804
|
-
conditional_depth = 0
|
805
|
-
conditional_stack = []
|
765
|
+
self.conditional_stack = []
|
806
766
|
|
807
767
|
%% write data;
|
808
768
|
%% write init;
|
@@ -817,9 +777,9 @@ class Regexp::Scanner
|
|
817
777
|
end
|
818
778
|
|
819
779
|
raise PrematureEndError.new("(missing group closing paranthesis) "+
|
820
|
-
"[#{
|
780
|
+
"[#{group_depth}]") if in_group?
|
821
781
|
raise PrematureEndError.new("(missing set closing bracket) "+
|
822
|
-
"[#{
|
782
|
+
"[#{set_depth}]") if in_set?
|
823
783
|
|
824
784
|
# when the entire expression is a literal run
|
825
785
|
emit_literal if literal
|
@@ -854,62 +814,15 @@ class Regexp::Scanner
|
|
854
814
|
|
855
815
|
private
|
856
816
|
|
857
|
-
attr_accessor :tokens, :literal, :block,
|
858
|
-
:
|
859
|
-
:free_spacing, :spacing_stack
|
860
|
-
|
861
|
-
# Ragel's regex-based scan of the group options introduced a lot of
|
862
|
-
# ambiguity, so we just ask it to find the beginning of what looks
|
863
|
-
# like an options run and handle the rest in here.
|
864
|
-
def scan_options(p, data, ts, te)
|
865
|
-
text = text(data, ts, te).first
|
817
|
+
attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
|
818
|
+
:group_depth, :set_depth, :conditional_stack
|
866
819
|
|
867
|
-
|
868
|
-
|
869
|
-
|
870
|
-
# as ruby allows things like '(?xxxxxxxxx-xxxxxxxxxxxxx:abc)'.
|
871
|
-
negative_options = false
|
872
|
-
while options_char
|
873
|
-
if data[te + options_length]
|
874
|
-
c = data[te + options_length].chr
|
875
|
-
|
876
|
-
if c =~ /[-mixdau]/
|
877
|
-
negative_options = true if c == '-'
|
878
|
-
|
879
|
-
raise InvalidGroupOption.new(c, text) if negative_options and
|
880
|
-
c =~ /[dau]/
|
881
|
-
|
882
|
-
text << c ; p += 1 ; options_length += 1
|
883
|
-
else
|
884
|
-
options_char = false
|
885
|
-
end
|
886
|
-
else
|
887
|
-
raise PrematureEndError.new("expression options `#{text}'")
|
888
|
-
end
|
889
|
-
end
|
890
|
-
|
891
|
-
if data[te + options_length]
|
892
|
-
c = data[te + options_length].chr
|
893
|
-
|
894
|
-
if c == ':'
|
895
|
-
# Include the ':' in the options text
|
896
|
-
text << c ; p += 1 ; options_length += 1
|
897
|
-
emit_options(text, ts, te + options_length)
|
898
|
-
|
899
|
-
elsif c == ')'
|
900
|
-
# Don't include the closing ')', let group_close handle it.
|
901
|
-
emit_options(text, ts, te + options_length)
|
902
|
-
|
903
|
-
else
|
904
|
-
# Plain Regexp reports this as 'undefined group option'
|
905
|
-
raise ScannerError.new(
|
906
|
-
"Unexpected `#{c}' in options sequence, ':' or ')' expected")
|
907
|
-
end
|
908
|
-
else
|
909
|
-
raise PrematureEndError.new("expression options `#{text}'")
|
910
|
-
end
|
820
|
+
def in_group?
|
821
|
+
group_depth > 0
|
822
|
+
end
|
911
823
|
|
912
|
-
|
824
|
+
def in_set?
|
825
|
+
set_depth > 0
|
913
826
|
end
|
914
827
|
|
915
828
|
# Copy from ts to te from data as text
|
@@ -945,32 +858,39 @@ class Regexp::Scanner
|
|
945
858
|
def emit_options(text, ts, te)
|
946
859
|
token = nil
|
947
860
|
|
948
|
-
|
949
|
-
|
861
|
+
# Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
|
862
|
+
text =~ /\(\?([mixdau]*)(-(?:[mix]*))*(:)?/
|
863
|
+
positive, negative, group_local = $1, $2, $3
|
950
864
|
|
951
|
-
|
952
|
-
|
953
|
-
|
865
|
+
if positive.include?('x')
|
866
|
+
self.free_spacing = true
|
867
|
+
end
|
954
868
|
|
955
|
-
|
956
|
-
|
957
|
-
|
958
|
-
|
959
|
-
|
869
|
+
# If the x appears in both, treat it like ruby does, the second cancels
|
870
|
+
# the first.
|
871
|
+
if negative && negative.include?('x')
|
872
|
+
self.free_spacing = false
|
873
|
+
end
|
960
874
|
|
961
|
-
|
962
|
-
|
963
|
-
|
964
|
-
|
965
|
-
|
966
|
-
|
967
|
-
|
968
|
-
end
|
875
|
+
if group_local
|
876
|
+
spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
|
877
|
+
token = :options
|
878
|
+
else
|
879
|
+
# switch for parent group level
|
880
|
+
spacing_stack.last[:free_spacing] = free_spacing
|
881
|
+
token = :options_switch
|
969
882
|
end
|
970
883
|
|
971
884
|
emit(:group, token, text, ts, te)
|
972
885
|
end
|
973
886
|
|
887
|
+
def emit_meta_control_sequence(data, ts, te, token)
|
888
|
+
if data.last < 0x00 || data.last > 0x7F
|
889
|
+
validation_error(:sequence, 'escape', token.to_s)
|
890
|
+
end
|
891
|
+
emit(:escape, token, *text(data, ts, te, 1))
|
892
|
+
end
|
893
|
+
|
974
894
|
# Centralizes and unifies the handling of validation related
|
975
895
|
# errors.
|
976
896
|
def validation_error(type, what, reason)
|
@@ -981,21 +901,8 @@ class Regexp::Scanner
|
|
981
901
|
error = InvalidBackrefError.new(what, reason)
|
982
902
|
when :sequence
|
983
903
|
error = InvalidSequenceError.new(what, reason)
|
984
|
-
else
|
985
|
-
error = ValidationError.new('expression')
|
986
904
|
end
|
987
905
|
|
988
906
|
raise error # unless @@config.validation_ignore
|
989
907
|
end
|
990
|
-
|
991
|
-
# Used for references with an empty name or number
|
992
|
-
def empty_backref_error(type, what)
|
993
|
-
validation_error(:backref, what, 'ref ID is empty')
|
994
|
-
end
|
995
|
-
|
996
|
-
# Used for named expressions with an empty name
|
997
|
-
def empty_name_error(type, what)
|
998
|
-
validation_error(type, what, 'name is empty')
|
999
|
-
end
|
1000
|
-
|
1001
908
|
end # module Regexp::Scanner
|