regexp_parser 1.4.0 → 1.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +66 -1
- data/Gemfile +3 -3
- data/README.md +11 -18
- data/Rakefile +3 -4
- data/lib/regexp_parser/expression.rb +28 -53
- data/lib/regexp_parser/expression/classes/backref.rb +18 -10
- data/lib/regexp_parser/expression/classes/conditional.rb +7 -2
- data/lib/regexp_parser/expression/classes/escape.rb +0 -4
- data/lib/regexp_parser/expression/classes/group.rb +4 -2
- data/lib/regexp_parser/expression/classes/keep.rb +1 -3
- data/lib/regexp_parser/expression/methods/match.rb +13 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +172 -0
- data/lib/regexp_parser/expression/methods/options.rb +35 -0
- data/lib/regexp_parser/expression/methods/strfregexp.rb +0 -1
- data/lib/regexp_parser/expression/methods/tests.rb +6 -15
- data/lib/regexp_parser/expression/methods/traverse.rb +3 -1
- data/lib/regexp_parser/expression/quantifier.rb +2 -2
- data/lib/regexp_parser/expression/sequence.rb +3 -6
- data/lib/regexp_parser/expression/sequence_operation.rb +2 -6
- data/lib/regexp_parser/expression/subexpression.rb +3 -5
- data/lib/regexp_parser/lexer.rb +30 -44
- data/lib/regexp_parser/parser.rb +47 -24
- data/lib/regexp_parser/scanner.rb +1228 -1367
- data/lib/regexp_parser/scanner/char_type.rl +0 -3
- data/lib/regexp_parser/scanner/properties/long.yml +15 -1
- data/lib/regexp_parser/scanner/properties/short.yml +5 -0
- data/lib/regexp_parser/scanner/scanner.rl +101 -194
- data/lib/regexp_parser/syntax/tokens.rb +2 -10
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +30 -0
- data/lib/regexp_parser/syntax/versions/2.6.2.rb +10 -0
- data/lib/regexp_parser/syntax/versions/2.6.3.rb +10 -0
- data/lib/regexp_parser/version.rb +1 -1
- data/regexp_parser.gemspec +2 -2
- data/spec/expression/base_spec.rb +94 -0
- data/spec/expression/clone_spec.rb +120 -0
- data/spec/expression/conditional_spec.rb +89 -0
- data/spec/expression/free_space_spec.rb +27 -0
- data/spec/expression/methods/match_length_spec.rb +161 -0
- data/spec/expression/methods/match_spec.rb +25 -0
- data/spec/expression/methods/strfregexp_spec.rb +224 -0
- data/spec/expression/methods/tests_spec.rb +99 -0
- data/spec/expression/methods/traverse_spec.rb +161 -0
- data/spec/expression/options_spec.rb +128 -0
- data/spec/expression/root_spec.rb +9 -0
- data/spec/expression/sequence_spec.rb +9 -0
- data/spec/expression/subexpression_spec.rb +50 -0
- data/spec/expression/to_h_spec.rb +26 -0
- data/spec/expression/to_s_spec.rb +100 -0
- data/spec/lexer/all_spec.rb +22 -0
- data/spec/lexer/conditionals_spec.rb +53 -0
- data/spec/lexer/delimiters_spec.rb +68 -0
- data/spec/lexer/escapes_spec.rb +14 -0
- data/spec/lexer/keep_spec.rb +10 -0
- data/spec/lexer/literals_spec.rb +89 -0
- data/spec/lexer/nesting_spec.rb +99 -0
- data/spec/lexer/refcalls_spec.rb +55 -0
- data/spec/parser/all_spec.rb +43 -0
- data/spec/parser/alternation_spec.rb +88 -0
- data/spec/parser/anchors_spec.rb +17 -0
- data/spec/parser/conditionals_spec.rb +179 -0
- data/spec/parser/errors_spec.rb +30 -0
- data/spec/parser/escapes_spec.rb +121 -0
- data/spec/parser/free_space_spec.rb +130 -0
- data/spec/parser/groups_spec.rb +108 -0
- data/spec/parser/keep_spec.rb +6 -0
- data/spec/parser/posix_classes_spec.rb +8 -0
- data/spec/parser/properties_spec.rb +115 -0
- data/spec/parser/quantifiers_spec.rb +52 -0
- data/spec/parser/refcalls_spec.rb +112 -0
- data/spec/parser/set/intersections_spec.rb +127 -0
- data/spec/parser/set/ranges_spec.rb +111 -0
- data/spec/parser/sets_spec.rb +178 -0
- data/spec/parser/types_spec.rb +18 -0
- data/spec/scanner/all_spec.rb +18 -0
- data/spec/scanner/anchors_spec.rb +21 -0
- data/spec/scanner/conditionals_spec.rb +128 -0
- data/spec/scanner/delimiters_spec.rb +52 -0
- data/spec/scanner/errors_spec.rb +67 -0
- data/spec/scanner/escapes_spec.rb +53 -0
- data/spec/scanner/free_space_spec.rb +133 -0
- data/spec/scanner/groups_spec.rb +52 -0
- data/spec/scanner/keep_spec.rb +10 -0
- data/spec/scanner/literals_spec.rb +49 -0
- data/spec/scanner/meta_spec.rb +18 -0
- data/spec/scanner/properties_spec.rb +64 -0
- data/spec/scanner/quantifiers_spec.rb +20 -0
- data/spec/scanner/refcalls_spec.rb +36 -0
- data/spec/scanner/sets_spec.rb +102 -0
- data/spec/scanner/types_spec.rb +14 -0
- data/spec/spec_helper.rb +15 -0
- data/{test → spec}/support/runner.rb +9 -8
- data/spec/support/shared_examples.rb +77 -0
- data/{test → spec}/support/warning_extractor.rb +5 -7
- data/spec/syntax/syntax_spec.rb +48 -0
- data/spec/syntax/syntax_token_map_spec.rb +23 -0
- data/spec/syntax/versions/1.8.6_spec.rb +17 -0
- data/spec/syntax/versions/1.9.1_spec.rb +10 -0
- data/spec/syntax/versions/1.9.3_spec.rb +9 -0
- data/spec/syntax/versions/2.0.0_spec.rb +13 -0
- data/spec/syntax/versions/2.2.0_spec.rb +9 -0
- data/spec/syntax/versions/aliases_spec.rb +37 -0
- data/spec/token/token_spec.rb +85 -0
- metadata +149 -144
- data/test/expression/test_all.rb +0 -12
- data/test/expression/test_base.rb +0 -90
- data/test/expression/test_clone.rb +0 -89
- data/test/expression/test_conditionals.rb +0 -113
- data/test/expression/test_free_space.rb +0 -35
- data/test/expression/test_set.rb +0 -84
- data/test/expression/test_strfregexp.rb +0 -230
- data/test/expression/test_subexpression.rb +0 -58
- data/test/expression/test_tests.rb +0 -99
- data/test/expression/test_to_h.rb +0 -59
- data/test/expression/test_to_s.rb +0 -104
- data/test/expression/test_traverse.rb +0 -161
- data/test/helpers.rb +0 -10
- data/test/lexer/test_all.rb +0 -41
- data/test/lexer/test_conditionals.rb +0 -127
- data/test/lexer/test_keep.rb +0 -24
- data/test/lexer/test_literals.rb +0 -130
- data/test/lexer/test_nesting.rb +0 -132
- data/test/lexer/test_refcalls.rb +0 -56
- data/test/parser/set/test_intersections.rb +0 -127
- data/test/parser/set/test_ranges.rb +0 -111
- data/test/parser/test_all.rb +0 -64
- data/test/parser/test_alternation.rb +0 -92
- data/test/parser/test_anchors.rb +0 -34
- data/test/parser/test_conditionals.rb +0 -187
- data/test/parser/test_errors.rb +0 -63
- data/test/parser/test_escapes.rb +0 -134
- data/test/parser/test_free_space.rb +0 -139
- data/test/parser/test_groups.rb +0 -289
- data/test/parser/test_keep.rb +0 -21
- data/test/parser/test_posix_classes.rb +0 -27
- data/test/parser/test_properties.rb +0 -134
- data/test/parser/test_quantifiers.rb +0 -301
- data/test/parser/test_refcalls.rb +0 -186
- data/test/parser/test_sets.rb +0 -179
- data/test/parser/test_types.rb +0 -50
- data/test/scanner/test_all.rb +0 -38
- data/test/scanner/test_anchors.rb +0 -38
- data/test/scanner/test_conditionals.rb +0 -184
- data/test/scanner/test_errors.rb +0 -91
- data/test/scanner/test_escapes.rb +0 -56
- data/test/scanner/test_free_space.rb +0 -200
- data/test/scanner/test_groups.rb +0 -79
- data/test/scanner/test_keep.rb +0 -35
- data/test/scanner/test_literals.rb +0 -89
- data/test/scanner/test_meta.rb +0 -40
- data/test/scanner/test_properties.rb +0 -312
- data/test/scanner/test_quantifiers.rb +0 -37
- data/test/scanner/test_refcalls.rb +0 -52
- data/test/scanner/test_scripts.rb +0 -53
- data/test/scanner/test_sets.rb +0 -119
- data/test/scanner/test_types.rb +0 -35
- data/test/scanner/test_unicode_blocks.rb +0 -30
- data/test/support/disable_autotest.rb +0 -8
- data/test/syntax/test_all.rb +0 -6
- data/test/syntax/test_syntax.rb +0 -61
- data/test/syntax/test_syntax_token_map.rb +0 -25
- data/test/syntax/versions/test_1.8.rb +0 -55
- data/test/syntax/versions/test_1.9.1.rb +0 -36
- data/test/syntax/versions/test_1.9.3.rb +0 -32
- data/test/syntax/versions/test_2.0.0.rb +0 -37
- data/test/syntax/versions/test_2.2.0.rb +0 -32
- data/test/syntax/versions/test_aliases.rb +0 -129
- data/test/syntax/versions/test_all.rb +0 -5
- data/test/test_all.rb +0 -5
- data/test/token/test_all.rb +0 -2
- data/test/token/test_token.rb +0 -107
@@ -21,9 +21,6 @@
|
|
21
21
|
when '\W'; emit(:type, :nonword, text, ts - 1, te)
|
22
22
|
when '\R'; emit(:type, :linebreak, text, ts - 1, te)
|
23
23
|
when '\X'; emit(:type, :xgrapheme, text, ts - 1, te)
|
24
|
-
else
|
25
|
-
raise ScannerError.new(
|
26
|
-
"Unexpected character in type at #{text} (char #{ts})")
|
27
24
|
end
|
28
25
|
fret;
|
29
26
|
};
|
@@ -6,6 +6,8 @@ adlam: adlam
|
|
6
6
|
age=1.1: age=1.1
|
7
7
|
age=10.0: age=10.0
|
8
8
|
age=11.0: age=11.0
|
9
|
+
age=12.0: age=12.0
|
10
|
+
age=12.1: age=12.1
|
9
11
|
age=2.0: age=2.0
|
10
12
|
age=2.1: age=2.1
|
11
13
|
age=3.0: age=3.0
|
@@ -64,7 +66,6 @@ changeswhenuppercased: changes_when_uppercased
|
|
64
66
|
cherokee: cherokee
|
65
67
|
closepunctuation: close_punctuation
|
66
68
|
cntrl: cntrl
|
67
|
-
combiningmark: combining_mark
|
68
69
|
common: common
|
69
70
|
connectorpunctuation: connector_punctuation
|
70
71
|
control: control
|
@@ -86,6 +87,7 @@ dogra: dogra
|
|
86
87
|
duployan: duployan
|
87
88
|
egyptianhieroglyphs: egyptian_hieroglyphs
|
88
89
|
elbasan: elbasan
|
90
|
+
elymaic: elymaic
|
89
91
|
emoji: emoji
|
90
92
|
emojicomponent: emoji_component
|
91
93
|
emojimodifier: emoji_modifier
|
@@ -206,8 +208,10 @@ indogra: in_dogra
|
|
206
208
|
indominotiles: in_domino_tiles
|
207
209
|
induployan: in_duployan
|
208
210
|
inearlydynasticcuneiform: in_early_dynastic_cuneiform
|
211
|
+
inegyptianhieroglyphformatcontrols: in_egyptian_hieroglyph_format_controls
|
209
212
|
inegyptianhieroglyphs: in_egyptian_hieroglyphs
|
210
213
|
inelbasan: in_elbasan
|
214
|
+
inelymaic: in_elymaic
|
211
215
|
inemoticons: in_emoticons
|
212
216
|
inenclosedalphanumerics: in_enclosed_alphanumerics
|
213
217
|
inenclosedalphanumericsupplement: in_enclosed_alphanumeric_supplement
|
@@ -322,12 +326,14 @@ inmyanmar: in_myanmar
|
|
322
326
|
inmyanmarextendeda: in_myanmar_extended_a
|
323
327
|
inmyanmarextendedb: in_myanmar_extended_b
|
324
328
|
innabataean: in_nabataean
|
329
|
+
innandinagari: in_nandinagari
|
325
330
|
innewa: in_newa
|
326
331
|
innewtailue: in_new_tai_lue
|
327
332
|
innko: in_nko
|
328
333
|
innoblock: in_no_block
|
329
334
|
innumberforms: in_number_forms
|
330
335
|
innushu: in_nushu
|
336
|
+
innyiakengpuachuehmong: in_nyiakeng_puachue_hmong
|
331
337
|
inogham: in_ogham
|
332
338
|
inolchiki: in_ol_chiki
|
333
339
|
inoldhungarian: in_old_hungarian
|
@@ -343,6 +349,7 @@ inoriya: in_oriya
|
|
343
349
|
inornamentaldingbats: in_ornamental_dingbats
|
344
350
|
inosage: in_osage
|
345
351
|
inosmanya: in_osmanya
|
352
|
+
inottomansiyaqnumbers: in_ottoman_siyaq_numbers
|
346
353
|
inpahawhhmong: in_pahawh_hmong
|
347
354
|
inpalmyrene: in_palmyrene
|
348
355
|
inpaucinhau: in_pau_cin_hau
|
@@ -368,6 +375,7 @@ insiddham: in_siddham
|
|
368
375
|
insinhala: in_sinhala
|
369
376
|
insinhalaarchaicnumbers: in_sinhala_archaic_numbers
|
370
377
|
insmallformvariants: in_small_form_variants
|
378
|
+
insmallkanaextension: in_small_kana_extension
|
371
379
|
insogdian: in_sogdian
|
372
380
|
insorasompeng: in_sora_sompeng
|
373
381
|
insoyombo: in_soyombo
|
@@ -386,6 +394,7 @@ insupplementaryprivateuseareaa: in_supplementary_private_use_area_a
|
|
386
394
|
insupplementaryprivateuseareab: in_supplementary_private_use_area_b
|
387
395
|
insuttonsignwriting: in_sutton_signwriting
|
388
396
|
insylotinagri: in_syloti_nagri
|
397
|
+
insymbolsandpictographsextendeda: in_symbols_and_pictographs_extended_a
|
389
398
|
insyriac: in_syriac
|
390
399
|
insyriacsupplement: in_syriac_supplement
|
391
400
|
intagalog: in_tagalog
|
@@ -397,6 +406,7 @@ intaiviet: in_tai_viet
|
|
397
406
|
intaixuanjingsymbols: in_tai_xuan_jing_symbols
|
398
407
|
intakri: in_takri
|
399
408
|
intamil: in_tamil
|
409
|
+
intamilsupplement: in_tamil_supplement
|
400
410
|
intangut: in_tangut
|
401
411
|
intangutcomponents: in_tangut_components
|
402
412
|
intelugu: in_telugu
|
@@ -414,6 +424,7 @@ invariationselectors: in_variation_selectors
|
|
414
424
|
invariationselectorssupplement: in_variation_selectors_supplement
|
415
425
|
invedicextensions: in_vedic_extensions
|
416
426
|
inverticalforms: in_vertical_forms
|
427
|
+
inwancho: in_wancho
|
417
428
|
inwarangciti: in_warang_citi
|
418
429
|
inyijinghexagramsymbols: in_yijing_hexagram_symbols
|
419
430
|
inyiradicals: in_yi_radicals
|
@@ -469,6 +480,7 @@ mro: mro
|
|
469
480
|
multani: multani
|
470
481
|
myanmar: myanmar
|
471
482
|
nabataean: nabataean
|
483
|
+
nandinagari: nandinagari
|
472
484
|
newa: newa
|
473
485
|
newline: newline
|
474
486
|
newtailue: new_tai_lue
|
@@ -477,6 +489,7 @@ noncharactercodepoint: noncharacter_code_point
|
|
477
489
|
nonspacingmark: nonspacing_mark
|
478
490
|
number: number
|
479
491
|
nushu: nushu
|
492
|
+
nyiakengpuachuehmong: nyiakeng_puachue_hmong
|
480
493
|
ogham: ogham
|
481
494
|
olchiki: ol_chiki
|
482
495
|
oldhungarian: old_hungarian
|
@@ -569,6 +582,7 @@ uppercase: uppercase
|
|
569
582
|
uppercaseletter: uppercase_letter
|
570
583
|
vai: vai
|
571
584
|
variationselector: variation_selector
|
585
|
+
wancho: wancho
|
572
586
|
warangciti: warang_citi
|
573
587
|
whitespace: white_space
|
574
588
|
word: word
|
@@ -31,6 +31,7 @@ cher: cherokee
|
|
31
31
|
ci: case_ignorable
|
32
32
|
cn: unassigned
|
33
33
|
co: private_use
|
34
|
+
combiningmark: mark
|
34
35
|
copt: coptic
|
35
36
|
cprt: cypriot
|
36
37
|
cs: surrogate
|
@@ -49,6 +50,7 @@ dsrt: deseret
|
|
49
50
|
dupl: duployan
|
50
51
|
egyp: egyptian_hieroglyphs
|
51
52
|
elba: elbasan
|
53
|
+
elym: elymaic
|
52
54
|
ethi: ethiopic
|
53
55
|
ext: extender
|
54
56
|
geor: georgian
|
@@ -72,6 +74,7 @@ hex: hex_digit
|
|
72
74
|
hira: hiragana
|
73
75
|
hluw: anatolian_hieroglyphs
|
74
76
|
hmng: pahawh_hmong
|
77
|
+
hmnp: nyiakeng_puachue_hmong
|
75
78
|
hung: old_hungarian
|
76
79
|
idc: id_continue
|
77
80
|
ideo: ideographic
|
@@ -125,6 +128,7 @@ mtei: meetei_mayek
|
|
125
128
|
mult: multani
|
126
129
|
mymr: myanmar
|
127
130
|
n: number
|
131
|
+
nand: nandinagari
|
128
132
|
narb: old_north_arabian
|
129
133
|
nbat: nabataean
|
130
134
|
nchar: noncharacter_code_point
|
@@ -216,6 +220,7 @@ uideo: unified_ideograph
|
|
216
220
|
vaii: vai
|
217
221
|
vs: variation_selector
|
218
222
|
wara: warang_citi
|
223
|
+
wcho: wancho
|
219
224
|
wspace: white_space
|
220
225
|
xidc: xid_continue
|
221
226
|
xids: xid_start
|
@@ -49,9 +49,9 @@
|
|
49
49
|
codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
|
50
50
|
codepoint_sequence = codepoint_single | codepoint_list;
|
51
51
|
|
52
|
-
control_sequence = ('c' | 'C-') . (backslash . 'M-')
|
52
|
+
control_sequence = ('c' | 'C-') . (backslash . 'M-')? . backslash? . any;
|
53
53
|
|
54
|
-
meta_sequence = 'M-' . (backslash .
|
54
|
+
meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
|
55
55
|
|
56
56
|
zero_or_one = '?' | '??' | '?+';
|
57
57
|
zero_or_more = '*' | '*?' | '*+';
|
@@ -62,13 +62,17 @@
|
|
62
62
|
quantifier_possessive = '?+' | '*+' | '++';
|
63
63
|
quantifier_mode = '?' | '+';
|
64
64
|
|
65
|
-
|
66
|
-
|
65
|
+
quantity_exact = (digit+);
|
66
|
+
quantity_minimum = (digit+) . ',';
|
67
|
+
quantity_maximum = ',' . (digit+);
|
68
|
+
quantity_range = (digit+) . ',' . (digit+);
|
69
|
+
quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
|
70
|
+
quantity_maximum | quantity_range ) . range_close .
|
71
|
+
quantifier_mode?;
|
67
72
|
|
68
73
|
quantifiers = quantifier_greedy | quantifier_reluctant |
|
69
74
|
quantifier_possessive | quantifier_interval;
|
70
75
|
|
71
|
-
|
72
76
|
conditional = '(?(';
|
73
77
|
|
74
78
|
group_comment = '?#' . [^)]* . group_close;
|
@@ -82,7 +86,8 @@
|
|
82
86
|
assertion_lookbehind = '?<=';
|
83
87
|
assertion_nlookbehind = '?<!';
|
84
88
|
|
85
|
-
|
89
|
+
# try to treat every other group head as options group, like Ruby
|
90
|
+
group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
|
86
91
|
|
87
92
|
group_ref = [gk];
|
88
93
|
group_name_char = (alnum | '_');
|
@@ -113,6 +118,8 @@
|
|
113
118
|
curlies | parantheses | brackets |
|
114
119
|
line_anchor | quantifier_greedy;
|
115
120
|
|
121
|
+
literal_delimiters = ']' | '}';
|
122
|
+
|
116
123
|
ascii_print = ((0x20..0x7e) - meta_char);
|
117
124
|
ascii_nonprint = (0x01..0x1f | 0x7f);
|
118
125
|
|
@@ -135,41 +142,35 @@
|
|
135
142
|
# Invalid sequence error, used from sequences, like escapes and sets
|
136
143
|
action invalid_sequence_error {
|
137
144
|
text = ts ? copy(data, ts-1..-1) : data.pack('c*')
|
138
|
-
|
145
|
+
validation_error(:sequence, 'sequence', text)
|
139
146
|
}
|
140
147
|
|
141
148
|
# group (nesting) and set open/close actions
|
142
|
-
action group_opened { self.group_depth = group_depth + 1
|
143
|
-
action group_closed { self.group_depth = group_depth - 1
|
149
|
+
action group_opened { self.group_depth = group_depth + 1 }
|
150
|
+
action group_closed { self.group_depth = group_depth - 1 }
|
151
|
+
action set_opened { self.set_depth = set_depth + 1 }
|
152
|
+
action set_closed { self.set_depth = set_depth - 1 }
|
144
153
|
|
145
154
|
# Character set scanner, continues consuming characters until it meets the
|
146
155
|
# closing bracket of the set.
|
147
156
|
# --------------------------------------------------------------------------
|
148
157
|
character_set := |*
|
149
|
-
set_close > (set_meta, 2) {
|
150
|
-
set_depth -= 1
|
151
|
-
in_set = set_depth > 0 ? true : false
|
152
|
-
|
158
|
+
set_close > (set_meta, 2) @set_closed {
|
153
159
|
emit(:set, :close, *text(data, ts, te))
|
154
|
-
|
155
|
-
if set_depth == 0
|
156
|
-
fgoto main;
|
157
|
-
else
|
160
|
+
if in_set?
|
158
161
|
fret;
|
162
|
+
else
|
163
|
+
fgoto main;
|
159
164
|
end
|
160
165
|
};
|
161
166
|
|
162
|
-
'-]' { # special case, emits two tokens
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
emit(:literal, :literal, copy(data, ts..te-2), ts, te)
|
167
|
-
emit(:set, :close, copy(data, ts+1..te-1), ts, te)
|
168
|
-
|
169
|
-
if set_depth == 0
|
170
|
-
fgoto main;
|
171
|
-
else
|
167
|
+
'-]' @set_closed { # special case, emits two tokens
|
168
|
+
emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
|
169
|
+
emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
|
170
|
+
if in_set?
|
172
171
|
fret;
|
172
|
+
else
|
173
|
+
fgoto main;
|
173
174
|
end
|
174
175
|
};
|
175
176
|
|
@@ -207,14 +208,12 @@
|
|
207
208
|
fcall set_escape_sequence;
|
208
209
|
};
|
209
210
|
|
210
|
-
set_open >(open_bracket, 1) {
|
211
|
-
set_depth += 1
|
212
|
-
|
211
|
+
set_open >(open_bracket, 1) >set_opened {
|
213
212
|
emit(:set, :open, *text(data, ts, te))
|
214
213
|
fcall character_set;
|
215
214
|
};
|
216
215
|
|
217
|
-
class_posix >(open_bracket, 1) @eof(premature_end_error)
|
216
|
+
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
218
217
|
text = text(data, ts, te).first
|
219
218
|
|
220
219
|
type = :posixclass
|
@@ -227,11 +226,11 @@
|
|
227
226
|
emit(type, class_name.to_sym, text, ts, te)
|
228
227
|
};
|
229
228
|
|
230
|
-
collating_sequence >(open_bracket, 1) @eof(premature_end_error)
|
229
|
+
collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
231
230
|
emit(:set, :collation, *text(data, ts, te))
|
232
231
|
};
|
233
232
|
|
234
|
-
character_equivalent >(open_bracket, 1) @eof(premature_end_error)
|
233
|
+
character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
235
234
|
emit(:set, :equivalent, *text(data, ts, te))
|
236
235
|
};
|
237
236
|
|
@@ -337,44 +336,24 @@
|
|
337
336
|
};
|
338
337
|
|
339
338
|
control_sequence >(escaped_alpha, 4) $eof(premature_end_error) {
|
340
|
-
|
341
|
-
c = data[te].chr
|
342
|
-
if c =~ /[\x00-\x7F]/
|
343
|
-
emit(:escape, :control, copy(data, ts-1..te), ts-1, te+1)
|
344
|
-
p += 1
|
345
|
-
else
|
346
|
-
raise InvalidSequenceError.new("control sequence")
|
347
|
-
end
|
348
|
-
else
|
349
|
-
raise PrematureEndError.new("control sequence")
|
350
|
-
end
|
339
|
+
emit_meta_control_sequence(data, ts, te, :control)
|
351
340
|
fret;
|
352
341
|
};
|
353
342
|
|
354
343
|
meta_sequence >(backslashed, 3) $eof(premature_end_error) {
|
355
|
-
|
356
|
-
c = data[te].chr
|
357
|
-
if c =~ /[\x00-\x7F]/
|
358
|
-
emit(:escape, :meta_sequence, copy(data, ts-1..te), ts-1, te+1)
|
359
|
-
p += 1
|
360
|
-
else
|
361
|
-
raise InvalidSequenceError.new("meta sequence")
|
362
|
-
end
|
363
|
-
else
|
364
|
-
raise PrematureEndError.new("meta sequence")
|
365
|
-
end
|
344
|
+
emit_meta_control_sequence(data, ts, te, :meta_sequence)
|
366
345
|
fret;
|
367
346
|
};
|
368
347
|
|
369
348
|
char_type_char > (escaped_alpha, 2) {
|
370
349
|
fhold;
|
371
|
-
fnext *(in_set ? fentry(character_set) : fentry(main));
|
350
|
+
fnext *(in_set? ? fentry(character_set) : fentry(main));
|
372
351
|
fcall char_type;
|
373
352
|
};
|
374
353
|
|
375
354
|
property_char > (escaped_alpha, 2) {
|
376
355
|
fhold;
|
377
|
-
fnext *(in_set ? fentry(character_set) : fentry(main));
|
356
|
+
fnext *(in_set? ? fentry(character_set) : fentry(main));
|
378
357
|
fcall unicode_property;
|
379
358
|
};
|
380
359
|
|
@@ -412,8 +391,7 @@
|
|
412
391
|
};
|
413
392
|
|
414
393
|
alternation {
|
415
|
-
if
|
416
|
-
conditional_stack.last[1] == group_depth
|
394
|
+
if conditional_stack.last == group_depth
|
417
395
|
emit(:conditional, :separator, *text(data, ts, te))
|
418
396
|
else
|
419
397
|
emit(:meta, :alternation, *text(data, ts, te))
|
@@ -442,18 +420,16 @@
|
|
442
420
|
when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
|
443
421
|
when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
|
444
422
|
when '\\G'; emit(:anchor, :match_start, text, ts, te)
|
445
|
-
else
|
446
|
-
raise ScannerError.new(
|
447
|
-
"Unexpected character in anchor at #{text} (char #{ts})")
|
448
423
|
end
|
449
424
|
};
|
450
425
|
|
426
|
+
literal_delimiters {
|
427
|
+
append_literal(data, ts, te)
|
428
|
+
};
|
429
|
+
|
451
430
|
# Character sets
|
452
431
|
# ------------------------------------------------------------------------
|
453
|
-
set_open {
|
454
|
-
set_depth += 1
|
455
|
-
in_set = true
|
456
|
-
|
432
|
+
set_open >set_opened {
|
457
433
|
emit(:set, :open, *text(data, ts, te))
|
458
434
|
fcall character_set;
|
459
435
|
};
|
@@ -465,9 +441,7 @@
|
|
465
441
|
conditional {
|
466
442
|
text = text(data, ts, te).first
|
467
443
|
|
468
|
-
|
469
|
-
conditional_depth += 1
|
470
|
-
conditional_stack << [conditional_depth, group_depth]
|
444
|
+
conditional_stack << group_depth
|
471
445
|
|
472
446
|
emit(:conditional, :open, text[0..-2], ts, te-1)
|
473
447
|
emit(:conditional, :condition_open, '(', te-1, te)
|
@@ -496,7 +470,11 @@
|
|
496
470
|
# (?imxdau-imx:subexp) option on/off for subexp
|
497
471
|
# ------------------------------------------------------------------------
|
498
472
|
group_open . group_options >group_opened {
|
499
|
-
|
473
|
+
text = text(data, ts, te).first
|
474
|
+
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
475
|
+
raise InvalidGroupOption.new($1 || "-#{$2}", text)
|
476
|
+
end
|
477
|
+
emit_options(text, ts, te)
|
500
478
|
};
|
501
479
|
|
502
480
|
# Assertions
|
@@ -528,19 +506,15 @@
|
|
528
506
|
when '(?>'; emit(:group, :atomic, text, ts, te)
|
529
507
|
when '(?~'; emit(:group, :absence, text, ts, te)
|
530
508
|
|
531
|
-
when /^\(
|
532
|
-
|
509
|
+
when /^\(\?(?:<>|'')/
|
510
|
+
validation_error(:group, 'named group', 'name is empty')
|
533
511
|
|
512
|
+
when /^\(\?<\w*>/
|
534
513
|
emit(:group, :named_ab, text, ts, te)
|
535
514
|
|
536
|
-
when /^\(\?'
|
537
|
-
empty_name_error(:group, 'named group (sq)') if $1.empty?
|
538
|
-
|
515
|
+
when /^\(\?'\w*'/
|
539
516
|
emit(:group, :named_sq, text, ts, te)
|
540
517
|
|
541
|
-
else
|
542
|
-
raise ScannerError.new(
|
543
|
-
"Unknown subexpression group format '#{text}'")
|
544
518
|
end
|
545
519
|
};
|
546
520
|
|
@@ -550,20 +524,13 @@
|
|
550
524
|
};
|
551
525
|
|
552
526
|
group_close @group_closed {
|
553
|
-
if
|
554
|
-
conditional_stack.last[1] == (group_depth + 1)
|
555
|
-
|
556
|
-
emit(:conditional, :close, *text(data, ts, te))
|
527
|
+
if conditional_stack.last == group_depth + 1
|
557
528
|
conditional_stack.pop
|
558
|
-
|
559
|
-
if conditional_stack.length == 0
|
560
|
-
in_conditional = false
|
561
|
-
end
|
529
|
+
emit(:conditional, :close, *text(data, ts, te))
|
562
530
|
else
|
563
|
-
if spacing_stack.length > 1
|
564
|
-
|
531
|
+
if spacing_stack.length > 1 &&
|
532
|
+
spacing_stack.last[:depth] == group_depth + 1
|
565
533
|
spacing_stack.pop
|
566
|
-
|
567
534
|
self.free_spacing = spacing_stack.last[:free_spacing]
|
568
535
|
end
|
569
536
|
|
@@ -576,11 +543,8 @@
|
|
576
543
|
# ------------------------------------------------------------------------
|
577
544
|
backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
|
578
545
|
case text = text(data, ts, te).first
|
579
|
-
when /^\\([gk])
|
580
|
-
|
581
|
-
|
582
|
-
when /^\\([gk])''/ # single quotes
|
583
|
-
empty_backref_error("ref/call (sq)")
|
546
|
+
when /^\\([gk])(<>|'')/ # angle brackets
|
547
|
+
validation_error(:backref, 'ref/call', 'ref ID is empty')
|
584
548
|
|
585
549
|
when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
|
586
550
|
if $1 == 'k'
|
@@ -636,9 +600,6 @@
|
|
636
600
|
when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
|
637
601
|
emit(:backref, :number_recursion_ref_sq, text, ts, te)
|
638
602
|
|
639
|
-
else
|
640
|
-
raise ScannerError.new(
|
641
|
-
"Unknown backreference format '#{text}'")
|
642
603
|
end
|
643
604
|
};
|
644
605
|
|
@@ -669,10 +630,15 @@
|
|
669
630
|
end
|
670
631
|
};
|
671
632
|
|
672
|
-
quantifier_interval
|
633
|
+
quantifier_interval {
|
673
634
|
emit(:quantifier, :interval, *text(data, ts, te))
|
674
635
|
};
|
675
636
|
|
637
|
+
# Catch unmatched curly braces as literals
|
638
|
+
range_open {
|
639
|
+
append_literal(data, ts, te)
|
640
|
+
};
|
641
|
+
|
676
642
|
# Escaped sequences
|
677
643
|
# ------------------------------------------------------------------------
|
678
644
|
backslash > (backslashed, 1) {
|
@@ -786,7 +752,7 @@ class Regexp::Scanner
|
|
786
752
|
input = input_object
|
787
753
|
self.free_spacing = false
|
788
754
|
end
|
789
|
-
|
755
|
+
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
790
756
|
|
791
757
|
data = input.unpack("c*") if input.is_a?(String)
|
792
758
|
eof = data.length
|
@@ -794,15 +760,9 @@ class Regexp::Scanner
|
|
794
760
|
self.tokens = []
|
795
761
|
self.block = block_given? ? block : nil
|
796
762
|
|
797
|
-
self.
|
763
|
+
self.set_depth = 0
|
798
764
|
self.group_depth = 0
|
799
|
-
self.
|
800
|
-
|
801
|
-
in_set = false
|
802
|
-
set_depth = 0
|
803
|
-
in_conditional = false
|
804
|
-
conditional_depth = 0
|
805
|
-
conditional_stack = []
|
765
|
+
self.conditional_stack = []
|
806
766
|
|
807
767
|
%% write data;
|
808
768
|
%% write init;
|
@@ -817,9 +777,9 @@ class Regexp::Scanner
|
|
817
777
|
end
|
818
778
|
|
819
779
|
raise PrematureEndError.new("(missing group closing paranthesis) "+
|
820
|
-
"[#{
|
780
|
+
"[#{group_depth}]") if in_group?
|
821
781
|
raise PrematureEndError.new("(missing set closing bracket) "+
|
822
|
-
"[#{
|
782
|
+
"[#{set_depth}]") if in_set?
|
823
783
|
|
824
784
|
# when the entire expression is a literal run
|
825
785
|
emit_literal if literal
|
@@ -854,62 +814,15 @@ class Regexp::Scanner
|
|
854
814
|
|
855
815
|
private
|
856
816
|
|
857
|
-
attr_accessor :tokens, :literal, :block,
|
858
|
-
:
|
859
|
-
:free_spacing, :spacing_stack
|
860
|
-
|
861
|
-
# Ragel's regex-based scan of the group options introduced a lot of
|
862
|
-
# ambiguity, so we just ask it to find the beginning of what looks
|
863
|
-
# like an options run and handle the rest in here.
|
864
|
-
def scan_options(p, data, ts, te)
|
865
|
-
text = text(data, ts, te).first
|
817
|
+
attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
|
818
|
+
:group_depth, :set_depth, :conditional_stack
|
866
819
|
|
867
|
-
|
868
|
-
|
869
|
-
|
870
|
-
# as ruby allows things like '(?xxxxxxxxx-xxxxxxxxxxxxx:abc)'.
|
871
|
-
negative_options = false
|
872
|
-
while options_char
|
873
|
-
if data[te + options_length]
|
874
|
-
c = data[te + options_length].chr
|
875
|
-
|
876
|
-
if c =~ /[-mixdau]/
|
877
|
-
negative_options = true if c == '-'
|
878
|
-
|
879
|
-
raise InvalidGroupOption.new(c, text) if negative_options and
|
880
|
-
c =~ /[dau]/
|
881
|
-
|
882
|
-
text << c ; p += 1 ; options_length += 1
|
883
|
-
else
|
884
|
-
options_char = false
|
885
|
-
end
|
886
|
-
else
|
887
|
-
raise PrematureEndError.new("expression options `#{text}'")
|
888
|
-
end
|
889
|
-
end
|
890
|
-
|
891
|
-
if data[te + options_length]
|
892
|
-
c = data[te + options_length].chr
|
893
|
-
|
894
|
-
if c == ':'
|
895
|
-
# Include the ':' in the options text
|
896
|
-
text << c ; p += 1 ; options_length += 1
|
897
|
-
emit_options(text, ts, te + options_length)
|
898
|
-
|
899
|
-
elsif c == ')'
|
900
|
-
# Don't include the closing ')', let group_close handle it.
|
901
|
-
emit_options(text, ts, te + options_length)
|
902
|
-
|
903
|
-
else
|
904
|
-
# Plain Regexp reports this as 'undefined group option'
|
905
|
-
raise ScannerError.new(
|
906
|
-
"Unexpected `#{c}' in options sequence, ':' or ')' expected")
|
907
|
-
end
|
908
|
-
else
|
909
|
-
raise PrematureEndError.new("expression options `#{text}'")
|
910
|
-
end
|
820
|
+
def in_group?
|
821
|
+
group_depth > 0
|
822
|
+
end
|
911
823
|
|
912
|
-
|
824
|
+
def in_set?
|
825
|
+
set_depth > 0
|
913
826
|
end
|
914
827
|
|
915
828
|
# Copy from ts to te from data as text
|
@@ -945,32 +858,39 @@ class Regexp::Scanner
|
|
945
858
|
def emit_options(text, ts, te)
|
946
859
|
token = nil
|
947
860
|
|
948
|
-
|
949
|
-
|
861
|
+
# Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
|
862
|
+
text =~ /\(\?([mixdau]*)(-(?:[mix]*))*(:)?/
|
863
|
+
positive, negative, group_local = $1, $2, $3
|
950
864
|
|
951
|
-
|
952
|
-
|
953
|
-
|
865
|
+
if positive.include?('x')
|
866
|
+
self.free_spacing = true
|
867
|
+
end
|
954
868
|
|
955
|
-
|
956
|
-
|
957
|
-
|
958
|
-
|
959
|
-
|
869
|
+
# If the x appears in both, treat it like ruby does, the second cancels
|
870
|
+
# the first.
|
871
|
+
if negative && negative.include?('x')
|
872
|
+
self.free_spacing = false
|
873
|
+
end
|
960
874
|
|
961
|
-
|
962
|
-
|
963
|
-
|
964
|
-
|
965
|
-
|
966
|
-
|
967
|
-
|
968
|
-
end
|
875
|
+
if group_local
|
876
|
+
spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
|
877
|
+
token = :options
|
878
|
+
else
|
879
|
+
# switch for parent group level
|
880
|
+
spacing_stack.last[:free_spacing] = free_spacing
|
881
|
+
token = :options_switch
|
969
882
|
end
|
970
883
|
|
971
884
|
emit(:group, token, text, ts, te)
|
972
885
|
end
|
973
886
|
|
887
|
+
def emit_meta_control_sequence(data, ts, te, token)
|
888
|
+
if data.last < 0x00 || data.last > 0x7F
|
889
|
+
validation_error(:sequence, 'escape', token.to_s)
|
890
|
+
end
|
891
|
+
emit(:escape, token, *text(data, ts, te, 1))
|
892
|
+
end
|
893
|
+
|
974
894
|
# Centralizes and unifies the handling of validation related
|
975
895
|
# errors.
|
976
896
|
def validation_error(type, what, reason)
|
@@ -981,21 +901,8 @@ class Regexp::Scanner
|
|
981
901
|
error = InvalidBackrefError.new(what, reason)
|
982
902
|
when :sequence
|
983
903
|
error = InvalidSequenceError.new(what, reason)
|
984
|
-
else
|
985
|
-
error = ValidationError.new('expression')
|
986
904
|
end
|
987
905
|
|
988
906
|
raise error # unless @@config.validation_ignore
|
989
907
|
end
|
990
|
-
|
991
|
-
# Used for references with an empty name or number
|
992
|
-
def empty_backref_error(type, what)
|
993
|
-
validation_error(:backref, what, 'ref ID is empty')
|
994
|
-
end
|
995
|
-
|
996
|
-
# Used for named expressions with an empty name
|
997
|
-
def empty_name_error(type, what)
|
998
|
-
validation_error(type, what, 'name is empty')
|
999
|
-
end
|
1000
|
-
|
1001
908
|
end # module Regexp::Scanner
|