regexp_parser 1.7.0 → 2.8.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +8 -2
- data/LICENSE +1 -1
- data/Rakefile +6 -70
- data/lib/regexp_parser/error.rb +4 -0
- data/lib/regexp_parser/expression/base.rb +76 -0
- data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
- data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
- data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +22 -2
- data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +4 -8
- data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +3 -4
- data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
- data/lib/regexp_parser/expression/classes/conditional.rb +11 -5
- data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
- data/lib/regexp_parser/expression/classes/free_space.rb +5 -5
- data/lib/regexp_parser/expression/classes/group.rb +28 -15
- data/lib/regexp_parser/expression/classes/keep.rb +2 -0
- data/lib/regexp_parser/expression/classes/literal.rb +1 -5
- data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
- data/lib/regexp_parser/expression/classes/root.rb +4 -19
- data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +5 -3
- data/lib/regexp_parser/expression/methods/construct.rb +41 -0
- data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +11 -7
- data/lib/regexp_parser/expression/methods/parts.rb +23 -0
- data/lib/regexp_parser/expression/methods/printing.rb +26 -0
- data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
- data/lib/regexp_parser/expression/methods/tests.rb +47 -1
- data/lib/regexp_parser/expression/methods/traverse.rb +34 -18
- data/lib/regexp_parser/expression/quantifier.rb +57 -17
- data/lib/regexp_parser/expression/sequence.rb +11 -47
- data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
- data/lib/regexp_parser/expression/shared.rb +111 -0
- data/lib/regexp_parser/expression/subexpression.rb +27 -19
- data/lib/regexp_parser/expression.rb +14 -141
- data/lib/regexp_parser/lexer.rb +83 -41
- data/lib/regexp_parser/parser.rb +371 -429
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
- data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
- data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
- data/lib/regexp_parser/scanner/properties/long.csv +633 -0
- data/lib/regexp_parser/scanner/properties/short.csv +248 -0
- data/lib/regexp_parser/scanner/property.rl +4 -4
- data/lib/regexp_parser/scanner/scanner.rl +303 -368
- data/lib/regexp_parser/scanner.rb +1423 -1674
- data/lib/regexp_parser/syntax/any.rb +2 -7
- data/lib/regexp_parser/syntax/base.rb +92 -67
- data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
- data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
- data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
- data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
- data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
- data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
- data/lib/regexp_parser/syntax/token/escape.rb +33 -0
- data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
- data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
- data/lib/regexp_parser/syntax/token/meta.rb +20 -0
- data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
- data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
- data/lib/regexp_parser/syntax/token/unicode_property.rb +733 -0
- data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
- data/lib/regexp_parser/syntax/token.rb +45 -0
- data/lib/regexp_parser/syntax/version_lookup.rb +19 -36
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
- data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
- data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
- data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions.rb +3 -1
- data/lib/regexp_parser/syntax.rb +8 -6
- data/lib/regexp_parser/token.rb +9 -20
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +0 -2
- data/regexp_parser.gemspec +19 -23
- metadata +52 -171
- data/CHANGELOG.md +0 -349
- data/README.md +0 -470
- data/lib/regexp_parser/scanner/properties/long.yml +0 -594
- data/lib/regexp_parser/scanner/properties/short.yml +0 -237
- data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
- data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
- data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
- data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
- data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
- data/lib/regexp_parser/syntax/tokens.rb +0 -45
- data/spec/expression/base_spec.rb +0 -94
- data/spec/expression/clone_spec.rb +0 -120
- data/spec/expression/conditional_spec.rb +0 -89
- data/spec/expression/free_space_spec.rb +0 -27
- data/spec/expression/methods/match_length_spec.rb +0 -161
- data/spec/expression/methods/match_spec.rb +0 -25
- data/spec/expression/methods/strfregexp_spec.rb +0 -224
- data/spec/expression/methods/tests_spec.rb +0 -99
- data/spec/expression/methods/traverse_spec.rb +0 -161
- data/spec/expression/options_spec.rb +0 -128
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
- data/spec/expression/subexpression_spec.rb +0 -50
- data/spec/expression/to_h_spec.rb +0 -26
- data/spec/expression/to_s_spec.rb +0 -100
- data/spec/lexer/all_spec.rb +0 -22
- data/spec/lexer/conditionals_spec.rb +0 -53
- data/spec/lexer/escapes_spec.rb +0 -14
- data/spec/lexer/keep_spec.rb +0 -10
- data/spec/lexer/literals_spec.rb +0 -89
- data/spec/lexer/nesting_spec.rb +0 -99
- data/spec/lexer/refcalls_spec.rb +0 -55
- data/spec/parser/all_spec.rb +0 -43
- data/spec/parser/alternation_spec.rb +0 -88
- data/spec/parser/anchors_spec.rb +0 -17
- data/spec/parser/conditionals_spec.rb +0 -179
- data/spec/parser/errors_spec.rb +0 -30
- data/spec/parser/escapes_spec.rb +0 -121
- data/spec/parser/free_space_spec.rb +0 -130
- data/spec/parser/groups_spec.rb +0 -108
- data/spec/parser/keep_spec.rb +0 -6
- data/spec/parser/posix_classes_spec.rb +0 -8
- data/spec/parser/properties_spec.rb +0 -115
- data/spec/parser/quantifiers_spec.rb +0 -51
- data/spec/parser/refcalls_spec.rb +0 -112
- data/spec/parser/set/intersections_spec.rb +0 -127
- data/spec/parser/set/ranges_spec.rb +0 -111
- data/spec/parser/sets_spec.rb +0 -178
- data/spec/parser/types_spec.rb +0 -18
- data/spec/scanner/all_spec.rb +0 -18
- data/spec/scanner/anchors_spec.rb +0 -21
- data/spec/scanner/conditionals_spec.rb +0 -128
- data/spec/scanner/errors_spec.rb +0 -68
- data/spec/scanner/escapes_spec.rb +0 -53
- data/spec/scanner/free_space_spec.rb +0 -133
- data/spec/scanner/groups_spec.rb +0 -52
- data/spec/scanner/keep_spec.rb +0 -10
- data/spec/scanner/literals_spec.rb +0 -49
- data/spec/scanner/meta_spec.rb +0 -18
- data/spec/scanner/properties_spec.rb +0 -64
- data/spec/scanner/quantifiers_spec.rb +0 -20
- data/spec/scanner/refcalls_spec.rb +0 -36
- data/spec/scanner/sets_spec.rb +0 -102
- data/spec/scanner/types_spec.rb +0 -14
- data/spec/spec_helper.rb +0 -15
- data/spec/support/runner.rb +0 -42
- data/spec/support/shared_examples.rb +0 -77
- data/spec/support/warning_extractor.rb +0 -60
- data/spec/syntax/syntax_spec.rb +0 -48
- data/spec/syntax/syntax_token_map_spec.rb +0 -23
- data/spec/syntax/versions/1.8.6_spec.rb +0 -17
- data/spec/syntax/versions/1.9.1_spec.rb +0 -10
- data/spec/syntax/versions/1.9.3_spec.rb +0 -9
- data/spec/syntax/versions/2.0.0_spec.rb +0 -13
- data/spec/syntax/versions/2.2.0_spec.rb +0 -9
- data/spec/syntax/versions/aliases_spec.rb +0 -37
- data/spec/token/token_spec.rb +0 -85
- /data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
@@ -3,6 +3,11 @@
|
|
3
3
|
include re_char_type "char_type.rl";
|
4
4
|
include re_property "property.rl";
|
5
5
|
|
6
|
+
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
7
|
+
utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
|
8
|
+
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
9
|
+
utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
|
10
|
+
|
6
11
|
dot = '.';
|
7
12
|
backslash = '\\';
|
8
13
|
alternation = '|';
|
@@ -15,26 +20,15 @@
|
|
15
20
|
|
16
21
|
group_open = '(';
|
17
22
|
group_close = ')';
|
18
|
-
|
23
|
+
parentheses = group_open | group_close;
|
19
24
|
|
20
25
|
set_open = '[';
|
21
26
|
set_close = ']';
|
22
27
|
brackets = set_open | set_close;
|
23
28
|
|
24
|
-
comment = ('#' . [^\n]* . '\n');
|
25
|
-
|
26
|
-
class_name_posix = 'alnum' | 'alpha' | 'blank' |
|
27
|
-
'cntrl' | 'digit' | 'graph' |
|
28
|
-
'lower' | 'print' | 'punct' |
|
29
|
-
'space' | 'upper' | 'xdigit' |
|
30
|
-
'word' | 'ascii';
|
31
|
-
|
32
|
-
class_posix = ('[:' . '^'? . class_name_posix . ':]');
|
33
|
-
|
29
|
+
comment = ('#' . [^\n]* . '\n'?);
|
34
30
|
|
35
|
-
|
36
|
-
collating_sequence = '[.' . (alpha | [\-])+ . '.]';
|
37
|
-
character_equivalent = '[=' . alpha . '=]';
|
31
|
+
class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
|
38
32
|
|
39
33
|
line_anchor = beginning_of_line | end_of_line;
|
40
34
|
anchor_char = [AbBzZG];
|
@@ -53,21 +47,20 @@
|
|
53
47
|
|
54
48
|
meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
|
55
49
|
|
50
|
+
sequence_char = [CMcux];
|
51
|
+
|
56
52
|
zero_or_one = '?' | '??' | '?+';
|
57
53
|
zero_or_more = '*' | '*?' | '*+';
|
58
54
|
one_or_more = '+' | '+?' | '++';
|
59
55
|
|
60
56
|
quantifier_greedy = '?' | '*' | '+';
|
61
|
-
quantifier_reluctant = '??' | '*?' | '+?';
|
62
|
-
quantifier_possessive = '?+' | '*+' | '++';
|
63
|
-
quantifier_mode = '?' | '+';
|
64
|
-
|
65
|
-
quantifier_interval = range_open . (digit+)? . ','? . (digit+)? .
|
66
|
-
range_close . quantifier_mode?;
|
67
|
-
|
68
|
-
quantifiers = quantifier_greedy | quantifier_reluctant |
|
69
|
-
quantifier_possessive | quantifier_interval;
|
70
57
|
|
58
|
+
quantity_exact = (digit+);
|
59
|
+
quantity_minimum = (digit+) . ',';
|
60
|
+
quantity_maximum = ',' . (digit+);
|
61
|
+
quantity_range = (digit+) . ',' . (digit+);
|
62
|
+
quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
|
63
|
+
quantity_maximum | quantity_range ) . range_close;
|
71
64
|
|
72
65
|
conditional = '(?(';
|
73
66
|
|
@@ -85,22 +78,22 @@
|
|
85
78
|
# try to treat every other group head as options group, like Ruby
|
86
79
|
group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
|
87
80
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
group_number = '-'? . [1-9] . ([0-9]+)?;
|
81
|
+
group_name_id_ab = ([^!0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
|
82
|
+
group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
|
83
|
+
group_number = '-'? . [0-9]+;
|
92
84
|
group_level = [+\-] . [0-9]+;
|
93
85
|
|
94
|
-
group_name = ('<' .
|
86
|
+
group_name = ('<' . group_name_id_ab? . '>') |
|
87
|
+
("'" . group_name_id_sq? . "'");
|
95
88
|
group_lookup = group_name | group_number;
|
96
89
|
|
97
90
|
group_named = ('?' . group_name );
|
98
91
|
|
99
|
-
|
100
|
-
|
92
|
+
group_ref_body = (('<' . (group_name_id_ab? | group_number) . group_level? '>') |
|
93
|
+
("'" . (group_name_id_sq? | group_number) . group_level? "'"));
|
101
94
|
|
102
|
-
|
103
|
-
|
95
|
+
group_ref = 'k' . group_ref_body;
|
96
|
+
group_call = 'g' . group_ref_body;
|
104
97
|
|
105
98
|
group_type = group_atomic | group_passive | group_absence | group_named;
|
106
99
|
|
@@ -111,32 +104,33 @@
|
|
111
104
|
|
112
105
|
# characters that 'break' a literal
|
113
106
|
meta_char = dot | backslash | alternation |
|
114
|
-
curlies |
|
107
|
+
curlies | parentheses | brackets |
|
115
108
|
line_anchor | quantifier_greedy;
|
116
109
|
|
117
|
-
|
118
|
-
ascii_nonprint = (0x01..0x1f | 0x7f);
|
110
|
+
literal_delimiters = ']' | '}';
|
119
111
|
|
120
|
-
|
121
|
-
|
122
|
-
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
112
|
+
ascii_print = ((0x20..0x7e) - meta_char - '#');
|
113
|
+
ascii_nonprint = (0x01..0x1f | 0x7f);
|
123
114
|
|
124
115
|
non_literal_escape = char_type_char | anchor_char | escaped_ascii |
|
125
|
-
|
116
|
+
keep_mark | sequence_char;
|
117
|
+
|
118
|
+
# escapes that also work within a character set
|
119
|
+
set_escape = backslash | brackets | escaped_ascii |
|
120
|
+
octal_sequence | property_char |
|
121
|
+
sequence_char | single_codepoint_char_type;
|
126
122
|
|
127
|
-
non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
|
128
|
-
multi_codepoint_char_type | [0-9cCM];
|
129
123
|
|
130
124
|
# EOF error, used where it can be detected
|
131
125
|
action premature_end_error {
|
132
|
-
text =
|
133
|
-
raise PrematureEndError.new(
|
126
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
127
|
+
raise PrematureEndError.new(text)
|
134
128
|
}
|
135
129
|
|
136
130
|
# Invalid sequence error, used from sequences, like escapes and sets
|
137
131
|
action invalid_sequence_error {
|
138
|
-
text =
|
139
|
-
|
132
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
133
|
+
raise ValidationError.for(:sequence, 'sequence', text)
|
140
134
|
}
|
141
135
|
|
142
136
|
# group (nesting) and set open/close actions
|
@@ -150,7 +144,7 @@
|
|
150
144
|
# --------------------------------------------------------------------------
|
151
145
|
character_set := |*
|
152
146
|
set_close > (set_meta, 2) @set_closed {
|
153
|
-
emit(:set, :close,
|
147
|
+
emit(:set, :close, copy(data, ts, te))
|
154
148
|
if in_set?
|
155
149
|
fret;
|
156
150
|
else
|
@@ -159,8 +153,8 @@
|
|
159
153
|
};
|
160
154
|
|
161
155
|
'-]' @set_closed { # special case, emits two tokens
|
162
|
-
emit(:literal, :literal,
|
163
|
-
emit(:set, :close,
|
156
|
+
emit(:literal, :literal, '-')
|
157
|
+
emit(:set, :close, ']')
|
164
158
|
if in_set?
|
165
159
|
fret;
|
166
160
|
else
|
@@ -169,33 +163,32 @@
|
|
169
163
|
};
|
170
164
|
|
171
165
|
'-&&' { # special case, emits two tokens
|
172
|
-
emit(:literal, :literal, '-'
|
173
|
-
emit(:set, :intersection, '&&'
|
166
|
+
emit(:literal, :literal, '-')
|
167
|
+
emit(:set, :intersection, '&&')
|
174
168
|
};
|
175
169
|
|
176
170
|
'^' {
|
177
|
-
|
178
|
-
|
179
|
-
emit(:set, :negate, text, ts, te)
|
171
|
+
if prev_token[1] == :open
|
172
|
+
emit(:set, :negate, '^')
|
180
173
|
else
|
181
|
-
emit(:literal, :literal,
|
174
|
+
emit(:literal, :literal, '^')
|
182
175
|
end
|
183
176
|
};
|
184
177
|
|
185
178
|
'-' {
|
186
|
-
|
187
|
-
#
|
188
|
-
if
|
189
|
-
emit(:literal, :literal,
|
179
|
+
# ranges cant start with the opening bracket, a subset, or
|
180
|
+
# intersection/negation/range operators
|
181
|
+
if prev_token[0] == :set
|
182
|
+
emit(:literal, :literal, '-')
|
190
183
|
else
|
191
|
-
emit(:set, :range,
|
184
|
+
emit(:set, :range, '-')
|
192
185
|
end
|
193
186
|
};
|
194
187
|
|
195
188
|
# Unlike ranges, intersections can start or end at set boundaries, whereupon
|
196
189
|
# they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
|
197
190
|
'&&' {
|
198
|
-
emit(:set, :intersection,
|
191
|
+
emit(:set, :intersection, '&&')
|
199
192
|
};
|
200
193
|
|
201
194
|
backslash {
|
@@ -203,59 +196,60 @@
|
|
203
196
|
};
|
204
197
|
|
205
198
|
set_open >(open_bracket, 1) >set_opened {
|
206
|
-
emit(:set, :open,
|
199
|
+
emit(:set, :open, '[')
|
207
200
|
fcall character_set;
|
208
201
|
};
|
209
202
|
|
210
|
-
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error)
|
211
|
-
text =
|
203
|
+
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
204
|
+
text = copy(data, ts, te)
|
212
205
|
|
213
206
|
type = :posixclass
|
214
207
|
class_name = text[2..-3]
|
215
|
-
if class_name[0]
|
208
|
+
if class_name[0] == '^'
|
216
209
|
class_name = class_name[1..-1]
|
217
210
|
type = :nonposixclass
|
218
211
|
end
|
219
212
|
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
224
|
-
emit(:set, :collation, *text(data, ts, te))
|
225
|
-
};
|
213
|
+
unless self.class.posix_classes.include?(class_name)
|
214
|
+
raise ValidationError.for(:posix_class, text)
|
215
|
+
end
|
226
216
|
|
227
|
-
|
228
|
-
emit(:set, :equivalent, *text(data, ts, te))
|
217
|
+
emit(type, class_name.to_sym, text)
|
229
218
|
};
|
230
219
|
|
231
220
|
meta_char > (set_meta, 1) {
|
232
|
-
emit(:literal, :literal,
|
221
|
+
emit(:literal, :literal, copy(data, ts, te))
|
233
222
|
};
|
234
223
|
|
235
|
-
any
|
236
|
-
|
237
|
-
|
238
|
-
utf8_3_byte |
|
239
|
-
utf8_4_byte {
|
240
|
-
char, *rest = *text(data, ts, te)
|
241
|
-
char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
|
242
|
-
emit(:literal, :literal, char, *rest)
|
224
|
+
any | ascii_nonprint | utf8_multibyte {
|
225
|
+
text = copy(data, ts, te)
|
226
|
+
emit(:literal, :literal, text)
|
243
227
|
};
|
244
228
|
*|;
|
245
229
|
|
246
230
|
# set escapes scanner
|
247
231
|
# --------------------------------------------------------------------------
|
248
232
|
set_escape_sequence := |*
|
249
|
-
|
250
|
-
|
233
|
+
# Special case: in sets, octal sequences have higher priority than backrefs
|
234
|
+
octal_sequence {
|
235
|
+
emit(:escape, :octal, copy(data, ts-1, te))
|
251
236
|
fret;
|
252
237
|
};
|
253
238
|
|
254
|
-
|
239
|
+
# Scan all other escapes that work in sets with the generic escape scanner
|
240
|
+
set_escape > (escaped_set_alpha, 2) {
|
255
241
|
fhold;
|
256
242
|
fnext character_set;
|
257
243
|
fcall escape_sequence;
|
258
244
|
};
|
245
|
+
|
246
|
+
# Treat all remaining escapes - those not supported in sets - as literal.
|
247
|
+
# (This currently includes \^, \-, \&, \:, although these could potentially
|
248
|
+
# be meta chars when not escaped, depending on their position in the set.)
|
249
|
+
any > (escaped_set_alpha, 1) {
|
250
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
251
|
+
fret;
|
252
|
+
};
|
259
253
|
*|;
|
260
254
|
|
261
255
|
|
@@ -263,33 +257,40 @@
|
|
263
257
|
# --------------------------------------------------------------------------
|
264
258
|
escape_sequence := |*
|
265
259
|
[1-9] {
|
266
|
-
text =
|
267
|
-
emit(:backref, :number, text
|
260
|
+
text = copy(data, ts-1, te)
|
261
|
+
emit(:backref, :number, text)
|
268
262
|
fret;
|
269
263
|
};
|
270
264
|
|
271
265
|
octal_sequence {
|
272
|
-
emit(:escape, :octal,
|
266
|
+
emit(:escape, :octal, copy(data, ts-1, te))
|
267
|
+
fret;
|
268
|
+
};
|
269
|
+
|
270
|
+
[8-9] . [0-9] { # special case, emits two tokens
|
271
|
+
text = copy(data, ts-1, te)
|
272
|
+
emit(:escape, :literal, text[0, 2])
|
273
|
+
emit(:literal, :literal, text[2])
|
273
274
|
fret;
|
274
275
|
};
|
275
276
|
|
276
277
|
meta_char {
|
277
|
-
case text =
|
278
|
-
when '\.'; emit(:escape, :dot, text
|
279
|
-
when '\|'; emit(:escape, :alternation, text
|
280
|
-
when '\^'; emit(:escape, :bol, text
|
281
|
-
when '\$'; emit(:escape, :eol, text
|
282
|
-
when '\?'; emit(:escape, :zero_or_one, text
|
283
|
-
when '\*'; emit(:escape, :zero_or_more, text
|
284
|
-
when '\+'; emit(:escape, :one_or_more, text
|
285
|
-
when '\('; emit(:escape, :group_open, text
|
286
|
-
when '\)'; emit(:escape, :group_close, text
|
287
|
-
when '\{'; emit(:escape, :interval_open, text
|
288
|
-
when '\}'; emit(:escape, :interval_close, text
|
289
|
-
when '\['; emit(:escape, :set_open, text
|
290
|
-
when '\]'; emit(:escape, :set_close, text
|
278
|
+
case text = copy(data, ts-1, te)
|
279
|
+
when '\.'; emit(:escape, :dot, text)
|
280
|
+
when '\|'; emit(:escape, :alternation, text)
|
281
|
+
when '\^'; emit(:escape, :bol, text)
|
282
|
+
when '\$'; emit(:escape, :eol, text)
|
283
|
+
when '\?'; emit(:escape, :zero_or_one, text)
|
284
|
+
when '\*'; emit(:escape, :zero_or_more, text)
|
285
|
+
when '\+'; emit(:escape, :one_or_more, text)
|
286
|
+
when '\('; emit(:escape, :group_open, text)
|
287
|
+
when '\)'; emit(:escape, :group_close, text)
|
288
|
+
when '\{'; emit(:escape, :interval_open, text)
|
289
|
+
when '\}'; emit(:escape, :interval_close, text)
|
290
|
+
when '\['; emit(:escape, :set_open, text)
|
291
|
+
when '\]'; emit(:escape, :set_close, text)
|
291
292
|
when "\\\\";
|
292
|
-
emit(:escape, :backslash, text
|
293
|
+
emit(:escape, :backslash, text)
|
293
294
|
end
|
294
295
|
fret;
|
295
296
|
};
|
@@ -297,31 +298,31 @@
|
|
297
298
|
escaped_ascii > (escaped_alpha, 7) {
|
298
299
|
# \b is emitted as backspace only when inside a character set, otherwise
|
299
300
|
# it is a word boundary anchor. A syntax might "normalize" it if needed.
|
300
|
-
case text =
|
301
|
-
when '\a'; emit(:escape, :bell, text
|
302
|
-
when '\b'; emit(:escape, :backspace, text
|
303
|
-
when '\e'; emit(:escape, :escape, text
|
304
|
-
when '\f'; emit(:escape, :form_feed, text
|
305
|
-
when '\n'; emit(:escape, :newline, text
|
306
|
-
when '\r'; emit(:escape, :carriage, text
|
307
|
-
when '\t'; emit(:escape, :tab, text
|
308
|
-
when '\v'; emit(:escape, :vertical_tab, text
|
301
|
+
case text = copy(data, ts-1, te)
|
302
|
+
when '\a'; emit(:escape, :bell, text)
|
303
|
+
when '\b'; emit(:escape, :backspace, text)
|
304
|
+
when '\e'; emit(:escape, :escape, text)
|
305
|
+
when '\f'; emit(:escape, :form_feed, text)
|
306
|
+
when '\n'; emit(:escape, :newline, text)
|
307
|
+
when '\r'; emit(:escape, :carriage, text)
|
308
|
+
when '\t'; emit(:escape, :tab, text)
|
309
|
+
when '\v'; emit(:escape, :vertical_tab, text)
|
309
310
|
end
|
310
311
|
fret;
|
311
312
|
};
|
312
313
|
|
313
314
|
codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
|
314
|
-
text =
|
315
|
-
if text[2]
|
316
|
-
emit(:escape, :codepoint_list, text
|
315
|
+
text = copy(data, ts-1, te)
|
316
|
+
if text[2] == '{'
|
317
|
+
emit(:escape, :codepoint_list, text)
|
317
318
|
else
|
318
|
-
emit(:escape, :codepoint, text
|
319
|
+
emit(:escape, :codepoint, text)
|
319
320
|
end
|
320
321
|
fret;
|
321
322
|
};
|
322
323
|
|
323
|
-
hex_sequence > (escaped_alpha, 5)
|
324
|
-
emit(:escape, :hex,
|
324
|
+
hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
|
325
|
+
emit(:escape, :hex, copy(data, ts-1, te))
|
325
326
|
fret;
|
326
327
|
};
|
327
328
|
|
@@ -351,8 +352,8 @@
|
|
351
352
|
fcall unicode_property;
|
352
353
|
};
|
353
354
|
|
354
|
-
(any -- non_literal_escape) > (escaped_alpha, 1)
|
355
|
-
emit(:escape, :literal,
|
355
|
+
(any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
|
356
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
356
357
|
fret;
|
357
358
|
};
|
358
359
|
*|;
|
@@ -362,9 +363,10 @@
|
|
362
363
|
# --------------------------------------------------------------------------
|
363
364
|
conditional_expression := |*
|
364
365
|
group_lookup . ')' {
|
365
|
-
text =
|
366
|
-
|
367
|
-
emit(:conditional, :
|
366
|
+
text = copy(data, ts, te-1)
|
367
|
+
text =~ /[^0]/ or raise ValidationError.for(:backref, 'condition', 'invalid ref ID')
|
368
|
+
emit(:conditional, :condition, text)
|
369
|
+
emit(:conditional, :condition_close, ')')
|
368
370
|
};
|
369
371
|
|
370
372
|
any {
|
@@ -381,46 +383,50 @@
|
|
381
383
|
# Meta characters
|
382
384
|
# ------------------------------------------------------------------------
|
383
385
|
dot {
|
384
|
-
emit(:meta, :dot,
|
386
|
+
emit(:meta, :dot, copy(data, ts, te))
|
385
387
|
};
|
386
388
|
|
387
389
|
alternation {
|
388
390
|
if conditional_stack.last == group_depth
|
389
|
-
emit(:conditional, :separator,
|
391
|
+
emit(:conditional, :separator, copy(data, ts, te))
|
390
392
|
else
|
391
|
-
emit(:meta, :alternation,
|
393
|
+
emit(:meta, :alternation, copy(data, ts, te))
|
392
394
|
end
|
393
395
|
};
|
394
396
|
|
395
397
|
# Anchors
|
396
398
|
# ------------------------------------------------------------------------
|
397
399
|
beginning_of_line {
|
398
|
-
emit(:anchor, :bol,
|
400
|
+
emit(:anchor, :bol, copy(data, ts, te))
|
399
401
|
};
|
400
402
|
|
401
403
|
end_of_line {
|
402
|
-
emit(:anchor, :eol,
|
404
|
+
emit(:anchor, :eol, copy(data, ts, te))
|
403
405
|
};
|
404
406
|
|
405
407
|
backslash . keep_mark > (backslashed, 4) {
|
406
|
-
emit(:keep, :mark,
|
408
|
+
emit(:keep, :mark, copy(data, ts, te))
|
407
409
|
};
|
408
410
|
|
409
411
|
backslash . anchor_char > (backslashed, 3) {
|
410
|
-
case text =
|
411
|
-
when '
|
412
|
-
when '
|
413
|
-
when '
|
414
|
-
when '
|
415
|
-
when '
|
416
|
-
when '
|
412
|
+
case text = copy(data, ts, te)
|
413
|
+
when '\A'; emit(:anchor, :bos, text)
|
414
|
+
when '\z'; emit(:anchor, :eos, text)
|
415
|
+
when '\Z'; emit(:anchor, :eos_ob_eol, text)
|
416
|
+
when '\b'; emit(:anchor, :word_boundary, text)
|
417
|
+
when '\B'; emit(:anchor, :nonword_boundary, text)
|
418
|
+
when '\G'; emit(:anchor, :match_start, text)
|
417
419
|
end
|
418
420
|
};
|
419
421
|
|
422
|
+
literal_delimiters {
|
423
|
+
append_literal(data, ts, te)
|
424
|
+
};
|
425
|
+
|
420
426
|
# Character sets
|
421
427
|
# ------------------------------------------------------------------------
|
422
428
|
set_open >set_opened {
|
423
|
-
emit(:set, :open,
|
429
|
+
emit(:set, :open, copy(data, ts, te))
|
424
430
|
fcall character_set;
|
425
431
|
};
|
426
432
|
|
@@ -429,23 +435,22 @@
|
|
429
435
|
# (?(condition)Y|N) conditional expression
|
430
436
|
# ------------------------------------------------------------------------
|
431
437
|
conditional {
|
432
|
-
text =
|
438
|
+
text = copy(data, ts, te)
|
433
439
|
|
434
440
|
conditional_stack << group_depth
|
435
441
|
|
436
|
-
emit(:conditional, :open, text[0..-2]
|
437
|
-
emit(:conditional, :condition_open, '('
|
442
|
+
emit(:conditional, :open, text[0..-2])
|
443
|
+
emit(:conditional, :condition_open, '(')
|
438
444
|
fcall conditional_expression;
|
439
445
|
};
|
440
446
|
|
441
447
|
|
442
448
|
# (?#...) comments: parsed as a single expression, without introducing a
|
443
449
|
# new nesting level. Comments may not include parentheses, escaped or not.
|
444
|
-
# special case for close
|
445
|
-
# correct closing count.
|
450
|
+
# special case for close to get the correct closing count.
|
446
451
|
# ------------------------------------------------------------------------
|
447
|
-
group_open . group_comment
|
448
|
-
emit(:group, :comment,
|
452
|
+
(group_open . group_comment) @group_closed {
|
453
|
+
emit(:group, :comment, copy(data, ts, te))
|
449
454
|
};
|
450
455
|
|
451
456
|
# Expression options:
|
@@ -459,12 +464,12 @@
|
|
459
464
|
#
|
460
465
|
# (?imxdau-imx:subexp) option on/off for subexp
|
461
466
|
# ------------------------------------------------------------------------
|
462
|
-
group_open . group_options >group_opened {
|
463
|
-
text =
|
467
|
+
(group_open . group_options) >group_opened {
|
468
|
+
text = copy(data, ts, te)
|
464
469
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
465
|
-
raise
|
470
|
+
raise ValidationError.for(:group_option, $1 || "-#{$2}", text)
|
466
471
|
end
|
467
|
-
emit_options(text
|
472
|
+
emit_options(text)
|
468
473
|
};
|
469
474
|
|
470
475
|
# Assertions
|
@@ -473,12 +478,12 @@
|
|
473
478
|
# (?<=subexp) look-behind
|
474
479
|
# (?<!subexp) negative look-behind
|
475
480
|
# ------------------------------------------------------------------------
|
476
|
-
group_open . assertion_type >group_opened {
|
477
|
-
case text =
|
478
|
-
when '(?='; emit(:assertion, :lookahead, text
|
479
|
-
when '(?!'; emit(:assertion, :nlookahead, text
|
480
|
-
when '(?<='; emit(:assertion, :lookbehind, text
|
481
|
-
when '(?<!'; emit(:assertion, :nlookbehind, text
|
481
|
+
(group_open . assertion_type) >group_opened {
|
482
|
+
case text = copy(data, ts, te)
|
483
|
+
when '(?='; emit(:assertion, :lookahead, text)
|
484
|
+
when '(?!'; emit(:assertion, :nlookahead, text)
|
485
|
+
when '(?<='; emit(:assertion, :lookbehind, text)
|
486
|
+
when '(?<!'; emit(:assertion, :nlookbehind, text)
|
482
487
|
end
|
483
488
|
};
|
484
489
|
|
@@ -490,106 +495,78 @@
|
|
490
495
|
# (?'name'subexp) named group (single quoted version)
|
491
496
|
# (subexp) captured group
|
492
497
|
# ------------------------------------------------------------------------
|
493
|
-
group_open . group_type >group_opened {
|
494
|
-
case text =
|
495
|
-
when '(?:'; emit(:group, :passive, text
|
496
|
-
when '(?>'; emit(:group, :atomic, text
|
497
|
-
when '(?~'; emit(:group, :absence, text
|
498
|
+
(group_open . group_type) >group_opened {
|
499
|
+
case text = copy(data, ts, te)
|
500
|
+
when '(?:'; emit(:group, :passive, text)
|
501
|
+
when '(?>'; emit(:group, :atomic, text)
|
502
|
+
when '(?~'; emit(:group, :absence, text)
|
498
503
|
|
499
504
|
when /^\(\?(?:<>|'')/
|
500
|
-
|
505
|
+
raise ValidationError.for(:group, 'named group', 'name is empty')
|
501
506
|
|
502
|
-
when /^\(
|
503
|
-
emit(:group, :named_ab, text
|
507
|
+
when /^\(\?<[^>]+>/
|
508
|
+
emit(:group, :named_ab, text)
|
504
509
|
|
505
|
-
when /^\(\?'
|
506
|
-
emit(:group, :named_sq, text
|
510
|
+
when /^\(\?'[^']+'/
|
511
|
+
emit(:group, :named_sq, text)
|
507
512
|
|
508
513
|
end
|
509
514
|
};
|
510
515
|
|
511
516
|
group_open @group_opened {
|
512
|
-
text =
|
513
|
-
emit(:group, :capture, text
|
517
|
+
text = copy(data, ts, te)
|
518
|
+
emit(:group, :capture, text)
|
514
519
|
};
|
515
520
|
|
516
521
|
group_close @group_closed {
|
517
522
|
if conditional_stack.last == group_depth + 1
|
518
523
|
conditional_stack.pop
|
519
|
-
emit(:conditional, :close,
|
520
|
-
|
524
|
+
emit(:conditional, :close, ')')
|
525
|
+
elsif group_depth >= 0
|
521
526
|
if spacing_stack.length > 1 &&
|
522
527
|
spacing_stack.last[:depth] == group_depth + 1
|
523
528
|
spacing_stack.pop
|
524
529
|
self.free_spacing = spacing_stack.last[:free_spacing]
|
525
530
|
end
|
526
531
|
|
527
|
-
emit(:group, :close,
|
532
|
+
emit(:group, :close, ')')
|
533
|
+
else
|
534
|
+
raise ValidationError.for(:group, 'group', 'unmatched close parenthesis')
|
528
535
|
end
|
529
536
|
};
|
530
537
|
|
531
538
|
|
532
539
|
# Group backreference, named and numbered
|
533
540
|
# ------------------------------------------------------------------------
|
534
|
-
backslash . (
|
535
|
-
case text =
|
536
|
-
when /^\\([
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
emit(:backref, :name_call_sq, text, ts, te)
|
551
|
-
end
|
552
|
-
|
553
|
-
when /^\\([gk])<\d+>/ # angle-brackets
|
554
|
-
if $1 == 'k'
|
555
|
-
emit(:backref, :number_ref_ab, text, ts, te)
|
556
|
-
else
|
557
|
-
emit(:backref, :number_call_ab, text, ts, te)
|
558
|
-
end
|
559
|
-
|
560
|
-
when /^\\([gk])'\d+'/ # single quotes
|
561
|
-
if $1 == 'k'
|
562
|
-
emit(:backref, :number_ref_sq, text, ts, te)
|
563
|
-
else
|
564
|
-
emit(:backref, :number_call_sq, text, ts, te)
|
565
|
-
end
|
566
|
-
|
567
|
-
when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
|
568
|
-
if $1 == 'k'
|
569
|
-
emit(:backref, :number_rel_ref_ab, text, ts, te)
|
570
|
-
else
|
571
|
-
emit(:backref, :number_rel_call_ab, text, ts, te)
|
572
|
-
end
|
573
|
-
|
574
|
-
when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
|
575
|
-
if $1 == 'k'
|
576
|
-
emit(:backref, :number_rel_ref_sq, text, ts, te)
|
577
|
-
else
|
578
|
-
emit(:backref, :number_rel_call_sq, text, ts, te)
|
579
|
-
end
|
580
|
-
|
581
|
-
when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
|
582
|
-
emit(:backref, :name_recursion_ref_ab, text, ts, te)
|
583
|
-
|
584
|
-
when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
|
585
|
-
emit(:backref, :name_recursion_ref_sq, text, ts, te)
|
586
|
-
|
587
|
-
when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
|
588
|
-
emit(:backref, :number_recursion_ref_ab, text, ts, te)
|
589
|
-
|
590
|
-
when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
|
591
|
-
emit(:backref, :number_recursion_ref_sq, text, ts, te)
|
541
|
+
backslash . (group_ref) > (backslashed, 4) {
|
542
|
+
case text = copy(data, ts, te)
|
543
|
+
when /^\\k(.)[^0-9\-][^+\-]*['>]$/
|
544
|
+
emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
|
545
|
+
when /^\\k(.)0*[1-9]\d*['>]$/
|
546
|
+
emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
|
547
|
+
when /^\\k(.)-0*[1-9]\d*['>]$/
|
548
|
+
emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
|
549
|
+
when /^\\k(.)[^0-9\-].*[+\-]\d+['>]$/
|
550
|
+
emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
|
551
|
+
when /^\\k(.)-?0*[1-9]\d*[+\-]\d+['>]$/
|
552
|
+
emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
|
553
|
+
else
|
554
|
+
raise ValidationError.for(:backref, 'backreference', 'invalid ref ID')
|
555
|
+
end
|
556
|
+
};
|
592
557
|
|
558
|
+
# Group call, named and numbered
|
559
|
+
# ------------------------------------------------------------------------
|
560
|
+
backslash . (group_call) > (backslashed, 4) {
|
561
|
+
case text = copy(data, ts, te)
|
562
|
+
when /^\\g(.)[^0-9+\-].*['>]$/
|
563
|
+
emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
|
564
|
+
when /^\\g(.)(?:0|0*[1-9]\d*)['>]$/
|
565
|
+
emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
|
566
|
+
when /^\\g(.)[+-]0*[1-9]\d*/
|
567
|
+
emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
|
568
|
+
else
|
569
|
+
raise ValidationError.for(:backref, 'subexpression call', 'invalid ref ID')
|
593
570
|
end
|
594
571
|
};
|
595
572
|
|
@@ -597,31 +574,36 @@
|
|
597
574
|
# Quantifiers
|
598
575
|
# ------------------------------------------------------------------------
|
599
576
|
zero_or_one {
|
600
|
-
case text =
|
601
|
-
when '?' ; emit(:quantifier, :zero_or_one, text
|
602
|
-
when '??'; emit(:quantifier, :zero_or_one_reluctant, text
|
603
|
-
when '?+'; emit(:quantifier, :zero_or_one_possessive, text
|
577
|
+
case text = copy(data, ts, te)
|
578
|
+
when '?' ; emit(:quantifier, :zero_or_one, text)
|
579
|
+
when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
|
580
|
+
when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
|
604
581
|
end
|
605
582
|
};
|
606
583
|
|
607
584
|
zero_or_more {
|
608
|
-
case text =
|
609
|
-
when '*' ; emit(:quantifier, :zero_or_more, text
|
610
|
-
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text
|
611
|
-
when '*+'; emit(:quantifier, :zero_or_more_possessive, text
|
585
|
+
case text = copy(data, ts, te)
|
586
|
+
when '*' ; emit(:quantifier, :zero_or_more, text)
|
587
|
+
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
|
588
|
+
when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
|
612
589
|
end
|
613
590
|
};
|
614
591
|
|
615
592
|
one_or_more {
|
616
|
-
case text =
|
617
|
-
when '+' ; emit(:quantifier, :one_or_more, text
|
618
|
-
when '+?'; emit(:quantifier, :one_or_more_reluctant, text
|
619
|
-
when '++'; emit(:quantifier, :one_or_more_possessive, text
|
593
|
+
case text = copy(data, ts, te)
|
594
|
+
when '+' ; emit(:quantifier, :one_or_more, text)
|
595
|
+
when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
|
596
|
+
when '++'; emit(:quantifier, :one_or_more_possessive, text)
|
620
597
|
end
|
621
598
|
};
|
622
599
|
|
623
|
-
quantifier_interval
|
624
|
-
emit(:quantifier, :interval,
|
600
|
+
quantifier_interval {
|
601
|
+
emit(:quantifier, :interval, copy(data, ts, te))
|
602
|
+
};
|
603
|
+
|
604
|
+
# Catch unmatched curly braces as literals
|
605
|
+
range_open {
|
606
|
+
append_literal(data, ts, te)
|
625
607
|
};
|
626
608
|
|
627
609
|
# Escaped sequences
|
@@ -632,15 +614,17 @@
|
|
632
614
|
|
633
615
|
comment {
|
634
616
|
if free_spacing
|
635
|
-
emit(:free_space, :comment,
|
617
|
+
emit(:free_space, :comment, copy(data, ts, te))
|
636
618
|
else
|
637
|
-
|
619
|
+
# consume only the pound sign (#) and backtrack to do regular scanning
|
620
|
+
append_literal(data, ts, ts + 1)
|
621
|
+
fexec ts + 1;
|
638
622
|
end
|
639
623
|
};
|
640
624
|
|
641
625
|
space+ {
|
642
626
|
if free_spacing
|
643
|
-
emit(:free_space, :whitespace,
|
627
|
+
emit(:free_space, :whitespace, copy(data, ts, te))
|
644
628
|
else
|
645
629
|
append_literal(data, ts, te)
|
646
630
|
end
|
@@ -649,105 +633,47 @@
|
|
649
633
|
# Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
|
650
634
|
# except meta characters.
|
651
635
|
# ------------------------------------------------------------------------
|
652
|
-
(ascii_print -- space)+
|
653
|
-
ascii_nonprint+ |
|
654
|
-
utf8_2_byte+ |
|
655
|
-
utf8_3_byte+ |
|
656
|
-
utf8_4_byte+ {
|
636
|
+
(ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
|
657
637
|
append_literal(data, ts, te)
|
658
638
|
};
|
659
639
|
|
660
640
|
*|;
|
661
641
|
}%%
|
662
642
|
|
663
|
-
|
664
|
-
|
643
|
+
require 'regexp_parser/scanner/errors/scanner_error'
|
644
|
+
require 'regexp_parser/scanner/errors/premature_end_error'
|
645
|
+
require 'regexp_parser/scanner/errors/validation_error'
|
665
646
|
|
666
647
|
class Regexp::Scanner
|
667
|
-
# General scanner error (catch all)
|
668
|
-
class ScannerError < StandardError; end
|
669
|
-
|
670
|
-
# Base for all scanner validation errors
|
671
|
-
class ValidationError < StandardError
|
672
|
-
def initialize(reason)
|
673
|
-
super reason
|
674
|
-
end
|
675
|
-
end
|
676
|
-
|
677
|
-
# Unexpected end of pattern
|
678
|
-
class PrematureEndError < ScannerError
|
679
|
-
def initialize(where = '')
|
680
|
-
super "Premature end of pattern at #{where}"
|
681
|
-
end
|
682
|
-
end
|
683
|
-
|
684
|
-
# Invalid sequence format. Used for escape sequences, mainly.
|
685
|
-
class InvalidSequenceError < ValidationError
|
686
|
-
def initialize(what = 'sequence', where = '')
|
687
|
-
super "Invalid #{what} at #{where}"
|
688
|
-
end
|
689
|
-
end
|
690
|
-
|
691
|
-
# Invalid group. Used for named groups.
|
692
|
-
class InvalidGroupError < ValidationError
|
693
|
-
def initialize(what, reason)
|
694
|
-
super "Invalid #{what}, #{reason}."
|
695
|
-
end
|
696
|
-
end
|
697
|
-
|
698
|
-
# Invalid groupOption. Used for inline options.
|
699
|
-
class InvalidGroupOption < ValidationError
|
700
|
-
def initialize(option, text)
|
701
|
-
super "Invalid group option #{option} in #{text}"
|
702
|
-
end
|
703
|
-
end
|
704
|
-
|
705
|
-
# Invalid back reference. Used for name and number refs/calls.
|
706
|
-
class InvalidBackrefError < ValidationError
|
707
|
-
def initialize(what, reason)
|
708
|
-
super "Invalid back reference #{what}, #{reason}"
|
709
|
-
end
|
710
|
-
end
|
711
|
-
|
712
|
-
# The property name was not recognized by the scanner.
|
713
|
-
class UnknownUnicodePropertyError < ValidationError
|
714
|
-
def initialize(name)
|
715
|
-
super "Unknown unicode character property name #{name}"
|
716
|
-
end
|
717
|
-
end
|
718
|
-
|
719
648
|
# Scans the given regular expression text, or Regexp object and collects the
|
720
649
|
# emitted token into an array that gets returned at the end. If a block is
|
721
650
|
# given, it gets called for each emitted token.
|
722
651
|
#
|
723
652
|
# This method may raise errors if a syntax error is encountered.
|
724
653
|
# --------------------------------------------------------------------------
|
725
|
-
def self.scan(input_object, &block)
|
726
|
-
new.scan(input_object, &block)
|
654
|
+
def self.scan(input_object, options: nil, collect_tokens: true, &block)
|
655
|
+
new.scan(input_object, options: options, collect_tokens: collect_tokens, &block)
|
727
656
|
end
|
728
657
|
|
729
|
-
def scan(input_object, &block)
|
730
|
-
self.
|
658
|
+
def scan(input_object, options: nil, collect_tokens: true, &block)
|
659
|
+
self.collect_tokens = collect_tokens
|
660
|
+
self.literal_run = nil
|
731
661
|
stack = []
|
732
662
|
|
733
|
-
|
734
|
-
|
735
|
-
self.free_spacing = (input_object.options & Regexp::EXTENDED != 0)
|
736
|
-
else
|
737
|
-
input = input_object
|
738
|
-
self.free_spacing = false
|
739
|
-
end
|
663
|
+
input = input_object.is_a?(Regexp) ? input_object.source : input_object
|
664
|
+
self.free_spacing = free_spacing?(input_object, options)
|
740
665
|
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
741
666
|
|
742
|
-
data = input.unpack("c*")
|
667
|
+
data = input.unpack("c*")
|
743
668
|
eof = data.length
|
744
669
|
|
745
670
|
self.tokens = []
|
746
|
-
self.block =
|
671
|
+
self.block = block
|
747
672
|
|
748
673
|
self.set_depth = 0
|
749
674
|
self.group_depth = 0
|
750
675
|
self.conditional_stack = []
|
676
|
+
self.char_pos = 0
|
751
677
|
|
752
678
|
%% write data;
|
753
679
|
%% write init;
|
@@ -757,7 +683,7 @@ class Regexp::Scanner
|
|
757
683
|
testEof = testEof
|
758
684
|
|
759
685
|
if cs == re_scanner_error
|
760
|
-
text =
|
686
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
761
687
|
raise ScannerError.new("Scan error at '#{text}'")
|
762
688
|
end
|
763
689
|
|
@@ -767,40 +693,76 @@ class Regexp::Scanner
|
|
767
693
|
"[#{set_depth}]") if in_set?
|
768
694
|
|
769
695
|
# when the entire expression is a literal run
|
770
|
-
emit_literal if
|
696
|
+
emit_literal if literal_run
|
771
697
|
|
772
698
|
tokens
|
773
699
|
end
|
774
700
|
|
775
701
|
# lazy-load property maps when first needed
|
776
|
-
require 'yaml'
|
777
|
-
PROP_MAPS_DIR = File.expand_path('../scanner/properties', __FILE__)
|
778
|
-
|
779
702
|
def self.short_prop_map
|
780
|
-
@short_prop_map ||=
|
703
|
+
@short_prop_map ||= parse_prop_map('short')
|
781
704
|
end
|
782
705
|
|
783
706
|
def self.long_prop_map
|
784
|
-
@long_prop_map ||=
|
707
|
+
@long_prop_map ||= parse_prop_map('long')
|
708
|
+
end
|
709
|
+
|
710
|
+
def self.parse_prop_map(name)
|
711
|
+
File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
|
712
|
+
end
|
713
|
+
|
714
|
+
def self.posix_classes
|
715
|
+
%w[alnum alpha ascii blank cntrl digit graph
|
716
|
+
lower print punct space upper word xdigit]
|
785
717
|
end
|
786
718
|
|
787
719
|
# Emits an array with the details of the scanned pattern
|
788
|
-
def emit(type, token, text
|
720
|
+
def emit(type, token, text)
|
789
721
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
790
722
|
|
791
|
-
emit_literal if
|
723
|
+
emit_literal if literal_run
|
724
|
+
|
725
|
+
# Ragel runs with byte-based indices (ts, te). These are of little value to
|
726
|
+
# end-users, so we keep track of char-based indices and emit those instead.
|
727
|
+
ts_char_pos = char_pos
|
728
|
+
te_char_pos = char_pos + text.length
|
729
|
+
|
730
|
+
tok = [type, token, text, ts_char_pos, te_char_pos]
|
731
|
+
|
732
|
+
self.prev_token = tok
|
733
|
+
|
734
|
+
self.char_pos = te_char_pos
|
792
735
|
|
793
736
|
if block
|
794
|
-
block.call type, token, text,
|
737
|
+
block.call type, token, text, ts_char_pos, te_char_pos
|
738
|
+
# TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given
|
739
|
+
tokens << tok if collect_tokens
|
740
|
+
elsif collect_tokens
|
741
|
+
tokens << tok
|
795
742
|
end
|
796
|
-
|
797
|
-
tokens << [type, token, text, ts, te]
|
798
743
|
end
|
799
744
|
|
745
|
+
attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
|
746
|
+
|
800
747
|
private
|
801
748
|
|
802
|
-
attr_accessor :
|
803
|
-
:
|
749
|
+
attr_accessor :block,
|
750
|
+
:collect_tokens, :tokens, :prev_token,
|
751
|
+
:free_spacing, :spacing_stack,
|
752
|
+
:group_depth, :set_depth, :conditional_stack,
|
753
|
+
:char_pos
|
754
|
+
|
755
|
+
def free_spacing?(input_object, options)
|
756
|
+
if options && !input_object.is_a?(String)
|
757
|
+
raise ArgumentError, 'options cannot be supplied unless scanning a String'
|
758
|
+
end
|
759
|
+
|
760
|
+
options = input_object.options if input_object.is_a?(::Regexp)
|
761
|
+
|
762
|
+
return false unless options
|
763
|
+
|
764
|
+
options & Regexp::EXTENDED != 0
|
765
|
+
end
|
804
766
|
|
805
767
|
def in_group?
|
806
768
|
group_depth > 0
|
@@ -811,36 +773,24 @@ class Regexp::Scanner
|
|
811
773
|
end
|
812
774
|
|
813
775
|
# Copy from ts to te from data as text
|
814
|
-
def copy(data,
|
815
|
-
data[
|
816
|
-
end
|
817
|
-
|
818
|
-
# Copy from ts to te from data as text, returning an array with the text
|
819
|
-
# and the offsets used to copy it.
|
820
|
-
def text(data, ts, te, soff = 0)
|
821
|
-
[copy(data, ts-soff..te-1), ts-soff, te]
|
776
|
+
def copy(data, ts, te)
|
777
|
+
data[ts...te].pack('c*').force_encoding('utf-8')
|
822
778
|
end
|
823
779
|
|
824
780
|
# Appends one or more characters to the literal buffer, to be emitted later
|
825
|
-
# by a call to emit_literal.
|
781
|
+
# by a call to emit_literal.
|
826
782
|
def append_literal(data, ts, te)
|
827
|
-
self.
|
828
|
-
literal << text(data, ts, te)
|
783
|
+
(self.literal_run ||= []) << copy(data, ts, te)
|
829
784
|
end
|
830
785
|
|
831
|
-
# Emits the literal run collected by calls to the append_literal method
|
832
|
-
# using the total start (ts) and end (te) offsets of the run.
|
786
|
+
# Emits the literal run collected by calls to the append_literal method.
|
833
787
|
def emit_literal
|
834
|
-
|
835
|
-
|
836
|
-
|
837
|
-
text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
|
838
|
-
|
839
|
-
self.literal = nil
|
840
|
-
emit(:literal, :literal, text, ts, te)
|
788
|
+
text = literal_run.join
|
789
|
+
self.literal_run = nil
|
790
|
+
emit(:literal, :literal, text)
|
841
791
|
end
|
842
792
|
|
843
|
-
def emit_options(text
|
793
|
+
def emit_options(text)
|
844
794
|
token = nil
|
845
795
|
|
846
796
|
# Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
|
@@ -866,28 +816,13 @@ class Regexp::Scanner
|
|
866
816
|
token = :options_switch
|
867
817
|
end
|
868
818
|
|
869
|
-
emit(:group, token, text
|
819
|
+
emit(:group, token, text)
|
870
820
|
end
|
871
821
|
|
872
822
|
def emit_meta_control_sequence(data, ts, te, token)
|
873
823
|
if data.last < 0x00 || data.last > 0x7F
|
874
|
-
|
875
|
-
end
|
876
|
-
emit(:escape, token, *text(data, ts, te, 1))
|
877
|
-
end
|
878
|
-
|
879
|
-
# Centralizes and unifies the handling of validation related
|
880
|
-
# errors.
|
881
|
-
def validation_error(type, what, reason)
|
882
|
-
case type
|
883
|
-
when :group
|
884
|
-
error = InvalidGroupError.new(what, reason)
|
885
|
-
when :backref
|
886
|
-
error = InvalidBackrefError.new(what, reason)
|
887
|
-
when :sequence
|
888
|
-
error = InvalidSequenceError.new(what, reason)
|
824
|
+
raise ValidationError.for(:sequence, 'escape', token.to_s)
|
889
825
|
end
|
890
|
-
|
891
|
-
raise error # unless @@config.validation_ignore
|
826
|
+
emit(:escape, token, copy(data, ts-1, te))
|
892
827
|
end
|
893
828
|
end # module Regexp::Scanner
|