regexp_parser 1.7.0 → 2.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +364 -22
- data/Gemfile +8 -2
- data/LICENSE +1 -1
- data/README.md +124 -88
- data/Rakefile +6 -70
- data/lib/regexp_parser/error.rb +4 -0
- data/lib/regexp_parser/expression/base.rb +76 -0
- data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
- data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
- data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +22 -2
- data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +4 -8
- data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +3 -4
- data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
- data/lib/regexp_parser/expression/classes/conditional.rb +11 -5
- data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
- data/lib/regexp_parser/expression/classes/free_space.rb +5 -5
- data/lib/regexp_parser/expression/classes/group.rb +28 -15
- data/lib/regexp_parser/expression/classes/keep.rb +2 -0
- data/lib/regexp_parser/expression/classes/literal.rb +1 -5
- data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
- data/lib/regexp_parser/expression/classes/root.rb +4 -19
- data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +5 -3
- data/lib/regexp_parser/expression/methods/construct.rb +41 -0
- data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +11 -7
- data/lib/regexp_parser/expression/methods/parts.rb +23 -0
- data/lib/regexp_parser/expression/methods/printing.rb +26 -0
- data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
- data/lib/regexp_parser/expression/methods/tests.rb +47 -1
- data/lib/regexp_parser/expression/methods/traverse.rb +34 -18
- data/lib/regexp_parser/expression/quantifier.rb +57 -17
- data/lib/regexp_parser/expression/sequence.rb +11 -47
- data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
- data/lib/regexp_parser/expression/shared.rb +111 -0
- data/lib/regexp_parser/expression/subexpression.rb +27 -19
- data/lib/regexp_parser/expression.rb +14 -141
- data/lib/regexp_parser/lexer.rb +83 -41
- data/lib/regexp_parser/parser.rb +371 -429
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
- data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
- data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
- data/lib/regexp_parser/scanner/properties/long.csv +633 -0
- data/lib/regexp_parser/scanner/properties/short.csv +248 -0
- data/lib/regexp_parser/scanner/property.rl +4 -4
- data/lib/regexp_parser/scanner/scanner.rl +295 -368
- data/lib/regexp_parser/scanner.rb +1405 -1674
- data/lib/regexp_parser/syntax/any.rb +2 -7
- data/lib/regexp_parser/syntax/base.rb +92 -67
- data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
- data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
- data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
- data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
- data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
- data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
- data/lib/regexp_parser/syntax/token/escape.rb +33 -0
- data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
- data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
- data/lib/regexp_parser/syntax/token/meta.rb +20 -0
- data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
- data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
- data/lib/regexp_parser/syntax/token/unicode_property.rb +733 -0
- data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
- data/lib/regexp_parser/syntax/token.rb +45 -0
- data/lib/regexp_parser/syntax/version_lookup.rb +19 -36
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
- data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
- data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
- data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
- data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
- data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
- data/lib/regexp_parser/syntax/versions.rb +3 -1
- data/lib/regexp_parser/syntax.rb +8 -6
- data/lib/regexp_parser/token.rb +9 -20
- data/lib/regexp_parser/version.rb +1 -1
- data/lib/regexp_parser.rb +0 -2
- data/regexp_parser.gemspec +20 -22
- metadata +49 -166
- data/lib/regexp_parser/scanner/properties/long.yml +0 -594
- data/lib/regexp_parser/scanner/properties/short.yml +0 -237
- data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
- data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
- data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
- data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
- data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
- data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
- data/lib/regexp_parser/syntax/tokens.rb +0 -45
- data/spec/expression/base_spec.rb +0 -94
- data/spec/expression/clone_spec.rb +0 -120
- data/spec/expression/conditional_spec.rb +0 -89
- data/spec/expression/free_space_spec.rb +0 -27
- data/spec/expression/methods/match_length_spec.rb +0 -161
- data/spec/expression/methods/match_spec.rb +0 -25
- data/spec/expression/methods/strfregexp_spec.rb +0 -224
- data/spec/expression/methods/tests_spec.rb +0 -99
- data/spec/expression/methods/traverse_spec.rb +0 -161
- data/spec/expression/options_spec.rb +0 -128
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
- data/spec/expression/subexpression_spec.rb +0 -50
- data/spec/expression/to_h_spec.rb +0 -26
- data/spec/expression/to_s_spec.rb +0 -100
- data/spec/lexer/all_spec.rb +0 -22
- data/spec/lexer/conditionals_spec.rb +0 -53
- data/spec/lexer/escapes_spec.rb +0 -14
- data/spec/lexer/keep_spec.rb +0 -10
- data/spec/lexer/literals_spec.rb +0 -89
- data/spec/lexer/nesting_spec.rb +0 -99
- data/spec/lexer/refcalls_spec.rb +0 -55
- data/spec/parser/all_spec.rb +0 -43
- data/spec/parser/alternation_spec.rb +0 -88
- data/spec/parser/anchors_spec.rb +0 -17
- data/spec/parser/conditionals_spec.rb +0 -179
- data/spec/parser/errors_spec.rb +0 -30
- data/spec/parser/escapes_spec.rb +0 -121
- data/spec/parser/free_space_spec.rb +0 -130
- data/spec/parser/groups_spec.rb +0 -108
- data/spec/parser/keep_spec.rb +0 -6
- data/spec/parser/posix_classes_spec.rb +0 -8
- data/spec/parser/properties_spec.rb +0 -115
- data/spec/parser/quantifiers_spec.rb +0 -51
- data/spec/parser/refcalls_spec.rb +0 -112
- data/spec/parser/set/intersections_spec.rb +0 -127
- data/spec/parser/set/ranges_spec.rb +0 -111
- data/spec/parser/sets_spec.rb +0 -178
- data/spec/parser/types_spec.rb +0 -18
- data/spec/scanner/all_spec.rb +0 -18
- data/spec/scanner/anchors_spec.rb +0 -21
- data/spec/scanner/conditionals_spec.rb +0 -128
- data/spec/scanner/errors_spec.rb +0 -68
- data/spec/scanner/escapes_spec.rb +0 -53
- data/spec/scanner/free_space_spec.rb +0 -133
- data/spec/scanner/groups_spec.rb +0 -52
- data/spec/scanner/keep_spec.rb +0 -10
- data/spec/scanner/literals_spec.rb +0 -49
- data/spec/scanner/meta_spec.rb +0 -18
- data/spec/scanner/properties_spec.rb +0 -64
- data/spec/scanner/quantifiers_spec.rb +0 -20
- data/spec/scanner/refcalls_spec.rb +0 -36
- data/spec/scanner/sets_spec.rb +0 -102
- data/spec/scanner/types_spec.rb +0 -14
- data/spec/spec_helper.rb +0 -15
- data/spec/support/runner.rb +0 -42
- data/spec/support/shared_examples.rb +0 -77
- data/spec/support/warning_extractor.rb +0 -60
- data/spec/syntax/syntax_spec.rb +0 -48
- data/spec/syntax/syntax_token_map_spec.rb +0 -23
- data/spec/syntax/versions/1.8.6_spec.rb +0 -17
- data/spec/syntax/versions/1.9.1_spec.rb +0 -10
- data/spec/syntax/versions/1.9.3_spec.rb +0 -9
- data/spec/syntax/versions/2.0.0_spec.rb +0 -13
- data/spec/syntax/versions/2.2.0_spec.rb +0 -9
- data/spec/syntax/versions/aliases_spec.rb +0 -37
- data/spec/token/token_spec.rb +0 -85
- /data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
@@ -3,6 +3,11 @@
|
|
3
3
|
include re_char_type "char_type.rl";
|
4
4
|
include re_property "property.rl";
|
5
5
|
|
6
|
+
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
7
|
+
utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
|
8
|
+
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
9
|
+
utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
|
10
|
+
|
6
11
|
dot = '.';
|
7
12
|
backslash = '\\';
|
8
13
|
alternation = '|';
|
@@ -15,26 +20,15 @@
|
|
15
20
|
|
16
21
|
group_open = '(';
|
17
22
|
group_close = ')';
|
18
|
-
|
23
|
+
parentheses = group_open | group_close;
|
19
24
|
|
20
25
|
set_open = '[';
|
21
26
|
set_close = ']';
|
22
27
|
brackets = set_open | set_close;
|
23
28
|
|
24
|
-
comment = ('#' . [^\n]* . '\n');
|
25
|
-
|
26
|
-
class_name_posix = 'alnum' | 'alpha' | 'blank' |
|
27
|
-
'cntrl' | 'digit' | 'graph' |
|
28
|
-
'lower' | 'print' | 'punct' |
|
29
|
-
'space' | 'upper' | 'xdigit' |
|
30
|
-
'word' | 'ascii';
|
31
|
-
|
32
|
-
class_posix = ('[:' . '^'? . class_name_posix . ':]');
|
33
|
-
|
29
|
+
comment = ('#' . [^\n]* . '\n'?);
|
34
30
|
|
35
|
-
|
36
|
-
collating_sequence = '[.' . (alpha | [\-])+ . '.]';
|
37
|
-
character_equivalent = '[=' . alpha . '=]';
|
31
|
+
class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
|
38
32
|
|
39
33
|
line_anchor = beginning_of_line | end_of_line;
|
40
34
|
anchor_char = [AbBzZG];
|
@@ -53,21 +47,20 @@
|
|
53
47
|
|
54
48
|
meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
|
55
49
|
|
50
|
+
sequence_char = [CMcux];
|
51
|
+
|
56
52
|
zero_or_one = '?' | '??' | '?+';
|
57
53
|
zero_or_more = '*' | '*?' | '*+';
|
58
54
|
one_or_more = '+' | '+?' | '++';
|
59
55
|
|
60
56
|
quantifier_greedy = '?' | '*' | '+';
|
61
|
-
quantifier_reluctant = '??' | '*?' | '+?';
|
62
|
-
quantifier_possessive = '?+' | '*+' | '++';
|
63
|
-
quantifier_mode = '?' | '+';
|
64
|
-
|
65
|
-
quantifier_interval = range_open . (digit+)? . ','? . (digit+)? .
|
66
|
-
range_close . quantifier_mode?;
|
67
|
-
|
68
|
-
quantifiers = quantifier_greedy | quantifier_reluctant |
|
69
|
-
quantifier_possessive | quantifier_interval;
|
70
57
|
|
58
|
+
quantity_exact = (digit+);
|
59
|
+
quantity_minimum = (digit+) . ',';
|
60
|
+
quantity_maximum = ',' . (digit+);
|
61
|
+
quantity_range = (digit+) . ',' . (digit+);
|
62
|
+
quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
|
63
|
+
quantity_maximum | quantity_range ) . range_close;
|
71
64
|
|
72
65
|
conditional = '(?(';
|
73
66
|
|
@@ -85,22 +78,22 @@
|
|
85
78
|
# try to treat every other group head as options group, like Ruby
|
86
79
|
group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
|
87
80
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
group_number = '-'? . [1-9] . ([0-9]+)?;
|
81
|
+
group_name_id_ab = ([^!0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
|
82
|
+
group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
|
83
|
+
group_number = '-'? . [0-9]+;
|
92
84
|
group_level = [+\-] . [0-9]+;
|
93
85
|
|
94
|
-
group_name = ('<' .
|
86
|
+
group_name = ('<' . group_name_id_ab? . '>') |
|
87
|
+
("'" . group_name_id_sq? . "'");
|
95
88
|
group_lookup = group_name | group_number;
|
96
89
|
|
97
90
|
group_named = ('?' . group_name );
|
98
91
|
|
99
|
-
|
100
|
-
|
92
|
+
group_ref_body = (('<' . (group_name_id_ab? | group_number) . group_level? '>') |
|
93
|
+
("'" . (group_name_id_sq? | group_number) . group_level? "'"));
|
101
94
|
|
102
|
-
|
103
|
-
|
95
|
+
group_ref = 'k' . group_ref_body;
|
96
|
+
group_call = 'g' . group_ref_body;
|
104
97
|
|
105
98
|
group_type = group_atomic | group_passive | group_absence | group_named;
|
106
99
|
|
@@ -111,32 +104,33 @@
|
|
111
104
|
|
112
105
|
# characters that 'break' a literal
|
113
106
|
meta_char = dot | backslash | alternation |
|
114
|
-
curlies |
|
107
|
+
curlies | parentheses | brackets |
|
115
108
|
line_anchor | quantifier_greedy;
|
116
109
|
|
117
|
-
|
118
|
-
ascii_nonprint = (0x01..0x1f | 0x7f);
|
110
|
+
literal_delimiters = ']' | '}';
|
119
111
|
|
120
|
-
|
121
|
-
|
122
|
-
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
112
|
+
ascii_print = ((0x20..0x7e) - meta_char - '#');
|
113
|
+
ascii_nonprint = (0x01..0x1f | 0x7f);
|
123
114
|
|
124
115
|
non_literal_escape = char_type_char | anchor_char | escaped_ascii |
|
125
|
-
|
116
|
+
keep_mark | sequence_char;
|
117
|
+
|
118
|
+
# escapes that also work within a character set
|
119
|
+
set_escape = backslash | brackets | escaped_ascii |
|
120
|
+
octal_sequence | property_char |
|
121
|
+
sequence_char | single_codepoint_char_type;
|
126
122
|
|
127
|
-
non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
|
128
|
-
multi_codepoint_char_type | [0-9cCM];
|
129
123
|
|
130
124
|
# EOF error, used where it can be detected
|
131
125
|
action premature_end_error {
|
132
|
-
text =
|
133
|
-
raise PrematureEndError.new(
|
126
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
127
|
+
raise PrematureEndError.new(text)
|
134
128
|
}
|
135
129
|
|
136
130
|
# Invalid sequence error, used from sequences, like escapes and sets
|
137
131
|
action invalid_sequence_error {
|
138
|
-
text =
|
139
|
-
|
132
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
133
|
+
raise ValidationError.for(:sequence, 'sequence', text)
|
140
134
|
}
|
141
135
|
|
142
136
|
# group (nesting) and set open/close actions
|
@@ -150,7 +144,7 @@
|
|
150
144
|
# --------------------------------------------------------------------------
|
151
145
|
character_set := |*
|
152
146
|
set_close > (set_meta, 2) @set_closed {
|
153
|
-
emit(:set, :close,
|
147
|
+
emit(:set, :close, copy(data, ts, te))
|
154
148
|
if in_set?
|
155
149
|
fret;
|
156
150
|
else
|
@@ -159,8 +153,8 @@
|
|
159
153
|
};
|
160
154
|
|
161
155
|
'-]' @set_closed { # special case, emits two tokens
|
162
|
-
emit(:literal, :literal,
|
163
|
-
emit(:set, :close,
|
156
|
+
emit(:literal, :literal, '-')
|
157
|
+
emit(:set, :close, ']')
|
164
158
|
if in_set?
|
165
159
|
fret;
|
166
160
|
else
|
@@ -169,33 +163,32 @@
|
|
169
163
|
};
|
170
164
|
|
171
165
|
'-&&' { # special case, emits two tokens
|
172
|
-
emit(:literal, :literal, '-'
|
173
|
-
emit(:set, :intersection, '&&'
|
166
|
+
emit(:literal, :literal, '-')
|
167
|
+
emit(:set, :intersection, '&&')
|
174
168
|
};
|
175
169
|
|
176
170
|
'^' {
|
177
|
-
|
178
|
-
|
179
|
-
emit(:set, :negate, text, ts, te)
|
171
|
+
if prev_token[1] == :open
|
172
|
+
emit(:set, :negate, '^')
|
180
173
|
else
|
181
|
-
emit(:literal, :literal,
|
174
|
+
emit(:literal, :literal, '^')
|
182
175
|
end
|
183
176
|
};
|
184
177
|
|
185
178
|
'-' {
|
186
|
-
|
187
|
-
#
|
188
|
-
if
|
189
|
-
emit(:literal, :literal,
|
179
|
+
# ranges cant start with the opening bracket, a subset, or
|
180
|
+
# intersection/negation/range operators
|
181
|
+
if prev_token[0] == :set
|
182
|
+
emit(:literal, :literal, '-')
|
190
183
|
else
|
191
|
-
emit(:set, :range,
|
184
|
+
emit(:set, :range, '-')
|
192
185
|
end
|
193
186
|
};
|
194
187
|
|
195
188
|
# Unlike ranges, intersections can start or end at set boundaries, whereupon
|
196
189
|
# they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
|
197
190
|
'&&' {
|
198
|
-
emit(:set, :intersection,
|
191
|
+
emit(:set, :intersection, '&&')
|
199
192
|
};
|
200
193
|
|
201
194
|
backslash {
|
@@ -203,59 +196,60 @@
|
|
203
196
|
};
|
204
197
|
|
205
198
|
set_open >(open_bracket, 1) >set_opened {
|
206
|
-
emit(:set, :open,
|
199
|
+
emit(:set, :open, '[')
|
207
200
|
fcall character_set;
|
208
201
|
};
|
209
202
|
|
210
|
-
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error)
|
211
|
-
text =
|
203
|
+
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
204
|
+
text = copy(data, ts, te)
|
212
205
|
|
213
206
|
type = :posixclass
|
214
207
|
class_name = text[2..-3]
|
215
|
-
if class_name[0]
|
208
|
+
if class_name[0] == '^'
|
216
209
|
class_name = class_name[1..-1]
|
217
210
|
type = :nonposixclass
|
218
211
|
end
|
219
212
|
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
224
|
-
emit(:set, :collation, *text(data, ts, te))
|
225
|
-
};
|
213
|
+
unless self.class.posix_classes.include?(class_name)
|
214
|
+
raise ValidationError.for(:posix_class, text)
|
215
|
+
end
|
226
216
|
|
227
|
-
|
228
|
-
emit(:set, :equivalent, *text(data, ts, te))
|
217
|
+
emit(type, class_name.to_sym, text)
|
229
218
|
};
|
230
219
|
|
231
220
|
meta_char > (set_meta, 1) {
|
232
|
-
emit(:literal, :literal,
|
221
|
+
emit(:literal, :literal, copy(data, ts, te))
|
233
222
|
};
|
234
223
|
|
235
|
-
any
|
236
|
-
|
237
|
-
|
238
|
-
utf8_3_byte |
|
239
|
-
utf8_4_byte {
|
240
|
-
char, *rest = *text(data, ts, te)
|
241
|
-
char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
|
242
|
-
emit(:literal, :literal, char, *rest)
|
224
|
+
any | ascii_nonprint | utf8_multibyte {
|
225
|
+
text = copy(data, ts, te)
|
226
|
+
emit(:literal, :literal, text)
|
243
227
|
};
|
244
228
|
*|;
|
245
229
|
|
246
230
|
# set escapes scanner
|
247
231
|
# --------------------------------------------------------------------------
|
248
232
|
set_escape_sequence := |*
|
249
|
-
|
250
|
-
|
233
|
+
# Special case: in sets, octal sequences have higher priority than backrefs
|
234
|
+
octal_sequence {
|
235
|
+
emit(:escape, :octal, copy(data, ts-1, te))
|
251
236
|
fret;
|
252
237
|
};
|
253
238
|
|
254
|
-
|
239
|
+
# Scan all other escapes that work in sets with the generic escape scanner
|
240
|
+
set_escape > (escaped_set_alpha, 2) {
|
255
241
|
fhold;
|
256
242
|
fnext character_set;
|
257
243
|
fcall escape_sequence;
|
258
244
|
};
|
245
|
+
|
246
|
+
# Treat all remaining escapes - those not supported in sets - as literal.
|
247
|
+
# (This currently includes \^, \-, \&, \:, although these could potentially
|
248
|
+
# be meta chars when not escaped, depending on their position in the set.)
|
249
|
+
any > (escaped_set_alpha, 1) {
|
250
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
251
|
+
fret;
|
252
|
+
};
|
259
253
|
*|;
|
260
254
|
|
261
255
|
|
@@ -263,33 +257,33 @@
|
|
263
257
|
# --------------------------------------------------------------------------
|
264
258
|
escape_sequence := |*
|
265
259
|
[1-9] {
|
266
|
-
text =
|
267
|
-
emit(:backref, :number, text
|
260
|
+
text = copy(data, ts-1, te)
|
261
|
+
emit(:backref, :number, text)
|
268
262
|
fret;
|
269
263
|
};
|
270
264
|
|
271
265
|
octal_sequence {
|
272
|
-
emit(:escape, :octal,
|
266
|
+
emit(:escape, :octal, copy(data, ts-1, te))
|
273
267
|
fret;
|
274
268
|
};
|
275
269
|
|
276
270
|
meta_char {
|
277
|
-
case text =
|
278
|
-
when '\.'; emit(:escape, :dot, text
|
279
|
-
when '\|'; emit(:escape, :alternation, text
|
280
|
-
when '\^'; emit(:escape, :bol, text
|
281
|
-
when '\$'; emit(:escape, :eol, text
|
282
|
-
when '\?'; emit(:escape, :zero_or_one, text
|
283
|
-
when '\*'; emit(:escape, :zero_or_more, text
|
284
|
-
when '\+'; emit(:escape, :one_or_more, text
|
285
|
-
when '\('; emit(:escape, :group_open, text
|
286
|
-
when '\)'; emit(:escape, :group_close, text
|
287
|
-
when '\{'; emit(:escape, :interval_open, text
|
288
|
-
when '\}'; emit(:escape, :interval_close, text
|
289
|
-
when '\['; emit(:escape, :set_open, text
|
290
|
-
when '\]'; emit(:escape, :set_close, text
|
271
|
+
case text = copy(data, ts-1, te)
|
272
|
+
when '\.'; emit(:escape, :dot, text)
|
273
|
+
when '\|'; emit(:escape, :alternation, text)
|
274
|
+
when '\^'; emit(:escape, :bol, text)
|
275
|
+
when '\$'; emit(:escape, :eol, text)
|
276
|
+
when '\?'; emit(:escape, :zero_or_one, text)
|
277
|
+
when '\*'; emit(:escape, :zero_or_more, text)
|
278
|
+
when '\+'; emit(:escape, :one_or_more, text)
|
279
|
+
when '\('; emit(:escape, :group_open, text)
|
280
|
+
when '\)'; emit(:escape, :group_close, text)
|
281
|
+
when '\{'; emit(:escape, :interval_open, text)
|
282
|
+
when '\}'; emit(:escape, :interval_close, text)
|
283
|
+
when '\['; emit(:escape, :set_open, text)
|
284
|
+
when '\]'; emit(:escape, :set_close, text)
|
291
285
|
when "\\\\";
|
292
|
-
emit(:escape, :backslash, text
|
286
|
+
emit(:escape, :backslash, text)
|
293
287
|
end
|
294
288
|
fret;
|
295
289
|
};
|
@@ -297,31 +291,31 @@
|
|
297
291
|
escaped_ascii > (escaped_alpha, 7) {
|
298
292
|
# \b is emitted as backspace only when inside a character set, otherwise
|
299
293
|
# it is a word boundary anchor. A syntax might "normalize" it if needed.
|
300
|
-
case text =
|
301
|
-
when '\a'; emit(:escape, :bell, text
|
302
|
-
when '\b'; emit(:escape, :backspace, text
|
303
|
-
when '\e'; emit(:escape, :escape, text
|
304
|
-
when '\f'; emit(:escape, :form_feed, text
|
305
|
-
when '\n'; emit(:escape, :newline, text
|
306
|
-
when '\r'; emit(:escape, :carriage, text
|
307
|
-
when '\t'; emit(:escape, :tab, text
|
308
|
-
when '\v'; emit(:escape, :vertical_tab, text
|
294
|
+
case text = copy(data, ts-1, te)
|
295
|
+
when '\a'; emit(:escape, :bell, text)
|
296
|
+
when '\b'; emit(:escape, :backspace, text)
|
297
|
+
when '\e'; emit(:escape, :escape, text)
|
298
|
+
when '\f'; emit(:escape, :form_feed, text)
|
299
|
+
when '\n'; emit(:escape, :newline, text)
|
300
|
+
when '\r'; emit(:escape, :carriage, text)
|
301
|
+
when '\t'; emit(:escape, :tab, text)
|
302
|
+
when '\v'; emit(:escape, :vertical_tab, text)
|
309
303
|
end
|
310
304
|
fret;
|
311
305
|
};
|
312
306
|
|
313
307
|
codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
|
314
|
-
text =
|
315
|
-
if text[2]
|
316
|
-
emit(:escape, :codepoint_list, text
|
308
|
+
text = copy(data, ts-1, te)
|
309
|
+
if text[2] == '{'
|
310
|
+
emit(:escape, :codepoint_list, text)
|
317
311
|
else
|
318
|
-
emit(:escape, :codepoint, text
|
312
|
+
emit(:escape, :codepoint, text)
|
319
313
|
end
|
320
314
|
fret;
|
321
315
|
};
|
322
316
|
|
323
|
-
hex_sequence > (escaped_alpha, 5)
|
324
|
-
emit(:escape, :hex,
|
317
|
+
hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
|
318
|
+
emit(:escape, :hex, copy(data, ts-1, te))
|
325
319
|
fret;
|
326
320
|
};
|
327
321
|
|
@@ -351,8 +345,8 @@
|
|
351
345
|
fcall unicode_property;
|
352
346
|
};
|
353
347
|
|
354
|
-
(any -- non_literal_escape) > (escaped_alpha, 1)
|
355
|
-
emit(:escape, :literal,
|
348
|
+
(any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
|
349
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
356
350
|
fret;
|
357
351
|
};
|
358
352
|
*|;
|
@@ -362,9 +356,9 @@
|
|
362
356
|
# --------------------------------------------------------------------------
|
363
357
|
conditional_expression := |*
|
364
358
|
group_lookup . ')' {
|
365
|
-
text =
|
366
|
-
emit(:conditional, :condition, text
|
367
|
-
emit(:conditional, :condition_close, ')'
|
359
|
+
text = copy(data, ts, te-1)
|
360
|
+
emit(:conditional, :condition, text)
|
361
|
+
emit(:conditional, :condition_close, ')')
|
368
362
|
};
|
369
363
|
|
370
364
|
any {
|
@@ -381,46 +375,50 @@
|
|
381
375
|
# Meta characters
|
382
376
|
# ------------------------------------------------------------------------
|
383
377
|
dot {
|
384
|
-
emit(:meta, :dot,
|
378
|
+
emit(:meta, :dot, copy(data, ts, te))
|
385
379
|
};
|
386
380
|
|
387
381
|
alternation {
|
388
382
|
if conditional_stack.last == group_depth
|
389
|
-
emit(:conditional, :separator,
|
383
|
+
emit(:conditional, :separator, copy(data, ts, te))
|
390
384
|
else
|
391
|
-
emit(:meta, :alternation,
|
385
|
+
emit(:meta, :alternation, copy(data, ts, te))
|
392
386
|
end
|
393
387
|
};
|
394
388
|
|
395
389
|
# Anchors
|
396
390
|
# ------------------------------------------------------------------------
|
397
391
|
beginning_of_line {
|
398
|
-
emit(:anchor, :bol,
|
392
|
+
emit(:anchor, :bol, copy(data, ts, te))
|
399
393
|
};
|
400
394
|
|
401
395
|
end_of_line {
|
402
|
-
emit(:anchor, :eol,
|
396
|
+
emit(:anchor, :eol, copy(data, ts, te))
|
403
397
|
};
|
404
398
|
|
405
399
|
backslash . keep_mark > (backslashed, 4) {
|
406
|
-
emit(:keep, :mark,
|
400
|
+
emit(:keep, :mark, copy(data, ts, te))
|
407
401
|
};
|
408
402
|
|
409
403
|
backslash . anchor_char > (backslashed, 3) {
|
410
|
-
case text =
|
411
|
-
when '
|
412
|
-
when '
|
413
|
-
when '
|
414
|
-
when '
|
415
|
-
when '
|
416
|
-
when '
|
404
|
+
case text = copy(data, ts, te)
|
405
|
+
when '\A'; emit(:anchor, :bos, text)
|
406
|
+
when '\z'; emit(:anchor, :eos, text)
|
407
|
+
when '\Z'; emit(:anchor, :eos_ob_eol, text)
|
408
|
+
when '\b'; emit(:anchor, :word_boundary, text)
|
409
|
+
when '\B'; emit(:anchor, :nonword_boundary, text)
|
410
|
+
when '\G'; emit(:anchor, :match_start, text)
|
417
411
|
end
|
418
412
|
};
|
419
413
|
|
414
|
+
literal_delimiters {
|
415
|
+
append_literal(data, ts, te)
|
416
|
+
};
|
417
|
+
|
420
418
|
# Character sets
|
421
419
|
# ------------------------------------------------------------------------
|
422
420
|
set_open >set_opened {
|
423
|
-
emit(:set, :open,
|
421
|
+
emit(:set, :open, copy(data, ts, te))
|
424
422
|
fcall character_set;
|
425
423
|
};
|
426
424
|
|
@@ -429,23 +427,22 @@
|
|
429
427
|
# (?(condition)Y|N) conditional expression
|
430
428
|
# ------------------------------------------------------------------------
|
431
429
|
conditional {
|
432
|
-
text =
|
430
|
+
text = copy(data, ts, te)
|
433
431
|
|
434
432
|
conditional_stack << group_depth
|
435
433
|
|
436
|
-
emit(:conditional, :open, text[0..-2]
|
437
|
-
emit(:conditional, :condition_open, '('
|
434
|
+
emit(:conditional, :open, text[0..-2])
|
435
|
+
emit(:conditional, :condition_open, '(')
|
438
436
|
fcall conditional_expression;
|
439
437
|
};
|
440
438
|
|
441
439
|
|
442
440
|
# (?#...) comments: parsed as a single expression, without introducing a
|
443
441
|
# new nesting level. Comments may not include parentheses, escaped or not.
|
444
|
-
# special case for close
|
445
|
-
# correct closing count.
|
442
|
+
# special case for close to get the correct closing count.
|
446
443
|
# ------------------------------------------------------------------------
|
447
|
-
group_open . group_comment
|
448
|
-
emit(:group, :comment,
|
444
|
+
(group_open . group_comment) @group_closed {
|
445
|
+
emit(:group, :comment, copy(data, ts, te))
|
449
446
|
};
|
450
447
|
|
451
448
|
# Expression options:
|
@@ -459,12 +456,12 @@
|
|
459
456
|
#
|
460
457
|
# (?imxdau-imx:subexp) option on/off for subexp
|
461
458
|
# ------------------------------------------------------------------------
|
462
|
-
group_open . group_options >group_opened {
|
463
|
-
text =
|
459
|
+
(group_open . group_options) >group_opened {
|
460
|
+
text = copy(data, ts, te)
|
464
461
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
465
|
-
raise
|
462
|
+
raise ValidationError.for(:group_option, $1 || "-#{$2}", text)
|
466
463
|
end
|
467
|
-
emit_options(text
|
464
|
+
emit_options(text)
|
468
465
|
};
|
469
466
|
|
470
467
|
# Assertions
|
@@ -473,12 +470,12 @@
|
|
473
470
|
# (?<=subexp) look-behind
|
474
471
|
# (?<!subexp) negative look-behind
|
475
472
|
# ------------------------------------------------------------------------
|
476
|
-
group_open . assertion_type >group_opened {
|
477
|
-
case text =
|
478
|
-
when '(?='; emit(:assertion, :lookahead, text
|
479
|
-
when '(?!'; emit(:assertion, :nlookahead, text
|
480
|
-
when '(?<='; emit(:assertion, :lookbehind, text
|
481
|
-
when '(?<!'; emit(:assertion, :nlookbehind, text
|
473
|
+
(group_open . assertion_type) >group_opened {
|
474
|
+
case text = copy(data, ts, te)
|
475
|
+
when '(?='; emit(:assertion, :lookahead, text)
|
476
|
+
when '(?!'; emit(:assertion, :nlookahead, text)
|
477
|
+
when '(?<='; emit(:assertion, :lookbehind, text)
|
478
|
+
when '(?<!'; emit(:assertion, :nlookbehind, text)
|
482
479
|
end
|
483
480
|
};
|
484
481
|
|
@@ -490,106 +487,78 @@
|
|
490
487
|
# (?'name'subexp) named group (single quoted version)
|
491
488
|
# (subexp) captured group
|
492
489
|
# ------------------------------------------------------------------------
|
493
|
-
group_open . group_type >group_opened {
|
494
|
-
case text =
|
495
|
-
when '(?:'; emit(:group, :passive, text
|
496
|
-
when '(?>'; emit(:group, :atomic, text
|
497
|
-
when '(?~'; emit(:group, :absence, text
|
490
|
+
(group_open . group_type) >group_opened {
|
491
|
+
case text = copy(data, ts, te)
|
492
|
+
when '(?:'; emit(:group, :passive, text)
|
493
|
+
when '(?>'; emit(:group, :atomic, text)
|
494
|
+
when '(?~'; emit(:group, :absence, text)
|
498
495
|
|
499
496
|
when /^\(\?(?:<>|'')/
|
500
|
-
|
497
|
+
raise ValidationError.for(:group, 'named group', 'name is empty')
|
501
498
|
|
502
|
-
when /^\(
|
503
|
-
emit(:group, :named_ab, text
|
499
|
+
when /^\(\?<[^>]+>/
|
500
|
+
emit(:group, :named_ab, text)
|
504
501
|
|
505
|
-
when /^\(\?'
|
506
|
-
emit(:group, :named_sq, text
|
502
|
+
when /^\(\?'[^']+'/
|
503
|
+
emit(:group, :named_sq, text)
|
507
504
|
|
508
505
|
end
|
509
506
|
};
|
510
507
|
|
511
508
|
group_open @group_opened {
|
512
|
-
text =
|
513
|
-
emit(:group, :capture, text
|
509
|
+
text = copy(data, ts, te)
|
510
|
+
emit(:group, :capture, text)
|
514
511
|
};
|
515
512
|
|
516
513
|
group_close @group_closed {
|
517
514
|
if conditional_stack.last == group_depth + 1
|
518
515
|
conditional_stack.pop
|
519
|
-
emit(:conditional, :close,
|
520
|
-
|
516
|
+
emit(:conditional, :close, ')')
|
517
|
+
elsif group_depth >= 0
|
521
518
|
if spacing_stack.length > 1 &&
|
522
519
|
spacing_stack.last[:depth] == group_depth + 1
|
523
520
|
spacing_stack.pop
|
524
521
|
self.free_spacing = spacing_stack.last[:free_spacing]
|
525
522
|
end
|
526
523
|
|
527
|
-
emit(:group, :close,
|
524
|
+
emit(:group, :close, ')')
|
525
|
+
else
|
526
|
+
raise ValidationError.for(:group, 'group', 'unmatched close parenthesis')
|
528
527
|
end
|
529
528
|
};
|
530
529
|
|
531
530
|
|
532
531
|
# Group backreference, named and numbered
|
533
532
|
# ------------------------------------------------------------------------
|
534
|
-
backslash . (
|
535
|
-
case text =
|
536
|
-
when /^\\([
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
emit(:backref, :name_call_sq, text, ts, te)
|
551
|
-
end
|
552
|
-
|
553
|
-
when /^\\([gk])<\d+>/ # angle-brackets
|
554
|
-
if $1 == 'k'
|
555
|
-
emit(:backref, :number_ref_ab, text, ts, te)
|
556
|
-
else
|
557
|
-
emit(:backref, :number_call_ab, text, ts, te)
|
558
|
-
end
|
559
|
-
|
560
|
-
when /^\\([gk])'\d+'/ # single quotes
|
561
|
-
if $1 == 'k'
|
562
|
-
emit(:backref, :number_ref_sq, text, ts, te)
|
563
|
-
else
|
564
|
-
emit(:backref, :number_call_sq, text, ts, te)
|
565
|
-
end
|
566
|
-
|
567
|
-
when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
|
568
|
-
if $1 == 'k'
|
569
|
-
emit(:backref, :number_rel_ref_ab, text, ts, te)
|
570
|
-
else
|
571
|
-
emit(:backref, :number_rel_call_ab, text, ts, te)
|
572
|
-
end
|
573
|
-
|
574
|
-
when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
|
575
|
-
if $1 == 'k'
|
576
|
-
emit(:backref, :number_rel_ref_sq, text, ts, te)
|
577
|
-
else
|
578
|
-
emit(:backref, :number_rel_call_sq, text, ts, te)
|
579
|
-
end
|
580
|
-
|
581
|
-
when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
|
582
|
-
emit(:backref, :name_recursion_ref_ab, text, ts, te)
|
583
|
-
|
584
|
-
when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
|
585
|
-
emit(:backref, :name_recursion_ref_sq, text, ts, te)
|
586
|
-
|
587
|
-
when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
|
588
|
-
emit(:backref, :number_recursion_ref_ab, text, ts, te)
|
589
|
-
|
590
|
-
when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
|
591
|
-
emit(:backref, :number_recursion_ref_sq, text, ts, te)
|
533
|
+
backslash . (group_ref) > (backslashed, 4) {
|
534
|
+
case text = copy(data, ts, te)
|
535
|
+
when /^\\k(.)[^0-9\-][^+\-]*['>]$/
|
536
|
+
emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
|
537
|
+
when /^\\k(.)[1-9]\d*['>]$/
|
538
|
+
emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
|
539
|
+
when /^\\k(.)-[1-9]\d*['>]$/
|
540
|
+
emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
|
541
|
+
when /^\\k(.)[^0-9\-].*[+\-]\d+['>]$/
|
542
|
+
emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
|
543
|
+
when /^\\k(.)-?[1-9]\d*[+\-]\d+['>]$/
|
544
|
+
emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
|
545
|
+
else
|
546
|
+
raise ValidationError.for(:backref, 'backreference', 'invalid ref ID')
|
547
|
+
end
|
548
|
+
};
|
592
549
|
|
550
|
+
# Group call, named and numbered
|
551
|
+
# ------------------------------------------------------------------------
|
552
|
+
backslash . (group_call) > (backslashed, 4) {
|
553
|
+
case text = copy(data, ts, te)
|
554
|
+
when /^\\g(.)[^0-9+\-].*['>]$/
|
555
|
+
emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
|
556
|
+
when /^\\g(.)\d+['>]$/
|
557
|
+
emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
|
558
|
+
when /^\\g(.)[+-]\d+/
|
559
|
+
emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
|
560
|
+
else
|
561
|
+
raise ValidationError.for(:backref, 'subexpression call', 'invalid ref ID')
|
593
562
|
end
|
594
563
|
};
|
595
564
|
|
@@ -597,31 +566,36 @@
|
|
597
566
|
# Quantifiers
|
598
567
|
# ------------------------------------------------------------------------
|
599
568
|
zero_or_one {
|
600
|
-
case text =
|
601
|
-
when '?' ; emit(:quantifier, :zero_or_one, text
|
602
|
-
when '??'; emit(:quantifier, :zero_or_one_reluctant, text
|
603
|
-
when '?+'; emit(:quantifier, :zero_or_one_possessive, text
|
569
|
+
case text = copy(data, ts, te)
|
570
|
+
when '?' ; emit(:quantifier, :zero_or_one, text)
|
571
|
+
when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
|
572
|
+
when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
|
604
573
|
end
|
605
574
|
};
|
606
575
|
|
607
576
|
zero_or_more {
|
608
|
-
case text =
|
609
|
-
when '*' ; emit(:quantifier, :zero_or_more, text
|
610
|
-
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text
|
611
|
-
when '*+'; emit(:quantifier, :zero_or_more_possessive, text
|
577
|
+
case text = copy(data, ts, te)
|
578
|
+
when '*' ; emit(:quantifier, :zero_or_more, text)
|
579
|
+
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
|
580
|
+
when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
|
612
581
|
end
|
613
582
|
};
|
614
583
|
|
615
584
|
one_or_more {
|
616
|
-
case text =
|
617
|
-
when '+' ; emit(:quantifier, :one_or_more, text
|
618
|
-
when '+?'; emit(:quantifier, :one_or_more_reluctant, text
|
619
|
-
when '++'; emit(:quantifier, :one_or_more_possessive, text
|
585
|
+
case text = copy(data, ts, te)
|
586
|
+
when '+' ; emit(:quantifier, :one_or_more, text)
|
587
|
+
when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
|
588
|
+
when '++'; emit(:quantifier, :one_or_more_possessive, text)
|
620
589
|
end
|
621
590
|
};
|
622
591
|
|
623
|
-
quantifier_interval
|
624
|
-
emit(:quantifier, :interval,
|
592
|
+
quantifier_interval {
|
593
|
+
emit(:quantifier, :interval, copy(data, ts, te))
|
594
|
+
};
|
595
|
+
|
596
|
+
# Catch unmatched curly braces as literals
|
597
|
+
range_open {
|
598
|
+
append_literal(data, ts, te)
|
625
599
|
};
|
626
600
|
|
627
601
|
# Escaped sequences
|
@@ -632,15 +606,17 @@
|
|
632
606
|
|
633
607
|
comment {
|
634
608
|
if free_spacing
|
635
|
-
emit(:free_space, :comment,
|
609
|
+
emit(:free_space, :comment, copy(data, ts, te))
|
636
610
|
else
|
637
|
-
|
611
|
+
# consume only the pound sign (#) and backtrack to do regular scanning
|
612
|
+
append_literal(data, ts, ts + 1)
|
613
|
+
fexec ts + 1;
|
638
614
|
end
|
639
615
|
};
|
640
616
|
|
641
617
|
space+ {
|
642
618
|
if free_spacing
|
643
|
-
emit(:free_space, :whitespace,
|
619
|
+
emit(:free_space, :whitespace, copy(data, ts, te))
|
644
620
|
else
|
645
621
|
append_literal(data, ts, te)
|
646
622
|
end
|
@@ -649,105 +625,47 @@
|
|
649
625
|
# Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
|
650
626
|
# except meta characters.
|
651
627
|
# ------------------------------------------------------------------------
|
652
|
-
(ascii_print -- space)+
|
653
|
-
ascii_nonprint+ |
|
654
|
-
utf8_2_byte+ |
|
655
|
-
utf8_3_byte+ |
|
656
|
-
utf8_4_byte+ {
|
628
|
+
(ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
|
657
629
|
append_literal(data, ts, te)
|
658
630
|
};
|
659
631
|
|
660
632
|
*|;
|
661
633
|
}%%
|
662
634
|
|
663
|
-
|
664
|
-
|
635
|
+
require 'regexp_parser/scanner/errors/scanner_error'
|
636
|
+
require 'regexp_parser/scanner/errors/premature_end_error'
|
637
|
+
require 'regexp_parser/scanner/errors/validation_error'
|
665
638
|
|
666
639
|
class Regexp::Scanner
|
667
|
-
# General scanner error (catch all)
|
668
|
-
class ScannerError < StandardError; end
|
669
|
-
|
670
|
-
# Base for all scanner validation errors
|
671
|
-
class ValidationError < StandardError
|
672
|
-
def initialize(reason)
|
673
|
-
super reason
|
674
|
-
end
|
675
|
-
end
|
676
|
-
|
677
|
-
# Unexpected end of pattern
|
678
|
-
class PrematureEndError < ScannerError
|
679
|
-
def initialize(where = '')
|
680
|
-
super "Premature end of pattern at #{where}"
|
681
|
-
end
|
682
|
-
end
|
683
|
-
|
684
|
-
# Invalid sequence format. Used for escape sequences, mainly.
|
685
|
-
class InvalidSequenceError < ValidationError
|
686
|
-
def initialize(what = 'sequence', where = '')
|
687
|
-
super "Invalid #{what} at #{where}"
|
688
|
-
end
|
689
|
-
end
|
690
|
-
|
691
|
-
# Invalid group. Used for named groups.
|
692
|
-
class InvalidGroupError < ValidationError
|
693
|
-
def initialize(what, reason)
|
694
|
-
super "Invalid #{what}, #{reason}."
|
695
|
-
end
|
696
|
-
end
|
697
|
-
|
698
|
-
# Invalid groupOption. Used for inline options.
|
699
|
-
class InvalidGroupOption < ValidationError
|
700
|
-
def initialize(option, text)
|
701
|
-
super "Invalid group option #{option} in #{text}"
|
702
|
-
end
|
703
|
-
end
|
704
|
-
|
705
|
-
# Invalid back reference. Used for name and number refs/calls.
|
706
|
-
class InvalidBackrefError < ValidationError
|
707
|
-
def initialize(what, reason)
|
708
|
-
super "Invalid back reference #{what}, #{reason}"
|
709
|
-
end
|
710
|
-
end
|
711
|
-
|
712
|
-
# The property name was not recognized by the scanner.
|
713
|
-
class UnknownUnicodePropertyError < ValidationError
|
714
|
-
def initialize(name)
|
715
|
-
super "Unknown unicode character property name #{name}"
|
716
|
-
end
|
717
|
-
end
|
718
|
-
|
719
640
|
# Scans the given regular expression text, or Regexp object and collects the
|
720
641
|
# emitted tokens into an array that gets returned at the end. If a block is
|
721
642
|
# given, it gets called for each emitted token.
|
722
643
|
#
|
723
644
|
# This method may raise errors if a syntax error is encountered.
|
724
645
|
# --------------------------------------------------------------------------
|
725
|
-
def self.scan(input_object, &block)
|
726
|
-
new.scan(input_object, &block)
|
646
|
+
def self.scan(input_object, options: nil, collect_tokens: true, &block)
|
647
|
+
new.scan(input_object, options: options, collect_tokens: collect_tokens, &block)
|
727
648
|
end
|
728
649
|
|
729
|
-
def scan(input_object, &block)
|
730
|
-
self.
|
650
|
+
def scan(input_object, options: nil, collect_tokens: true, &block)
|
651
|
+
self.collect_tokens = collect_tokens
|
652
|
+
self.literal_run = nil
|
731
653
|
stack = []
|
732
654
|
|
733
|
-
|
734
|
-
|
735
|
-
self.free_spacing = (input_object.options & Regexp::EXTENDED != 0)
|
736
|
-
else
|
737
|
-
input = input_object
|
738
|
-
self.free_spacing = false
|
739
|
-
end
|
655
|
+
input = input_object.is_a?(Regexp) ? input_object.source : input_object
|
656
|
+
self.free_spacing = free_spacing?(input_object, options)
|
740
657
|
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
741
658
|
|
742
|
-
data = input.unpack("c*")
|
659
|
+
data = input.unpack("c*")
|
743
660
|
eof = data.length
|
744
661
|
|
745
662
|
self.tokens = []
|
746
|
-
self.block =
|
663
|
+
self.block = block
|
747
664
|
|
748
665
|
self.set_depth = 0
|
749
666
|
self.group_depth = 0
|
750
667
|
self.conditional_stack = []
|
668
|
+
self.char_pos = 0
|
751
669
|
|
752
670
|
%% write data;
|
753
671
|
%% write init;
|
@@ -757,7 +675,7 @@ class Regexp::Scanner
|
|
757
675
|
testEof = testEof
|
758
676
|
|
759
677
|
if cs == re_scanner_error
|
760
|
-
text =
|
678
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
761
679
|
raise ScannerError.new("Scan error at '#{text}'")
|
762
680
|
end
|
763
681
|
|
@@ -767,40 +685,76 @@ class Regexp::Scanner
|
|
767
685
|
"[#{set_depth}]") if in_set?
|
768
686
|
|
769
687
|
# when the entire expression is a literal run
|
770
|
-
emit_literal if
|
688
|
+
emit_literal if literal_run
|
771
689
|
|
772
690
|
tokens
|
773
691
|
end
|
774
692
|
|
775
693
|
# lazy-load property maps when first needed
|
776
|
-
require 'yaml'
|
777
|
-
PROP_MAPS_DIR = File.expand_path('../scanner/properties', __FILE__)
|
778
|
-
|
779
694
|
def self.short_prop_map
|
780
|
-
@short_prop_map ||=
|
695
|
+
@short_prop_map ||= parse_prop_map('short')
|
781
696
|
end
|
782
697
|
|
783
698
|
def self.long_prop_map
|
784
|
-
@long_prop_map ||=
|
699
|
+
@long_prop_map ||= parse_prop_map('long')
|
700
|
+
end
|
701
|
+
|
702
|
+
def self.parse_prop_map(name)
|
703
|
+
File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
|
704
|
+
end
|
705
|
+
|
706
|
+
def self.posix_classes
|
707
|
+
%w[alnum alpha ascii blank cntrl digit graph
|
708
|
+
lower print punct space upper word xdigit]
|
785
709
|
end
|
786
710
|
|
787
711
|
# Emits an array with the details of the scanned pattern
|
788
|
-
def emit(type, token, text
|
712
|
+
def emit(type, token, text)
|
789
713
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
790
714
|
|
791
|
-
emit_literal if
|
715
|
+
emit_literal if literal_run
|
716
|
+
|
717
|
+
# Ragel runs with byte-based indices (ts, te). These are of little value to
|
718
|
+
# end-users, so we keep track of char-based indices and emit those instead.
|
719
|
+
ts_char_pos = char_pos
|
720
|
+
te_char_pos = char_pos + text.length
|
721
|
+
|
722
|
+
tok = [type, token, text, ts_char_pos, te_char_pos]
|
723
|
+
|
724
|
+
self.prev_token = tok
|
725
|
+
|
726
|
+
self.char_pos = te_char_pos
|
792
727
|
|
793
728
|
if block
|
794
|
-
block.call type, token, text,
|
729
|
+
block.call type, token, text, ts_char_pos, te_char_pos
|
730
|
+
# TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given
|
731
|
+
tokens << tok if collect_tokens
|
732
|
+
elsif collect_tokens
|
733
|
+
tokens << tok
|
795
734
|
end
|
796
|
-
|
797
|
-
tokens << [type, token, text, ts, te]
|
798
735
|
end
|
799
736
|
|
737
|
+
attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
|
738
|
+
|
800
739
|
private
|
801
740
|
|
802
|
-
attr_accessor :
|
803
|
-
:
|
741
|
+
attr_accessor :block,
|
742
|
+
:collect_tokens, :tokens, :prev_token,
|
743
|
+
:free_spacing, :spacing_stack,
|
744
|
+
:group_depth, :set_depth, :conditional_stack,
|
745
|
+
:char_pos
|
746
|
+
|
747
|
+
def free_spacing?(input_object, options)
|
748
|
+
if options && !input_object.is_a?(String)
|
749
|
+
raise ArgumentError, 'options cannot be supplied unless scanning a String'
|
750
|
+
end
|
751
|
+
|
752
|
+
options = input_object.options if input_object.is_a?(::Regexp)
|
753
|
+
|
754
|
+
return false unless options
|
755
|
+
|
756
|
+
options & Regexp::EXTENDED != 0
|
757
|
+
end
|
804
758
|
|
805
759
|
def in_group?
|
806
760
|
group_depth > 0
|
@@ -811,36 +765,24 @@ class Regexp::Scanner
|
|
811
765
|
end
|
812
766
|
|
813
767
|
# Copy from ts to te from data as text
|
814
|
-
def copy(data,
|
815
|
-
data[
|
816
|
-
end
|
817
|
-
|
818
|
-
# Copy from ts to te from data as text, returning an array with the text
|
819
|
-
# and the offsets used to copy it.
|
820
|
-
def text(data, ts, te, soff = 0)
|
821
|
-
[copy(data, ts-soff..te-1), ts-soff, te]
|
768
|
+
def copy(data, ts, te)
|
769
|
+
data[ts...te].pack('c*').force_encoding('utf-8')
|
822
770
|
end
|
823
771
|
|
824
772
|
# Appends one or more characters to the literal buffer, to be emitted later
|
825
|
-
# by a call to emit_literal.
|
773
|
+
# by a call to emit_literal.
|
826
774
|
def append_literal(data, ts, te)
|
827
|
-
self.
|
828
|
-
literal << text(data, ts, te)
|
775
|
+
(self.literal_run ||= []) << copy(data, ts, te)
|
829
776
|
end
|
830
777
|
|
831
|
-
# Emits the literal run collected by calls to the append_literal method
|
832
|
-
# using the total start (ts) and end (te) offsets of the run.
|
778
|
+
# Emits the literal run collected by calls to the append_literal method.
|
833
779
|
def emit_literal
|
834
|
-
|
835
|
-
|
836
|
-
|
837
|
-
text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
|
838
|
-
|
839
|
-
self.literal = nil
|
840
|
-
emit(:literal, :literal, text, ts, te)
|
780
|
+
text = literal_run.join
|
781
|
+
self.literal_run = nil
|
782
|
+
emit(:literal, :literal, text)
|
841
783
|
end
|
842
784
|
|
843
|
-
def emit_options(text
|
785
|
+
def emit_options(text)
|
844
786
|
token = nil
|
845
787
|
|
846
788
|
# Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
|
@@ -866,28 +808,13 @@ class Regexp::Scanner
|
|
866
808
|
token = :options_switch
|
867
809
|
end
|
868
810
|
|
869
|
-
emit(:group, token, text
|
811
|
+
emit(:group, token, text)
|
870
812
|
end
|
871
813
|
|
872
814
|
def emit_meta_control_sequence(data, ts, te, token)
|
873
815
|
if data.last < 0x00 || data.last > 0x7F
|
874
|
-
|
875
|
-
end
|
876
|
-
emit(:escape, token, *text(data, ts, te, 1))
|
877
|
-
end
|
878
|
-
|
879
|
-
# Centralizes and unifies the handling of validation related
|
880
|
-
# errors.
|
881
|
-
def validation_error(type, what, reason)
|
882
|
-
case type
|
883
|
-
when :group
|
884
|
-
error = InvalidGroupError.new(what, reason)
|
885
|
-
when :backref
|
886
|
-
error = InvalidBackrefError.new(what, reason)
|
887
|
-
when :sequence
|
888
|
-
error = InvalidSequenceError.new(what, reason)
|
816
|
+
raise ValidationError.for(:sequence, 'escape', token.to_s)
|
889
817
|
end
|
890
|
-
|
891
|
-
raise error # unless @@config.validation_ignore
|
818
|
+
emit(:escape, token, copy(data, ts-1, te))
|
892
819
|
end
|
893
820
|
end # module Regexp::Scanner
|