regexp_parser 1.8.2 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +93 -0
- data/Gemfile +6 -1
- data/README.md +1 -4
- data/Rakefile +8 -8
- data/lib/regexp_parser.rb +1 -0
- data/lib/regexp_parser/error.rb +4 -0
- data/lib/regexp_parser/expression.rb +5 -18
- data/lib/regexp_parser/expression/classes/backref.rb +5 -0
- data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
- data/lib/regexp_parser/expression/classes/free_space.rb +2 -2
- data/lib/regexp_parser/expression/classes/group.rb +28 -3
- data/lib/regexp_parser/expression/classes/property.rb +1 -1
- data/lib/regexp_parser/expression/classes/root.rb +4 -16
- data/lib/regexp_parser/expression/classes/set/range.rb +2 -1
- data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
- data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
- data/lib/regexp_parser/expression/quantifier.rb +10 -1
- data/lib/regexp_parser/expression/sequence.rb +3 -19
- data/lib/regexp_parser/expression/subexpression.rb +1 -1
- data/lib/regexp_parser/lexer.rb +2 -2
- data/lib/regexp_parser/parser.rb +306 -332
- data/lib/regexp_parser/scanner.rb +1272 -1338
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +206 -238
- data/lib/regexp_parser/syntax.rb +7 -7
- data/lib/regexp_parser/syntax/any.rb +3 -3
- data/lib/regexp_parser/syntax/base.rb +1 -1
- data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
- data/lib/regexp_parser/syntax/versions.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- data/spec/expression/base_spec.rb +10 -0
- data/spec/expression/clone_spec.rb +36 -4
- data/spec/expression/free_space_spec.rb +2 -2
- data/spec/expression/methods/match_length_spec.rb +2 -2
- data/spec/expression/subexpression_spec.rb +1 -1
- data/spec/expression/to_s_spec.rb +39 -31
- data/spec/lexer/literals_spec.rb +24 -49
- data/spec/lexer/refcalls_spec.rb +5 -0
- data/spec/parser/all_spec.rb +2 -2
- data/spec/parser/errors_spec.rb +1 -1
- data/spec/parser/escapes_spec.rb +1 -1
- data/spec/parser/quantifiers_spec.rb +16 -0
- data/spec/parser/refcalls_spec.rb +5 -0
- data/spec/parser/set/ranges_spec.rb +3 -3
- data/spec/scanner/escapes_spec.rb +8 -1
- data/spec/scanner/groups_spec.rb +10 -1
- data/spec/scanner/literals_spec.rb +28 -38
- data/spec/scanner/quantifiers_spec.rb +18 -13
- data/spec/scanner/refcalls_spec.rb +19 -0
- data/spec/scanner/sets_spec.rb +65 -16
- data/spec/spec_helper.rb +1 -0
- metadata +4 -7
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
@@ -10,17 +10,17 @@
|
|
10
10
|
# --------------------------------------------------------------------------
|
11
11
|
char_type := |*
|
12
12
|
char_type_char {
|
13
|
-
case text =
|
14
|
-
when '\d'; emit(:type, :digit, text
|
15
|
-
when '\D'; emit(:type, :nondigit, text
|
16
|
-
when '\h'; emit(:type, :hex, text
|
17
|
-
when '\H'; emit(:type, :nonhex, text
|
18
|
-
when '\s'; emit(:type, :space, text
|
19
|
-
when '\S'; emit(:type, :nonspace, text
|
20
|
-
when '\w'; emit(:type, :word, text
|
21
|
-
when '\W'; emit(:type, :nonword, text
|
22
|
-
when '\R'; emit(:type, :linebreak, text
|
23
|
-
when '\X'; emit(:type, :xgrapheme, text
|
13
|
+
case text = copy(data, ts-1, te)
|
14
|
+
when '\d'; emit(:type, :digit, text)
|
15
|
+
when '\D'; emit(:type, :nondigit, text)
|
16
|
+
when '\h'; emit(:type, :hex, text)
|
17
|
+
when '\H'; emit(:type, :nonhex, text)
|
18
|
+
when '\s'; emit(:type, :space, text)
|
19
|
+
when '\S'; emit(:type, :nonspace, text)
|
20
|
+
when '\w'; emit(:type, :word, text)
|
21
|
+
when '\W'; emit(:type, :nonword, text)
|
22
|
+
when '\R'; emit(:type, :linebreak, text)
|
23
|
+
when '\X'; emit(:type, :xgrapheme, text)
|
24
24
|
end
|
25
25
|
fret;
|
26
26
|
};
|
@@ -14,7 +14,7 @@
|
|
14
14
|
unicode_property := |*
|
15
15
|
|
16
16
|
property_sequence < eof(premature_property_end) {
|
17
|
-
text =
|
17
|
+
text = copy(data, ts-1, te)
|
18
18
|
type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
|
19
19
|
|
20
20
|
name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
|
@@ -22,7 +22,7 @@
|
|
22
22
|
token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
|
23
23
|
raise UnknownUnicodePropertyError.new(name) unless token
|
24
24
|
|
25
|
-
self.emit(type, token.to_sym, text
|
25
|
+
self.emit(type, token.to_sym, text)
|
26
26
|
|
27
27
|
fret;
|
28
28
|
};
|
@@ -3,6 +3,11 @@
|
|
3
3
|
include re_char_type "char_type.rl";
|
4
4
|
include re_property "property.rl";
|
5
5
|
|
6
|
+
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
7
|
+
utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
|
8
|
+
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
9
|
+
utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
|
10
|
+
|
6
11
|
dot = '.';
|
7
12
|
backslash = '\\';
|
8
13
|
alternation = '|';
|
@@ -15,7 +20,7 @@
|
|
15
20
|
|
16
21
|
group_open = '(';
|
17
22
|
group_close = ')';
|
18
|
-
|
23
|
+
parentheses = group_open | group_close;
|
19
24
|
|
20
25
|
set_open = '[';
|
21
26
|
set_close = ']';
|
@@ -32,7 +37,7 @@
|
|
32
37
|
class_posix = ('[:' . '^'? . class_name_posix . ':]');
|
33
38
|
|
34
39
|
|
35
|
-
# these are not supported in ruby
|
40
|
+
# these are not supported in ruby at the moment
|
36
41
|
collating_sequence = '[.' . (alpha | [\-])+ . '.]';
|
37
42
|
character_equivalent = '[=' . alpha . '=]';
|
38
43
|
|
@@ -53,6 +58,8 @@
|
|
53
58
|
|
54
59
|
meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
|
55
60
|
|
61
|
+
sequence_char = [CMcux];
|
62
|
+
|
56
63
|
zero_or_one = '?' | '??' | '?+';
|
57
64
|
zero_or_more = '*' | '*?' | '*+';
|
58
65
|
one_or_more = '+' | '+?' | '++';
|
@@ -90,21 +97,26 @@
|
|
90
97
|
group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
|
91
98
|
|
92
99
|
group_ref = [gk];
|
93
|
-
|
94
|
-
|
95
|
-
group_number = '-'? . [1-9] .
|
100
|
+
group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
|
101
|
+
group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
|
102
|
+
group_number = '-'? . [1-9] . [0-9]*;
|
96
103
|
group_level = [+\-] . [0-9]+;
|
97
104
|
|
98
|
-
group_name = ('<' .
|
105
|
+
group_name = ('<' . group_name_id_ab? . '>') |
|
106
|
+
("'" . group_name_id_sq? . "'");
|
99
107
|
group_lookup = group_name | group_number;
|
100
108
|
|
101
109
|
group_named = ('?' . group_name );
|
102
110
|
|
103
|
-
|
104
|
-
|
111
|
+
group_name_backref = 'k' . (('<' . group_name_id_ab? . group_level? '>') |
|
112
|
+
("'" . group_name_id_sq? . group_level? "'"));
|
113
|
+
group_name_call = 'g' . (('<' . group_name_id_ab? . group_level? '>') |
|
114
|
+
("'" . group_name_id_sq? . group_level? "'"));
|
105
115
|
|
106
|
-
|
107
|
-
|
116
|
+
group_number_backref = 'k' . (('<' . group_number . group_level? '>') |
|
117
|
+
("'" . group_number . group_level? "'"));
|
118
|
+
group_number_call = 'g' . (('<' . ((group_number . group_level?) | '0') '>') |
|
119
|
+
("'" . ((group_number . group_level?) | '0') "'"));
|
108
120
|
|
109
121
|
group_type = group_atomic | group_passive | group_absence | group_named;
|
110
122
|
|
@@ -115,7 +127,7 @@
|
|
115
127
|
|
116
128
|
# characters that 'break' a literal
|
117
129
|
meta_char = dot | backslash | alternation |
|
118
|
-
curlies |
|
130
|
+
curlies | parentheses | brackets |
|
119
131
|
line_anchor | quantifier_greedy;
|
120
132
|
|
121
133
|
literal_delimiters = ']' | '}';
|
@@ -123,25 +135,23 @@
|
|
123
135
|
ascii_print = ((0x20..0x7e) - meta_char - '#');
|
124
136
|
ascii_nonprint = (0x01..0x1f | 0x7f);
|
125
137
|
|
126
|
-
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
127
|
-
utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
|
128
|
-
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
129
|
-
|
130
138
|
non_literal_escape = char_type_char | anchor_char | escaped_ascii |
|
131
|
-
keep_mark |
|
139
|
+
keep_mark | sequence_char;
|
140
|
+
|
141
|
+
# escapes that also work within a character set
|
142
|
+
set_escape = backslash | brackets | escaped_ascii | property_char |
|
143
|
+
sequence_char | single_codepoint_char_type;
|
132
144
|
|
133
|
-
non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
|
134
|
-
multi_codepoint_char_type | [0-9cCM];
|
135
145
|
|
136
146
|
# EOF error, used where it can be detected
|
137
147
|
action premature_end_error {
|
138
|
-
text =
|
148
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
139
149
|
raise PrematureEndError.new( text )
|
140
150
|
}
|
141
151
|
|
142
152
|
# Invalid sequence error, used from sequences, like escapes and sets
|
143
153
|
action invalid_sequence_error {
|
144
|
-
text =
|
154
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
145
155
|
validation_error(:sequence, 'sequence', text)
|
146
156
|
}
|
147
157
|
|
@@ -156,7 +166,7 @@
|
|
156
166
|
# --------------------------------------------------------------------------
|
157
167
|
character_set := |*
|
158
168
|
set_close > (set_meta, 2) @set_closed {
|
159
|
-
emit(:set, :close,
|
169
|
+
emit(:set, :close, copy(data, ts, te))
|
160
170
|
if in_set?
|
161
171
|
fret;
|
162
172
|
else
|
@@ -165,8 +175,8 @@
|
|
165
175
|
};
|
166
176
|
|
167
177
|
'-]' @set_closed { # special case, emits two tokens
|
168
|
-
emit(:literal, :literal, copy(data, ts
|
169
|
-
emit(:set, :close, copy(data, ts+1
|
178
|
+
emit(:literal, :literal, copy(data, ts, te-1))
|
179
|
+
emit(:set, :close, copy(data, ts+1, te))
|
170
180
|
if in_set?
|
171
181
|
fret;
|
172
182
|
else
|
@@ -175,33 +185,33 @@
|
|
175
185
|
};
|
176
186
|
|
177
187
|
'-&&' { # special case, emits two tokens
|
178
|
-
emit(:literal, :literal, '-'
|
179
|
-
emit(:set, :intersection, '&&'
|
188
|
+
emit(:literal, :literal, '-')
|
189
|
+
emit(:set, :intersection, '&&')
|
180
190
|
};
|
181
191
|
|
182
192
|
'^' {
|
183
|
-
text =
|
193
|
+
text = copy(data, ts, te)
|
184
194
|
if tokens.last[1] == :open
|
185
|
-
emit(:set, :negate, text
|
195
|
+
emit(:set, :negate, text)
|
186
196
|
else
|
187
|
-
emit(:literal, :literal, text
|
197
|
+
emit(:literal, :literal, text)
|
188
198
|
end
|
189
199
|
};
|
190
200
|
|
191
201
|
'-' {
|
192
|
-
text =
|
202
|
+
text = copy(data, ts, te)
|
193
203
|
# ranges can't start with a subset or intersection/negation/range operator
|
194
204
|
if tokens.last[0] == :set
|
195
|
-
emit(:literal, :literal, text
|
205
|
+
emit(:literal, :literal, text)
|
196
206
|
else
|
197
|
-
emit(:set, :range, text
|
207
|
+
emit(:set, :range, text)
|
198
208
|
end
|
199
209
|
};
|
200
210
|
|
201
211
|
# Unlike ranges, intersections can start or end at set boundaries, whereupon
|
202
212
|
# they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
|
203
213
|
'&&' {
|
204
|
-
emit(:set, :intersection,
|
214
|
+
emit(:set, :intersection, copy(data, ts, te))
|
205
215
|
};
|
206
216
|
|
207
217
|
backslash {
|
@@ -209,12 +219,12 @@
|
|
209
219
|
};
|
210
220
|
|
211
221
|
set_open >(open_bracket, 1) >set_opened {
|
212
|
-
emit(:set, :open,
|
222
|
+
emit(:set, :open, copy(data, ts, te))
|
213
223
|
fcall character_set;
|
214
224
|
};
|
215
225
|
|
216
226
|
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
217
|
-
text =
|
227
|
+
text = copy(data, ts, te)
|
218
228
|
|
219
229
|
type = :posixclass
|
220
230
|
class_name = text[2..-3]
|
@@ -223,45 +233,40 @@
|
|
223
233
|
type = :nonposixclass
|
224
234
|
end
|
225
235
|
|
226
|
-
emit(type, class_name.to_sym, text
|
236
|
+
emit(type, class_name.to_sym, text)
|
227
237
|
};
|
228
238
|
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
234
|
-
|
235
|
-
};
|
239
|
+
# These are not supported in ruby at the moment. Enable them if they are.
|
240
|
+
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
241
|
+
# emit(:set, :collation, copy(data, ts, te))
|
242
|
+
# };
|
243
|
+
# character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
244
|
+
# emit(:set, :equivalent, copy(data, ts, te))
|
245
|
+
# };
|
236
246
|
|
237
247
|
meta_char > (set_meta, 1) {
|
238
|
-
emit(:literal, :literal,
|
248
|
+
emit(:literal, :literal, copy(data, ts, te))
|
239
249
|
};
|
240
250
|
|
241
|
-
any
|
242
|
-
|
243
|
-
|
244
|
-
utf8_3_byte |
|
245
|
-
utf8_4_byte {
|
246
|
-
char, *rest = *text(data, ts, te)
|
247
|
-
char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
|
248
|
-
emit(:literal, :literal, char, *rest)
|
251
|
+
any | ascii_nonprint | utf8_multibyte {
|
252
|
+
text = copy(data, ts, te)
|
253
|
+
emit(:literal, :literal, text)
|
249
254
|
};
|
250
255
|
*|;
|
251
256
|
|
252
257
|
# set escapes scanner
|
253
258
|
# --------------------------------------------------------------------------
|
254
259
|
set_escape_sequence := |*
|
255
|
-
|
256
|
-
emit(:escape, :literal, *text(data, ts, te, 1))
|
257
|
-
fret;
|
258
|
-
};
|
259
|
-
|
260
|
-
any > (escaped_set_alpha, 1) {
|
260
|
+
set_escape > (escaped_set_alpha, 2) {
|
261
261
|
fhold;
|
262
262
|
fnext character_set;
|
263
263
|
fcall escape_sequence;
|
264
264
|
};
|
265
|
+
|
266
|
+
any > (escaped_set_alpha, 1) {
|
267
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
268
|
+
fret;
|
269
|
+
};
|
265
270
|
*|;
|
266
271
|
|
267
272
|
|
@@ -269,33 +274,33 @@
|
|
269
274
|
# --------------------------------------------------------------------------
|
270
275
|
escape_sequence := |*
|
271
276
|
[1-9] {
|
272
|
-
text =
|
273
|
-
emit(:backref, :number, text
|
277
|
+
text = copy(data, ts-1, te)
|
278
|
+
emit(:backref, :number, text)
|
274
279
|
fret;
|
275
280
|
};
|
276
281
|
|
277
282
|
octal_sequence {
|
278
|
-
emit(:escape, :octal,
|
283
|
+
emit(:escape, :octal, copy(data, ts-1, te))
|
279
284
|
fret;
|
280
285
|
};
|
281
286
|
|
282
287
|
meta_char {
|
283
|
-
case text =
|
284
|
-
when '\.'; emit(:escape, :dot, text
|
285
|
-
when '\|'; emit(:escape, :alternation, text
|
286
|
-
when '\^'; emit(:escape, :bol, text
|
287
|
-
when '\$'; emit(:escape, :eol, text
|
288
|
-
when '\?'; emit(:escape, :zero_or_one, text
|
289
|
-
when '\*'; emit(:escape, :zero_or_more, text
|
290
|
-
when '\+'; emit(:escape, :one_or_more, text
|
291
|
-
when '\('; emit(:escape, :group_open, text
|
292
|
-
when '\)'; emit(:escape, :group_close, text
|
293
|
-
when '\{'; emit(:escape, :interval_open, text
|
294
|
-
when '\}'; emit(:escape, :interval_close, text
|
295
|
-
when '\['; emit(:escape, :set_open, text
|
296
|
-
when '\]'; emit(:escape, :set_close, text
|
288
|
+
case text = copy(data, ts-1, te)
|
289
|
+
when '\.'; emit(:escape, :dot, text)
|
290
|
+
when '\|'; emit(:escape, :alternation, text)
|
291
|
+
when '\^'; emit(:escape, :bol, text)
|
292
|
+
when '\$'; emit(:escape, :eol, text)
|
293
|
+
when '\?'; emit(:escape, :zero_or_one, text)
|
294
|
+
when '\*'; emit(:escape, :zero_or_more, text)
|
295
|
+
when '\+'; emit(:escape, :one_or_more, text)
|
296
|
+
when '\('; emit(:escape, :group_open, text)
|
297
|
+
when '\)'; emit(:escape, :group_close, text)
|
298
|
+
when '\{'; emit(:escape, :interval_open, text)
|
299
|
+
when '\}'; emit(:escape, :interval_close, text)
|
300
|
+
when '\['; emit(:escape, :set_open, text)
|
301
|
+
when '\]'; emit(:escape, :set_close, text)
|
297
302
|
when "\\\\";
|
298
|
-
emit(:escape, :backslash, text
|
303
|
+
emit(:escape, :backslash, text)
|
299
304
|
end
|
300
305
|
fret;
|
301
306
|
};
|
@@ -303,31 +308,31 @@
|
|
303
308
|
escaped_ascii > (escaped_alpha, 7) {
|
304
309
|
# \b is emitted as backspace only when inside a character set, otherwise
|
305
310
|
# it is a word boundary anchor. A syntax might "normalize" it if needed.
|
306
|
-
case text =
|
307
|
-
when '\a'; emit(:escape, :bell, text
|
308
|
-
when '\b'; emit(:escape, :backspace, text
|
309
|
-
when '\e'; emit(:escape, :escape, text
|
310
|
-
when '\f'; emit(:escape, :form_feed, text
|
311
|
-
when '\n'; emit(:escape, :newline, text
|
312
|
-
when '\r'; emit(:escape, :carriage, text
|
313
|
-
when '\t'; emit(:escape, :tab, text
|
314
|
-
when '\v'; emit(:escape, :vertical_tab, text
|
311
|
+
case text = copy(data, ts-1, te)
|
312
|
+
when '\a'; emit(:escape, :bell, text)
|
313
|
+
when '\b'; emit(:escape, :backspace, text)
|
314
|
+
when '\e'; emit(:escape, :escape, text)
|
315
|
+
when '\f'; emit(:escape, :form_feed, text)
|
316
|
+
when '\n'; emit(:escape, :newline, text)
|
317
|
+
when '\r'; emit(:escape, :carriage, text)
|
318
|
+
when '\t'; emit(:escape, :tab, text)
|
319
|
+
when '\v'; emit(:escape, :vertical_tab, text)
|
315
320
|
end
|
316
321
|
fret;
|
317
322
|
};
|
318
323
|
|
319
324
|
codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
|
320
|
-
text =
|
325
|
+
text = copy(data, ts-1, te)
|
321
326
|
if text[2].chr == '{'
|
322
|
-
emit(:escape, :codepoint_list, text
|
327
|
+
emit(:escape, :codepoint_list, text)
|
323
328
|
else
|
324
|
-
emit(:escape, :codepoint, text
|
329
|
+
emit(:escape, :codepoint, text)
|
325
330
|
end
|
326
331
|
fret;
|
327
332
|
};
|
328
333
|
|
329
|
-
hex_sequence > (escaped_alpha, 5)
|
330
|
-
emit(:escape, :hex,
|
334
|
+
hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
|
335
|
+
emit(:escape, :hex, copy(data, ts-1, te))
|
331
336
|
fret;
|
332
337
|
};
|
333
338
|
|
@@ -357,8 +362,8 @@
|
|
357
362
|
fcall unicode_property;
|
358
363
|
};
|
359
364
|
|
360
|
-
(any -- non_literal_escape) > (escaped_alpha, 1)
|
361
|
-
emit(:escape, :literal,
|
365
|
+
(any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
|
366
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
362
367
|
fret;
|
363
368
|
};
|
364
369
|
*|;
|
@@ -368,9 +373,9 @@
|
|
368
373
|
# --------------------------------------------------------------------------
|
369
374
|
conditional_expression := |*
|
370
375
|
group_lookup . ')' {
|
371
|
-
text =
|
372
|
-
emit(:conditional, :condition, text
|
373
|
-
emit(:conditional, :condition_close, ')'
|
376
|
+
text = copy(data, ts, te-1)
|
377
|
+
emit(:conditional, :condition, text)
|
378
|
+
emit(:conditional, :condition_close, ')')
|
374
379
|
};
|
375
380
|
|
376
381
|
any {
|
@@ -387,39 +392,39 @@
|
|
387
392
|
# Meta characters
|
388
393
|
# ------------------------------------------------------------------------
|
389
394
|
dot {
|
390
|
-
emit(:meta, :dot,
|
395
|
+
emit(:meta, :dot, copy(data, ts, te))
|
391
396
|
};
|
392
397
|
|
393
398
|
alternation {
|
394
399
|
if conditional_stack.last == group_depth
|
395
|
-
emit(:conditional, :separator,
|
400
|
+
emit(:conditional, :separator, copy(data, ts, te))
|
396
401
|
else
|
397
|
-
emit(:meta, :alternation,
|
402
|
+
emit(:meta, :alternation, copy(data, ts, te))
|
398
403
|
end
|
399
404
|
};
|
400
405
|
|
401
406
|
# Anchors
|
402
407
|
# ------------------------------------------------------------------------
|
403
408
|
beginning_of_line {
|
404
|
-
emit(:anchor, :bol,
|
409
|
+
emit(:anchor, :bol, copy(data, ts, te))
|
405
410
|
};
|
406
411
|
|
407
412
|
end_of_line {
|
408
|
-
emit(:anchor, :eol,
|
413
|
+
emit(:anchor, :eol, copy(data, ts, te))
|
409
414
|
};
|
410
415
|
|
411
416
|
backslash . keep_mark > (backslashed, 4) {
|
412
|
-
emit(:keep, :mark,
|
417
|
+
emit(:keep, :mark, copy(data, ts, te))
|
413
418
|
};
|
414
419
|
|
415
420
|
backslash . anchor_char > (backslashed, 3) {
|
416
|
-
case text =
|
417
|
-
when '\\A'; emit(:anchor, :bos, text
|
418
|
-
when '\\z'; emit(:anchor, :eos, text
|
419
|
-
when '\\Z'; emit(:anchor, :eos_ob_eol, text
|
420
|
-
when '\\b'; emit(:anchor, :word_boundary, text
|
421
|
-
when '\\B'; emit(:anchor, :nonword_boundary, text
|
422
|
-
when '\\G'; emit(:anchor, :match_start, text
|
421
|
+
case text = copy(data, ts, te)
|
422
|
+
when '\\A'; emit(:anchor, :bos, text)
|
423
|
+
when '\\z'; emit(:anchor, :eos, text)
|
424
|
+
when '\\Z'; emit(:anchor, :eos_ob_eol, text)
|
425
|
+
when '\\b'; emit(:anchor, :word_boundary, text)
|
426
|
+
when '\\B'; emit(:anchor, :nonword_boundary, text)
|
427
|
+
when '\\G'; emit(:anchor, :match_start, text)
|
423
428
|
end
|
424
429
|
};
|
425
430
|
|
@@ -430,7 +435,7 @@
|
|
430
435
|
# Character sets
|
431
436
|
# ------------------------------------------------------------------------
|
432
437
|
set_open >set_opened {
|
433
|
-
emit(:set, :open,
|
438
|
+
emit(:set, :open, copy(data, ts, te))
|
434
439
|
fcall character_set;
|
435
440
|
};
|
436
441
|
|
@@ -439,12 +444,12 @@
|
|
439
444
|
# (?(condition)Y|N) conditional expression
|
440
445
|
# ------------------------------------------------------------------------
|
441
446
|
conditional {
|
442
|
-
text =
|
447
|
+
text = copy(data, ts, te)
|
443
448
|
|
444
449
|
conditional_stack << group_depth
|
445
450
|
|
446
|
-
emit(:conditional, :open, text[0..-2]
|
447
|
-
emit(:conditional, :condition_open, '('
|
451
|
+
emit(:conditional, :open, text[0..-2])
|
452
|
+
emit(:conditional, :condition_open, '(')
|
448
453
|
fcall conditional_expression;
|
449
454
|
};
|
450
455
|
|
@@ -455,7 +460,7 @@
|
|
455
460
|
# correct closing count.
|
456
461
|
# ------------------------------------------------------------------------
|
457
462
|
group_open . group_comment $group_closed {
|
458
|
-
emit(:group, :comment,
|
463
|
+
emit(:group, :comment, copy(data, ts, te))
|
459
464
|
};
|
460
465
|
|
461
466
|
# Expression options:
|
@@ -470,11 +475,11 @@
|
|
470
475
|
# (?imxdau-imx:subexp) option on/off for subexp
|
471
476
|
# ------------------------------------------------------------------------
|
472
477
|
group_open . group_options >group_opened {
|
473
|
-
text =
|
478
|
+
text = copy(data, ts, te)
|
474
479
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
475
480
|
raise InvalidGroupOption.new($1 || "-#{$2}", text)
|
476
481
|
end
|
477
|
-
emit_options(text
|
482
|
+
emit_options(text)
|
478
483
|
};
|
479
484
|
|
480
485
|
# Assertions
|
@@ -484,11 +489,11 @@
|
|
484
489
|
# (?<!subexp) negative look-behind
|
485
490
|
# ------------------------------------------------------------------------
|
486
491
|
group_open . assertion_type >group_opened {
|
487
|
-
case text =
|
488
|
-
when '(?='; emit(:assertion, :lookahead, text
|
489
|
-
when '(?!'; emit(:assertion, :nlookahead, text
|
490
|
-
when '(?<='; emit(:assertion, :lookbehind, text
|
491
|
-
when '(?<!'; emit(:assertion, :nlookbehind, text
|
492
|
+
case text = copy(data, ts, te)
|
493
|
+
when '(?='; emit(:assertion, :lookahead, text)
|
494
|
+
when '(?!'; emit(:assertion, :nlookahead, text)
|
495
|
+
when '(?<='; emit(:assertion, :lookbehind, text)
|
496
|
+
when '(?<!'; emit(:assertion, :nlookbehind, text)
|
492
497
|
end
|
493
498
|
};
|
494
499
|
|
@@ -501,32 +506,32 @@
|
|
501
506
|
# (subexp) captured group
|
502
507
|
# ------------------------------------------------------------------------
|
503
508
|
group_open . group_type >group_opened {
|
504
|
-
case text =
|
505
|
-
when '(?:'; emit(:group, :passive, text
|
506
|
-
when '(?>'; emit(:group, :atomic, text
|
507
|
-
when '(?~'; emit(:group, :absence, text
|
509
|
+
case text = copy(data, ts, te)
|
510
|
+
when '(?:'; emit(:group, :passive, text)
|
511
|
+
when '(?>'; emit(:group, :atomic, text)
|
512
|
+
when '(?~'; emit(:group, :absence, text)
|
508
513
|
|
509
514
|
when /^\(\?(?:<>|'')/
|
510
515
|
validation_error(:group, 'named group', 'name is empty')
|
511
516
|
|
512
|
-
when /^\(
|
513
|
-
emit(:group, :named_ab, text
|
517
|
+
when /^\(\?<[^>]+>/
|
518
|
+
emit(:group, :named_ab, text)
|
514
519
|
|
515
|
-
when /^\(\?'
|
516
|
-
emit(:group, :named_sq, text
|
520
|
+
when /^\(\?'[^']+'/
|
521
|
+
emit(:group, :named_sq, text)
|
517
522
|
|
518
523
|
end
|
519
524
|
};
|
520
525
|
|
521
526
|
group_open @group_opened {
|
522
|
-
text =
|
523
|
-
emit(:group, :capture, text
|
527
|
+
text = copy(data, ts, te)
|
528
|
+
emit(:group, :capture, text)
|
524
529
|
};
|
525
530
|
|
526
531
|
group_close @group_closed {
|
527
532
|
if conditional_stack.last == group_depth + 1
|
528
533
|
conditional_stack.pop
|
529
|
-
emit(:conditional, :close,
|
534
|
+
emit(:conditional, :close, copy(data, ts, te))
|
530
535
|
else
|
531
536
|
if spacing_stack.length > 1 &&
|
532
537
|
spacing_stack.last[:depth] == group_depth + 1
|
@@ -534,72 +539,42 @@
|
|
534
539
|
self.free_spacing = spacing_stack.last[:free_spacing]
|
535
540
|
end
|
536
541
|
|
537
|
-
emit(:group, :close,
|
542
|
+
emit(:group, :close, copy(data, ts, te))
|
538
543
|
end
|
539
544
|
};
|
540
545
|
|
541
546
|
|
542
547
|
# Group backreference, named and numbered
|
543
548
|
# ------------------------------------------------------------------------
|
544
|
-
backslash . (
|
545
|
-
case text =
|
546
|
-
when /^\\(
|
547
|
-
validation_error(:backref, '
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
when /^\\(
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
emit(:backref, :name_call_sq, text, ts, te)
|
561
|
-
end
|
562
|
-
|
563
|
-
when /^\\([gk])<\d+>/ # angle-brackets
|
564
|
-
if $1 == 'k'
|
565
|
-
emit(:backref, :number_ref_ab, text, ts, te)
|
566
|
-
else
|
567
|
-
emit(:backref, :number_call_ab, text, ts, te)
|
568
|
-
end
|
569
|
-
|
570
|
-
when /^\\([gk])'\d+'/ # single quotes
|
571
|
-
if $1 == 'k'
|
572
|
-
emit(:backref, :number_ref_sq, text, ts, te)
|
573
|
-
else
|
574
|
-
emit(:backref, :number_call_sq, text, ts, te)
|
575
|
-
end
|
576
|
-
|
577
|
-
when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
|
578
|
-
if $1 == 'k'
|
579
|
-
emit(:backref, :number_rel_ref_ab, text, ts, te)
|
580
|
-
else
|
581
|
-
emit(:backref, :number_rel_call_ab, text, ts, te)
|
582
|
-
end
|
583
|
-
|
584
|
-
when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
|
585
|
-
if $1 == 'k'
|
586
|
-
emit(:backref, :number_rel_ref_sq, text, ts, te)
|
587
|
-
else
|
588
|
-
emit(:backref, :number_rel_call_sq, text, ts, te)
|
589
|
-
end
|
590
|
-
|
591
|
-
when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
|
592
|
-
emit(:backref, :name_recursion_ref_ab, text, ts, te)
|
593
|
-
|
594
|
-
when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
|
595
|
-
emit(:backref, :name_recursion_ref_sq, text, ts, te)
|
596
|
-
|
597
|
-
when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
|
598
|
-
emit(:backref, :number_recursion_ref_ab, text, ts, te)
|
599
|
-
|
600
|
-
when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
|
601
|
-
emit(:backref, :number_recursion_ref_sq, text, ts, te)
|
549
|
+
backslash . (group_name_backref | group_number_backref) > (backslashed, 4) {
|
550
|
+
case text = copy(data, ts, te)
|
551
|
+
when /^\\k(<>|'')/
|
552
|
+
validation_error(:backref, 'backreference', 'ref ID is empty')
|
553
|
+
when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/
|
554
|
+
emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
|
555
|
+
when /^\\k(.)\d+\D$/
|
556
|
+
emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
|
557
|
+
when /^\\k(.)-\d+\D$/
|
558
|
+
emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
|
559
|
+
when /^\\k(.)[^\p{digit}\-].*[+\-]\d+\D$/
|
560
|
+
emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
|
561
|
+
when /^\\k(.)-?\d+[+\-]\d+\D$/
|
562
|
+
emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
|
563
|
+
end
|
564
|
+
};
|
602
565
|
|
566
|
+
# Group call, named and numbered
|
567
|
+
# ------------------------------------------------------------------------
|
568
|
+
backslash . (group_name_call | group_number_call) > (backslashed, 4) {
|
569
|
+
case text = copy(data, ts, te)
|
570
|
+
when /^\\g(<>|'')/
|
571
|
+
validation_error(:backref, 'subexpression call', 'ref ID is empty')
|
572
|
+
when /^\\g(.)[^\p{digit}+\->][^+\-]*/
|
573
|
+
emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
|
574
|
+
when /^\\g(.)\d+\D$/
|
575
|
+
emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
|
576
|
+
when /^\\g(.)[+-]\d+/
|
577
|
+
emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
|
603
578
|
end
|
604
579
|
};
|
605
580
|
|
@@ -607,31 +582,31 @@
|
|
607
582
|
# Quantifiers
|
608
583
|
# ------------------------------------------------------------------------
|
609
584
|
zero_or_one {
|
610
|
-
case text =
|
611
|
-
when '?' ; emit(:quantifier, :zero_or_one, text
|
612
|
-
when '??'; emit(:quantifier, :zero_or_one_reluctant, text
|
613
|
-
when '?+'; emit(:quantifier, :zero_or_one_possessive, text
|
585
|
+
case text = copy(data, ts, te)
|
586
|
+
when '?' ; emit(:quantifier, :zero_or_one, text)
|
587
|
+
when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
|
588
|
+
when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
|
614
589
|
end
|
615
590
|
};
|
616
591
|
|
617
592
|
zero_or_more {
|
618
|
-
case text =
|
619
|
-
when '*' ; emit(:quantifier, :zero_or_more, text
|
620
|
-
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text
|
621
|
-
when '*+'; emit(:quantifier, :zero_or_more_possessive, text
|
593
|
+
case text = copy(data, ts, te)
|
594
|
+
when '*' ; emit(:quantifier, :zero_or_more, text)
|
595
|
+
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
|
596
|
+
when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
|
622
597
|
end
|
623
598
|
};
|
624
599
|
|
625
600
|
one_or_more {
|
626
|
-
case text =
|
627
|
-
when '+' ; emit(:quantifier, :one_or_more, text
|
628
|
-
when '+?'; emit(:quantifier, :one_or_more_reluctant, text
|
629
|
-
when '++'; emit(:quantifier, :one_or_more_possessive, text
|
601
|
+
case text = copy(data, ts, te)
|
602
|
+
when '+' ; emit(:quantifier, :one_or_more, text)
|
603
|
+
when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
|
604
|
+
when '++'; emit(:quantifier, :one_or_more_possessive, text)
|
630
605
|
end
|
631
606
|
};
|
632
607
|
|
633
608
|
quantifier_interval {
|
634
|
-
emit(:quantifier, :interval,
|
609
|
+
emit(:quantifier, :interval, copy(data, ts, te))
|
635
610
|
};
|
636
611
|
|
637
612
|
# Catch unmatched curly braces as literals
|
@@ -647,7 +622,7 @@
|
|
647
622
|
|
648
623
|
comment {
|
649
624
|
if free_spacing
|
650
|
-
emit(:free_space, :comment,
|
625
|
+
emit(:free_space, :comment, copy(data, ts, te))
|
651
626
|
else
|
652
627
|
# consume only the pound sign (#) and backtrack to do regular scanning
|
653
628
|
append_literal(data, ts, ts + 1)
|
@@ -657,7 +632,7 @@
|
|
657
632
|
|
658
633
|
space+ {
|
659
634
|
if free_spacing
|
660
|
-
emit(:free_space, :whitespace,
|
635
|
+
emit(:free_space, :whitespace, copy(data, ts, te))
|
661
636
|
else
|
662
637
|
append_literal(data, ts, te)
|
663
638
|
end
|
@@ -666,11 +641,7 @@
|
|
666
641
|
# Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
|
667
642
|
# except meta characters.
|
668
643
|
# ------------------------------------------------------------------------
|
669
|
-
(ascii_print -- space)+
|
670
|
-
ascii_nonprint+ |
|
671
|
-
utf8_2_byte+ |
|
672
|
-
utf8_3_byte+ |
|
673
|
-
utf8_4_byte+ {
|
644
|
+
(ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
|
674
645
|
append_literal(data, ts, te)
|
675
646
|
};
|
676
647
|
|
@@ -682,10 +653,10 @@
|
|
682
653
|
|
683
654
|
class Regexp::Scanner
|
684
655
|
# General scanner error (catch all)
|
685
|
-
class ScannerError <
|
656
|
+
class ScannerError < Regexp::Parser::Error; end
|
686
657
|
|
687
658
|
# Base for all scanner validation errors
|
688
|
-
class ValidationError <
|
659
|
+
class ValidationError < Regexp::Parser::Error
|
689
660
|
def initialize(reason)
|
690
661
|
super reason
|
691
662
|
end
|
@@ -760,6 +731,7 @@ class Regexp::Scanner
|
|
760
731
|
self.set_depth = 0
|
761
732
|
self.group_depth = 0
|
762
733
|
self.conditional_stack = []
|
734
|
+
self.char_pos = 0
|
763
735
|
|
764
736
|
%% write data;
|
765
737
|
%% write init;
|
@@ -769,7 +741,7 @@ class Regexp::Scanner
|
|
769
741
|
testEof = testEof
|
770
742
|
|
771
743
|
if cs == re_scanner_error
|
772
|
-
text =
|
744
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
773
745
|
raise ScannerError.new("Scan error at '#{text}'")
|
774
746
|
end
|
775
747
|
|
@@ -786,7 +758,7 @@ class Regexp::Scanner
|
|
786
758
|
|
787
759
|
# lazy-load property maps when first needed
|
788
760
|
require 'yaml'
|
789
|
-
PROP_MAPS_DIR = File.
|
761
|
+
PROP_MAPS_DIR = File.join(__dir__, 'scanner', 'properties')
|
790
762
|
|
791
763
|
def self.short_prop_map
|
792
764
|
@short_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/short.yml")
|
@@ -797,22 +769,29 @@ class Regexp::Scanner
|
|
797
769
|
end
|
798
770
|
|
799
771
|
# Emits an array with the details of the scanned pattern
|
800
|
-
def emit(type, token, text
|
772
|
+
def emit(type, token, text)
|
801
773
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
802
774
|
|
803
775
|
emit_literal if literal
|
804
776
|
|
777
|
+
# Ragel runs with byte-based indices (ts, te). These are of little value to
|
778
|
+
# end-users, so we keep track of char-based indices and emit those instead.
|
779
|
+
ts_char_pos = char_pos
|
780
|
+
te_char_pos = char_pos + text.length
|
781
|
+
|
805
782
|
if block
|
806
|
-
block.call type, token, text,
|
783
|
+
block.call type, token, text, ts_char_pos, te_char_pos
|
807
784
|
end
|
808
785
|
|
809
|
-
tokens << [type, token, text,
|
786
|
+
tokens << [type, token, text, ts_char_pos, te_char_pos]
|
787
|
+
|
788
|
+
self.char_pos = te_char_pos
|
810
789
|
end
|
811
790
|
|
812
791
|
private
|
813
792
|
|
814
793
|
attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
|
815
|
-
:group_depth, :set_depth, :conditional_stack
|
794
|
+
:group_depth, :set_depth, :conditional_stack, :char_pos
|
816
795
|
|
817
796
|
def free_spacing?(input_object, options)
|
818
797
|
if options && !input_object.is_a?(String)
|
@@ -835,36 +814,25 @@ class Regexp::Scanner
|
|
835
814
|
end
|
836
815
|
|
837
816
|
# Copy from ts to te from data as text
|
838
|
-
def copy(data,
|
839
|
-
data[
|
840
|
-
end
|
841
|
-
|
842
|
-
# Copy from ts to te from data as text, returning an array with the text
|
843
|
-
# and the offsets used to copy it.
|
844
|
-
def text(data, ts, te, soff = 0)
|
845
|
-
[copy(data, ts-soff..te-1), ts-soff, te]
|
817
|
+
def copy(data, ts, te)
|
818
|
+
data[ts...te].pack('c*').force_encoding('utf-8')
|
846
819
|
end
|
847
820
|
|
848
821
|
# Appends one or more characters to the literal buffer, to be emitted later
|
849
|
-
# by a call to emit_literal.
|
822
|
+
# by a call to emit_literal.
|
850
823
|
def append_literal(data, ts, te)
|
851
824
|
self.literal = literal || []
|
852
|
-
literal <<
|
825
|
+
literal << copy(data, ts, te)
|
853
826
|
end
|
854
827
|
|
855
|
-
# Emits the literal run collected by calls to the append_literal method
|
856
|
-
# using the total start (ts) and end (te) offsets of the run.
|
828
|
+
# Emits the literal run collected by calls to the append_literal method.
|
857
829
|
def emit_literal
|
858
|
-
|
859
|
-
text = literal.map {|t| t[0]}.join
|
860
|
-
|
861
|
-
text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
|
862
|
-
|
830
|
+
text = literal.join
|
863
831
|
self.literal = nil
|
864
|
-
emit(:literal, :literal, text
|
832
|
+
emit(:literal, :literal, text)
|
865
833
|
end
|
866
834
|
|
867
|
-
def emit_options(text
|
835
|
+
def emit_options(text)
|
868
836
|
token = nil
|
869
837
|
|
870
838
|
# Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
|
@@ -890,14 +858,14 @@ class Regexp::Scanner
|
|
890
858
|
token = :options_switch
|
891
859
|
end
|
892
860
|
|
893
|
-
emit(:group, token, text
|
861
|
+
emit(:group, token, text)
|
894
862
|
end
|
895
863
|
|
896
864
|
def emit_meta_control_sequence(data, ts, te, token)
|
897
865
|
if data.last < 0x00 || data.last > 0x7F
|
898
866
|
validation_error(:sequence, 'escape', token.to_s)
|
899
867
|
end
|
900
|
-
emit(:escape, token,
|
868
|
+
emit(:escape, token, copy(data, ts-1, te))
|
901
869
|
end
|
902
870
|
|
903
871
|
# Centralizes and unifies the handling of validation related
|