regexp_parser 1.8.2 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +100 -0
- data/Gemfile +6 -1
- data/README.md +1 -4
- data/Rakefile +8 -8
- data/lib/regexp_parser/error.rb +4 -0
- data/lib/regexp_parser/expression/classes/backref.rb +5 -0
- data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
- data/lib/regexp_parser/expression/classes/free_space.rb +2 -2
- data/lib/regexp_parser/expression/classes/group.rb +28 -3
- data/lib/regexp_parser/expression/classes/property.rb +1 -1
- data/lib/regexp_parser/expression/classes/root.rb +4 -16
- data/lib/regexp_parser/expression/classes/set/range.rb +2 -1
- data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
- data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
- data/lib/regexp_parser/expression/quantifier.rb +10 -1
- data/lib/regexp_parser/expression/sequence.rb +3 -19
- data/lib/regexp_parser/expression/subexpression.rb +1 -1
- data/lib/regexp_parser/expression.rb +7 -19
- data/lib/regexp_parser/lexer.rb +2 -2
- data/lib/regexp_parser/parser.rb +307 -332
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +209 -240
- data/lib/regexp_parser/scanner.rb +1275 -1340
- data/lib/regexp_parser/syntax/any.rb +3 -3
- data/lib/regexp_parser/syntax/base.rb +1 -1
- data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
- data/lib/regexp_parser/syntax.rb +8 -6
- data/lib/regexp_parser/version.rb +1 -1
- data/spec/expression/base_spec.rb +10 -0
- data/spec/expression/clone_spec.rb +36 -4
- data/spec/expression/free_space_spec.rb +2 -2
- data/spec/expression/methods/match_length_spec.rb +2 -2
- data/spec/expression/subexpression_spec.rb +1 -1
- data/spec/expression/to_s_spec.rb +39 -31
- data/spec/lexer/literals_spec.rb +24 -49
- data/spec/lexer/refcalls_spec.rb +5 -0
- data/spec/parser/all_spec.rb +2 -2
- data/spec/parser/errors_spec.rb +1 -1
- data/spec/parser/escapes_spec.rb +1 -1
- data/spec/parser/quantifiers_spec.rb +16 -0
- data/spec/parser/refcalls_spec.rb +5 -0
- data/spec/parser/set/ranges_spec.rb +3 -3
- data/spec/scanner/escapes_spec.rb +8 -1
- data/spec/scanner/groups_spec.rb +10 -1
- data/spec/scanner/literals_spec.rb +28 -38
- data/spec/scanner/quantifiers_spec.rb +18 -13
- data/spec/scanner/refcalls_spec.rb +19 -0
- data/spec/scanner/sets_spec.rb +65 -16
- data/spec/spec_helper.rb +1 -0
- metadata +4 -7
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
@@ -10,17 +10,17 @@
|
|
10
10
|
# --------------------------------------------------------------------------
|
11
11
|
char_type := |*
|
12
12
|
char_type_char {
|
13
|
-
case text =
|
14
|
-
when '\d'; emit(:type, :digit, text
|
15
|
-
when '\D'; emit(:type, :nondigit, text
|
16
|
-
when '\h'; emit(:type, :hex, text
|
17
|
-
when '\H'; emit(:type, :nonhex, text
|
18
|
-
when '\s'; emit(:type, :space, text
|
19
|
-
when '\S'; emit(:type, :nonspace, text
|
20
|
-
when '\w'; emit(:type, :word, text
|
21
|
-
when '\W'; emit(:type, :nonword, text
|
22
|
-
when '\R'; emit(:type, :linebreak, text
|
23
|
-
when '\X'; emit(:type, :xgrapheme, text
|
13
|
+
case text = copy(data, ts-1, te)
|
14
|
+
when '\d'; emit(:type, :digit, text)
|
15
|
+
when '\D'; emit(:type, :nondigit, text)
|
16
|
+
when '\h'; emit(:type, :hex, text)
|
17
|
+
when '\H'; emit(:type, :nonhex, text)
|
18
|
+
when '\s'; emit(:type, :space, text)
|
19
|
+
when '\S'; emit(:type, :nonspace, text)
|
20
|
+
when '\w'; emit(:type, :word, text)
|
21
|
+
when '\W'; emit(:type, :nonword, text)
|
22
|
+
when '\R'; emit(:type, :linebreak, text)
|
23
|
+
when '\X'; emit(:type, :xgrapheme, text)
|
24
24
|
end
|
25
25
|
fret;
|
26
26
|
};
|
@@ -14,7 +14,7 @@
|
|
14
14
|
unicode_property := |*
|
15
15
|
|
16
16
|
property_sequence < eof(premature_property_end) {
|
17
|
-
text =
|
17
|
+
text = copy(data, ts-1, te)
|
18
18
|
type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
|
19
19
|
|
20
20
|
name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
|
@@ -22,7 +22,7 @@
|
|
22
22
|
token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
|
23
23
|
raise UnknownUnicodePropertyError.new(name) unless token
|
24
24
|
|
25
|
-
self.emit(type, token.to_sym, text
|
25
|
+
self.emit(type, token.to_sym, text)
|
26
26
|
|
27
27
|
fret;
|
28
28
|
};
|
@@ -3,6 +3,11 @@
|
|
3
3
|
include re_char_type "char_type.rl";
|
4
4
|
include re_property "property.rl";
|
5
5
|
|
6
|
+
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
7
|
+
utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
|
8
|
+
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
9
|
+
utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
|
10
|
+
|
6
11
|
dot = '.';
|
7
12
|
backslash = '\\';
|
8
13
|
alternation = '|';
|
@@ -15,7 +20,7 @@
|
|
15
20
|
|
16
21
|
group_open = '(';
|
17
22
|
group_close = ')';
|
18
|
-
|
23
|
+
parentheses = group_open | group_close;
|
19
24
|
|
20
25
|
set_open = '[';
|
21
26
|
set_close = ']';
|
@@ -32,7 +37,7 @@
|
|
32
37
|
class_posix = ('[:' . '^'? . class_name_posix . ':]');
|
33
38
|
|
34
39
|
|
35
|
-
# these are not supported in ruby
|
40
|
+
# these are not supported in ruby at the moment
|
36
41
|
collating_sequence = '[.' . (alpha | [\-])+ . '.]';
|
37
42
|
character_equivalent = '[=' . alpha . '=]';
|
38
43
|
|
@@ -53,6 +58,8 @@
|
|
53
58
|
|
54
59
|
meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
|
55
60
|
|
61
|
+
sequence_char = [CMcux];
|
62
|
+
|
56
63
|
zero_or_one = '?' | '??' | '?+';
|
57
64
|
zero_or_more = '*' | '*?' | '*+';
|
58
65
|
one_or_more = '+' | '+?' | '++';
|
@@ -90,21 +97,26 @@
|
|
90
97
|
group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
|
91
98
|
|
92
99
|
group_ref = [gk];
|
93
|
-
|
94
|
-
|
95
|
-
group_number = '-'? . [1-9] .
|
100
|
+
group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
|
101
|
+
group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
|
102
|
+
group_number = '-'? . [1-9] . [0-9]*;
|
96
103
|
group_level = [+\-] . [0-9]+;
|
97
104
|
|
98
|
-
group_name = ('<' .
|
105
|
+
group_name = ('<' . group_name_id_ab? . '>') |
|
106
|
+
("'" . group_name_id_sq? . "'");
|
99
107
|
group_lookup = group_name | group_number;
|
100
108
|
|
101
109
|
group_named = ('?' . group_name );
|
102
110
|
|
103
|
-
|
104
|
-
|
111
|
+
group_name_backref = 'k' . (('<' . group_name_id_ab? . group_level? '>') |
|
112
|
+
("'" . group_name_id_sq? . group_level? "'"));
|
113
|
+
group_name_call = 'g' . (('<' . group_name_id_ab? . group_level? '>') |
|
114
|
+
("'" . group_name_id_sq? . group_level? "'"));
|
105
115
|
|
106
|
-
|
107
|
-
|
116
|
+
group_number_backref = 'k' . (('<' . group_number . group_level? '>') |
|
117
|
+
("'" . group_number . group_level? "'"));
|
118
|
+
group_number_call = 'g' . (('<' . ((group_number . group_level?) | '0') '>') |
|
119
|
+
("'" . ((group_number . group_level?) | '0') "'"));
|
108
120
|
|
109
121
|
group_type = group_atomic | group_passive | group_absence | group_named;
|
110
122
|
|
@@ -115,7 +127,7 @@
|
|
115
127
|
|
116
128
|
# characters that 'break' a literal
|
117
129
|
meta_char = dot | backslash | alternation |
|
118
|
-
curlies |
|
130
|
+
curlies | parentheses | brackets |
|
119
131
|
line_anchor | quantifier_greedy;
|
120
132
|
|
121
133
|
literal_delimiters = ']' | '}';
|
@@ -123,25 +135,23 @@
|
|
123
135
|
ascii_print = ((0x20..0x7e) - meta_char - '#');
|
124
136
|
ascii_nonprint = (0x01..0x1f | 0x7f);
|
125
137
|
|
126
|
-
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
127
|
-
utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
|
128
|
-
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
129
|
-
|
130
138
|
non_literal_escape = char_type_char | anchor_char | escaped_ascii |
|
131
|
-
keep_mark |
|
139
|
+
keep_mark | sequence_char;
|
140
|
+
|
141
|
+
# escapes that also work within a character set
|
142
|
+
set_escape = backslash | brackets | escaped_ascii | property_char |
|
143
|
+
sequence_char | single_codepoint_char_type;
|
132
144
|
|
133
|
-
non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
|
134
|
-
multi_codepoint_char_type | [0-9cCM];
|
135
145
|
|
136
146
|
# EOF error, used where it can be detected
|
137
147
|
action premature_end_error {
|
138
|
-
text =
|
148
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
139
149
|
raise PrematureEndError.new( text )
|
140
150
|
}
|
141
151
|
|
142
152
|
# Invalid sequence error, used from sequences, like escapes and sets
|
143
153
|
action invalid_sequence_error {
|
144
|
-
text =
|
154
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
145
155
|
validation_error(:sequence, 'sequence', text)
|
146
156
|
}
|
147
157
|
|
@@ -156,7 +166,7 @@
|
|
156
166
|
# --------------------------------------------------------------------------
|
157
167
|
character_set := |*
|
158
168
|
set_close > (set_meta, 2) @set_closed {
|
159
|
-
emit(:set, :close,
|
169
|
+
emit(:set, :close, copy(data, ts, te))
|
160
170
|
if in_set?
|
161
171
|
fret;
|
162
172
|
else
|
@@ -165,8 +175,8 @@
|
|
165
175
|
};
|
166
176
|
|
167
177
|
'-]' @set_closed { # special case, emits two tokens
|
168
|
-
emit(:literal, :literal, copy(data, ts
|
169
|
-
emit(:set, :close, copy(data, ts+1
|
178
|
+
emit(:literal, :literal, copy(data, ts, te-1))
|
179
|
+
emit(:set, :close, copy(data, ts+1, te))
|
170
180
|
if in_set?
|
171
181
|
fret;
|
172
182
|
else
|
@@ -175,33 +185,33 @@
|
|
175
185
|
};
|
176
186
|
|
177
187
|
'-&&' { # special case, emits two tokens
|
178
|
-
emit(:literal, :literal, '-'
|
179
|
-
emit(:set, :intersection, '&&'
|
188
|
+
emit(:literal, :literal, '-')
|
189
|
+
emit(:set, :intersection, '&&')
|
180
190
|
};
|
181
191
|
|
182
192
|
'^' {
|
183
|
-
text =
|
193
|
+
text = copy(data, ts, te)
|
184
194
|
if tokens.last[1] == :open
|
185
|
-
emit(:set, :negate, text
|
195
|
+
emit(:set, :negate, text)
|
186
196
|
else
|
187
|
-
emit(:literal, :literal, text
|
197
|
+
emit(:literal, :literal, text)
|
188
198
|
end
|
189
199
|
};
|
190
200
|
|
191
201
|
'-' {
|
192
|
-
text =
|
202
|
+
text = copy(data, ts, te)
|
193
203
|
# ranges can't start with a subset or intersection/negation/range operator
|
194
204
|
if tokens.last[0] == :set
|
195
|
-
emit(:literal, :literal, text
|
205
|
+
emit(:literal, :literal, text)
|
196
206
|
else
|
197
|
-
emit(:set, :range, text
|
207
|
+
emit(:set, :range, text)
|
198
208
|
end
|
199
209
|
};
|
200
210
|
|
201
211
|
# Unlike ranges, intersections can start or end at set boundaries, whereupon
|
202
212
|
# they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
|
203
213
|
'&&' {
|
204
|
-
emit(:set, :intersection,
|
214
|
+
emit(:set, :intersection, copy(data, ts, te))
|
205
215
|
};
|
206
216
|
|
207
217
|
backslash {
|
@@ -209,12 +219,12 @@
|
|
209
219
|
};
|
210
220
|
|
211
221
|
set_open >(open_bracket, 1) >set_opened {
|
212
|
-
emit(:set, :open,
|
222
|
+
emit(:set, :open, copy(data, ts, te))
|
213
223
|
fcall character_set;
|
214
224
|
};
|
215
225
|
|
216
226
|
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
217
|
-
text =
|
227
|
+
text = copy(data, ts, te)
|
218
228
|
|
219
229
|
type = :posixclass
|
220
230
|
class_name = text[2..-3]
|
@@ -223,45 +233,40 @@
|
|
223
233
|
type = :nonposixclass
|
224
234
|
end
|
225
235
|
|
226
|
-
emit(type, class_name.to_sym, text
|
236
|
+
emit(type, class_name.to_sym, text)
|
227
237
|
};
|
228
238
|
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
234
|
-
|
235
|
-
};
|
239
|
+
# These are not supported in ruby at the moment. Enable them if they are.
|
240
|
+
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
241
|
+
# emit(:set, :collation, copy(data, ts, te))
|
242
|
+
# };
|
243
|
+
# character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
244
|
+
# emit(:set, :equivalent, copy(data, ts, te))
|
245
|
+
# };
|
236
246
|
|
237
247
|
meta_char > (set_meta, 1) {
|
238
|
-
emit(:literal, :literal,
|
248
|
+
emit(:literal, :literal, copy(data, ts, te))
|
239
249
|
};
|
240
250
|
|
241
|
-
any
|
242
|
-
|
243
|
-
|
244
|
-
utf8_3_byte |
|
245
|
-
utf8_4_byte {
|
246
|
-
char, *rest = *text(data, ts, te)
|
247
|
-
char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
|
248
|
-
emit(:literal, :literal, char, *rest)
|
251
|
+
any | ascii_nonprint | utf8_multibyte {
|
252
|
+
text = copy(data, ts, te)
|
253
|
+
emit(:literal, :literal, text)
|
249
254
|
};
|
250
255
|
*|;
|
251
256
|
|
252
257
|
# set escapes scanner
|
253
258
|
# --------------------------------------------------------------------------
|
254
259
|
set_escape_sequence := |*
|
255
|
-
|
256
|
-
emit(:escape, :literal, *text(data, ts, te, 1))
|
257
|
-
fret;
|
258
|
-
};
|
259
|
-
|
260
|
-
any > (escaped_set_alpha, 1) {
|
260
|
+
set_escape > (escaped_set_alpha, 2) {
|
261
261
|
fhold;
|
262
262
|
fnext character_set;
|
263
263
|
fcall escape_sequence;
|
264
264
|
};
|
265
|
+
|
266
|
+
any > (escaped_set_alpha, 1) {
|
267
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
268
|
+
fret;
|
269
|
+
};
|
265
270
|
*|;
|
266
271
|
|
267
272
|
|
@@ -269,33 +274,33 @@
|
|
269
274
|
# --------------------------------------------------------------------------
|
270
275
|
escape_sequence := |*
|
271
276
|
[1-9] {
|
272
|
-
text =
|
273
|
-
emit(:backref, :number, text
|
277
|
+
text = copy(data, ts-1, te)
|
278
|
+
emit(:backref, :number, text)
|
274
279
|
fret;
|
275
280
|
};
|
276
281
|
|
277
282
|
octal_sequence {
|
278
|
-
emit(:escape, :octal,
|
283
|
+
emit(:escape, :octal, copy(data, ts-1, te))
|
279
284
|
fret;
|
280
285
|
};
|
281
286
|
|
282
287
|
meta_char {
|
283
|
-
case text =
|
284
|
-
when '\.'; emit(:escape, :dot, text
|
285
|
-
when '\|'; emit(:escape, :alternation, text
|
286
|
-
when '\^'; emit(:escape, :bol, text
|
287
|
-
when '\$'; emit(:escape, :eol, text
|
288
|
-
when '\?'; emit(:escape, :zero_or_one, text
|
289
|
-
when '\*'; emit(:escape, :zero_or_more, text
|
290
|
-
when '\+'; emit(:escape, :one_or_more, text
|
291
|
-
when '\('; emit(:escape, :group_open, text
|
292
|
-
when '\)'; emit(:escape, :group_close, text
|
293
|
-
when '\{'; emit(:escape, :interval_open, text
|
294
|
-
when '\}'; emit(:escape, :interval_close, text
|
295
|
-
when '\['; emit(:escape, :set_open, text
|
296
|
-
when '\]'; emit(:escape, :set_close, text
|
288
|
+
case text = copy(data, ts-1, te)
|
289
|
+
when '\.'; emit(:escape, :dot, text)
|
290
|
+
when '\|'; emit(:escape, :alternation, text)
|
291
|
+
when '\^'; emit(:escape, :bol, text)
|
292
|
+
when '\$'; emit(:escape, :eol, text)
|
293
|
+
when '\?'; emit(:escape, :zero_or_one, text)
|
294
|
+
when '\*'; emit(:escape, :zero_or_more, text)
|
295
|
+
when '\+'; emit(:escape, :one_or_more, text)
|
296
|
+
when '\('; emit(:escape, :group_open, text)
|
297
|
+
when '\)'; emit(:escape, :group_close, text)
|
298
|
+
when '\{'; emit(:escape, :interval_open, text)
|
299
|
+
when '\}'; emit(:escape, :interval_close, text)
|
300
|
+
when '\['; emit(:escape, :set_open, text)
|
301
|
+
when '\]'; emit(:escape, :set_close, text)
|
297
302
|
when "\\\\";
|
298
|
-
emit(:escape, :backslash, text
|
303
|
+
emit(:escape, :backslash, text)
|
299
304
|
end
|
300
305
|
fret;
|
301
306
|
};
|
@@ -303,31 +308,31 @@
|
|
303
308
|
escaped_ascii > (escaped_alpha, 7) {
|
304
309
|
# \b is emitted as backspace only when inside a character set, otherwise
|
305
310
|
# it is a word boundary anchor. A syntax might "normalize" it if needed.
|
306
|
-
case text =
|
307
|
-
when '\a'; emit(:escape, :bell, text
|
308
|
-
when '\b'; emit(:escape, :backspace, text
|
309
|
-
when '\e'; emit(:escape, :escape, text
|
310
|
-
when '\f'; emit(:escape, :form_feed, text
|
311
|
-
when '\n'; emit(:escape, :newline, text
|
312
|
-
when '\r'; emit(:escape, :carriage, text
|
313
|
-
when '\t'; emit(:escape, :tab, text
|
314
|
-
when '\v'; emit(:escape, :vertical_tab, text
|
311
|
+
case text = copy(data, ts-1, te)
|
312
|
+
when '\a'; emit(:escape, :bell, text)
|
313
|
+
when '\b'; emit(:escape, :backspace, text)
|
314
|
+
when '\e'; emit(:escape, :escape, text)
|
315
|
+
when '\f'; emit(:escape, :form_feed, text)
|
316
|
+
when '\n'; emit(:escape, :newline, text)
|
317
|
+
when '\r'; emit(:escape, :carriage, text)
|
318
|
+
when '\t'; emit(:escape, :tab, text)
|
319
|
+
when '\v'; emit(:escape, :vertical_tab, text)
|
315
320
|
end
|
316
321
|
fret;
|
317
322
|
};
|
318
323
|
|
319
324
|
codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
|
320
|
-
text =
|
325
|
+
text = copy(data, ts-1, te)
|
321
326
|
if text[2].chr == '{'
|
322
|
-
emit(:escape, :codepoint_list, text
|
327
|
+
emit(:escape, :codepoint_list, text)
|
323
328
|
else
|
324
|
-
emit(:escape, :codepoint, text
|
329
|
+
emit(:escape, :codepoint, text)
|
325
330
|
end
|
326
331
|
fret;
|
327
332
|
};
|
328
333
|
|
329
|
-
hex_sequence > (escaped_alpha, 5)
|
330
|
-
emit(:escape, :hex,
|
334
|
+
hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
|
335
|
+
emit(:escape, :hex, copy(data, ts-1, te))
|
331
336
|
fret;
|
332
337
|
};
|
333
338
|
|
@@ -357,8 +362,8 @@
|
|
357
362
|
fcall unicode_property;
|
358
363
|
};
|
359
364
|
|
360
|
-
(any -- non_literal_escape) > (escaped_alpha, 1)
|
361
|
-
emit(:escape, :literal,
|
365
|
+
(any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
|
366
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
362
367
|
fret;
|
363
368
|
};
|
364
369
|
*|;
|
@@ -368,9 +373,9 @@
|
|
368
373
|
# --------------------------------------------------------------------------
|
369
374
|
conditional_expression := |*
|
370
375
|
group_lookup . ')' {
|
371
|
-
text =
|
372
|
-
emit(:conditional, :condition, text
|
373
|
-
emit(:conditional, :condition_close, ')'
|
376
|
+
text = copy(data, ts, te-1)
|
377
|
+
emit(:conditional, :condition, text)
|
378
|
+
emit(:conditional, :condition_close, ')')
|
374
379
|
};
|
375
380
|
|
376
381
|
any {
|
@@ -387,39 +392,39 @@
|
|
387
392
|
# Meta characters
|
388
393
|
# ------------------------------------------------------------------------
|
389
394
|
dot {
|
390
|
-
emit(:meta, :dot,
|
395
|
+
emit(:meta, :dot, copy(data, ts, te))
|
391
396
|
};
|
392
397
|
|
393
398
|
alternation {
|
394
399
|
if conditional_stack.last == group_depth
|
395
|
-
emit(:conditional, :separator,
|
400
|
+
emit(:conditional, :separator, copy(data, ts, te))
|
396
401
|
else
|
397
|
-
emit(:meta, :alternation,
|
402
|
+
emit(:meta, :alternation, copy(data, ts, te))
|
398
403
|
end
|
399
404
|
};
|
400
405
|
|
401
406
|
# Anchors
|
402
407
|
# ------------------------------------------------------------------------
|
403
408
|
beginning_of_line {
|
404
|
-
emit(:anchor, :bol,
|
409
|
+
emit(:anchor, :bol, copy(data, ts, te))
|
405
410
|
};
|
406
411
|
|
407
412
|
end_of_line {
|
408
|
-
emit(:anchor, :eol,
|
413
|
+
emit(:anchor, :eol, copy(data, ts, te))
|
409
414
|
};
|
410
415
|
|
411
416
|
backslash . keep_mark > (backslashed, 4) {
|
412
|
-
emit(:keep, :mark,
|
417
|
+
emit(:keep, :mark, copy(data, ts, te))
|
413
418
|
};
|
414
419
|
|
415
420
|
backslash . anchor_char > (backslashed, 3) {
|
416
|
-
case text =
|
417
|
-
when '\\A'; emit(:anchor, :bos, text
|
418
|
-
when '\\z'; emit(:anchor, :eos, text
|
419
|
-
when '\\Z'; emit(:anchor, :eos_ob_eol, text
|
420
|
-
when '\\b'; emit(:anchor, :word_boundary, text
|
421
|
-
when '\\B'; emit(:anchor, :nonword_boundary, text
|
422
|
-
when '\\G'; emit(:anchor, :match_start, text
|
421
|
+
case text = copy(data, ts, te)
|
422
|
+
when '\\A'; emit(:anchor, :bos, text)
|
423
|
+
when '\\z'; emit(:anchor, :eos, text)
|
424
|
+
when '\\Z'; emit(:anchor, :eos_ob_eol, text)
|
425
|
+
when '\\b'; emit(:anchor, :word_boundary, text)
|
426
|
+
when '\\B'; emit(:anchor, :nonword_boundary, text)
|
427
|
+
when '\\G'; emit(:anchor, :match_start, text)
|
423
428
|
end
|
424
429
|
};
|
425
430
|
|
@@ -430,7 +435,7 @@
|
|
430
435
|
# Character sets
|
431
436
|
# ------------------------------------------------------------------------
|
432
437
|
set_open >set_opened {
|
433
|
-
emit(:set, :open,
|
438
|
+
emit(:set, :open, copy(data, ts, te))
|
434
439
|
fcall character_set;
|
435
440
|
};
|
436
441
|
|
@@ -439,12 +444,12 @@
|
|
439
444
|
# (?(condition)Y|N) conditional expression
|
440
445
|
# ------------------------------------------------------------------------
|
441
446
|
conditional {
|
442
|
-
text =
|
447
|
+
text = copy(data, ts, te)
|
443
448
|
|
444
449
|
conditional_stack << group_depth
|
445
450
|
|
446
|
-
emit(:conditional, :open, text[0..-2]
|
447
|
-
emit(:conditional, :condition_open, '('
|
451
|
+
emit(:conditional, :open, text[0..-2])
|
452
|
+
emit(:conditional, :condition_open, '(')
|
448
453
|
fcall conditional_expression;
|
449
454
|
};
|
450
455
|
|
@@ -455,7 +460,7 @@
|
|
455
460
|
# correct closing count.
|
456
461
|
# ------------------------------------------------------------------------
|
457
462
|
group_open . group_comment $group_closed {
|
458
|
-
emit(:group, :comment,
|
463
|
+
emit(:group, :comment, copy(data, ts, te))
|
459
464
|
};
|
460
465
|
|
461
466
|
# Expression options:
|
@@ -470,11 +475,11 @@
|
|
470
475
|
# (?imxdau-imx:subexp) option on/off for subexp
|
471
476
|
# ------------------------------------------------------------------------
|
472
477
|
group_open . group_options >group_opened {
|
473
|
-
text =
|
478
|
+
text = copy(data, ts, te)
|
474
479
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
475
480
|
raise InvalidGroupOption.new($1 || "-#{$2}", text)
|
476
481
|
end
|
477
|
-
emit_options(text
|
482
|
+
emit_options(text)
|
478
483
|
};
|
479
484
|
|
480
485
|
# Assertions
|
@@ -484,11 +489,11 @@
|
|
484
489
|
# (?<!subexp) negative look-behind
|
485
490
|
# ------------------------------------------------------------------------
|
486
491
|
group_open . assertion_type >group_opened {
|
487
|
-
case text =
|
488
|
-
when '(?='; emit(:assertion, :lookahead, text
|
489
|
-
when '(?!'; emit(:assertion, :nlookahead, text
|
490
|
-
when '(?<='; emit(:assertion, :lookbehind, text
|
491
|
-
when '(?<!'; emit(:assertion, :nlookbehind, text
|
492
|
+
case text = copy(data, ts, te)
|
493
|
+
when '(?='; emit(:assertion, :lookahead, text)
|
494
|
+
when '(?!'; emit(:assertion, :nlookahead, text)
|
495
|
+
when '(?<='; emit(:assertion, :lookbehind, text)
|
496
|
+
when '(?<!'; emit(:assertion, :nlookbehind, text)
|
492
497
|
end
|
493
498
|
};
|
494
499
|
|
@@ -501,32 +506,32 @@
|
|
501
506
|
# (subexp) captured group
|
502
507
|
# ------------------------------------------------------------------------
|
503
508
|
group_open . group_type >group_opened {
|
504
|
-
case text =
|
505
|
-
when '(?:'; emit(:group, :passive, text
|
506
|
-
when '(?>'; emit(:group, :atomic, text
|
507
|
-
when '(?~'; emit(:group, :absence, text
|
509
|
+
case text = copy(data, ts, te)
|
510
|
+
when '(?:'; emit(:group, :passive, text)
|
511
|
+
when '(?>'; emit(:group, :atomic, text)
|
512
|
+
when '(?~'; emit(:group, :absence, text)
|
508
513
|
|
509
514
|
when /^\(\?(?:<>|'')/
|
510
515
|
validation_error(:group, 'named group', 'name is empty')
|
511
516
|
|
512
|
-
when /^\(
|
513
|
-
emit(:group, :named_ab, text
|
517
|
+
when /^\(\?<[^>]+>/
|
518
|
+
emit(:group, :named_ab, text)
|
514
519
|
|
515
|
-
when /^\(\?'
|
516
|
-
emit(:group, :named_sq, text
|
520
|
+
when /^\(\?'[^']+'/
|
521
|
+
emit(:group, :named_sq, text)
|
517
522
|
|
518
523
|
end
|
519
524
|
};
|
520
525
|
|
521
526
|
group_open @group_opened {
|
522
|
-
text =
|
523
|
-
emit(:group, :capture, text
|
527
|
+
text = copy(data, ts, te)
|
528
|
+
emit(:group, :capture, text)
|
524
529
|
};
|
525
530
|
|
526
531
|
group_close @group_closed {
|
527
532
|
if conditional_stack.last == group_depth + 1
|
528
533
|
conditional_stack.pop
|
529
|
-
emit(:conditional, :close,
|
534
|
+
emit(:conditional, :close, copy(data, ts, te))
|
530
535
|
else
|
531
536
|
if spacing_stack.length > 1 &&
|
532
537
|
spacing_stack.last[:depth] == group_depth + 1
|
@@ -534,72 +539,42 @@
|
|
534
539
|
self.free_spacing = spacing_stack.last[:free_spacing]
|
535
540
|
end
|
536
541
|
|
537
|
-
emit(:group, :close,
|
542
|
+
emit(:group, :close, copy(data, ts, te))
|
538
543
|
end
|
539
544
|
};
|
540
545
|
|
541
546
|
|
542
547
|
# Group backreference, named and numbered
|
543
548
|
# ------------------------------------------------------------------------
|
544
|
-
backslash . (
|
545
|
-
case text =
|
546
|
-
when /^\\(
|
547
|
-
validation_error(:backref, '
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
when /^\\(
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
emit(:backref, :name_call_sq, text, ts, te)
|
561
|
-
end
|
562
|
-
|
563
|
-
when /^\\([gk])<\d+>/ # angle-brackets
|
564
|
-
if $1 == 'k'
|
565
|
-
emit(:backref, :number_ref_ab, text, ts, te)
|
566
|
-
else
|
567
|
-
emit(:backref, :number_call_ab, text, ts, te)
|
568
|
-
end
|
569
|
-
|
570
|
-
when /^\\([gk])'\d+'/ # single quotes
|
571
|
-
if $1 == 'k'
|
572
|
-
emit(:backref, :number_ref_sq, text, ts, te)
|
573
|
-
else
|
574
|
-
emit(:backref, :number_call_sq, text, ts, te)
|
575
|
-
end
|
576
|
-
|
577
|
-
when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
|
578
|
-
if $1 == 'k'
|
579
|
-
emit(:backref, :number_rel_ref_ab, text, ts, te)
|
580
|
-
else
|
581
|
-
emit(:backref, :number_rel_call_ab, text, ts, te)
|
582
|
-
end
|
583
|
-
|
584
|
-
when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
|
585
|
-
if $1 == 'k'
|
586
|
-
emit(:backref, :number_rel_ref_sq, text, ts, te)
|
587
|
-
else
|
588
|
-
emit(:backref, :number_rel_call_sq, text, ts, te)
|
589
|
-
end
|
590
|
-
|
591
|
-
when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
|
592
|
-
emit(:backref, :name_recursion_ref_ab, text, ts, te)
|
593
|
-
|
594
|
-
when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
|
595
|
-
emit(:backref, :name_recursion_ref_sq, text, ts, te)
|
596
|
-
|
597
|
-
when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
|
598
|
-
emit(:backref, :number_recursion_ref_ab, text, ts, te)
|
599
|
-
|
600
|
-
when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
|
601
|
-
emit(:backref, :number_recursion_ref_sq, text, ts, te)
|
549
|
+
backslash . (group_name_backref | group_number_backref) > (backslashed, 4) {
|
550
|
+
case text = copy(data, ts, te)
|
551
|
+
when /^\\k(<>|'')/
|
552
|
+
validation_error(:backref, 'backreference', 'ref ID is empty')
|
553
|
+
when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/
|
554
|
+
emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
|
555
|
+
when /^\\k(.)\d+\D$/
|
556
|
+
emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
|
557
|
+
when /^\\k(.)-\d+\D$/
|
558
|
+
emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
|
559
|
+
when /^\\k(.)[^\p{digit}\-].*[+\-]\d+\D$/
|
560
|
+
emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
|
561
|
+
when /^\\k(.)-?\d+[+\-]\d+\D$/
|
562
|
+
emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
|
563
|
+
end
|
564
|
+
};
|
602
565
|
|
566
|
+
# Group call, named and numbered
|
567
|
+
# ------------------------------------------------------------------------
|
568
|
+
backslash . (group_name_call | group_number_call) > (backslashed, 4) {
|
569
|
+
case text = copy(data, ts, te)
|
570
|
+
when /^\\g(<>|'')/
|
571
|
+
validation_error(:backref, 'subexpression call', 'ref ID is empty')
|
572
|
+
when /^\\g(.)[^\p{digit}+\->][^+\-]*/
|
573
|
+
emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
|
574
|
+
when /^\\g(.)\d+\D$/
|
575
|
+
emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
|
576
|
+
when /^\\g(.)[+-]\d+/
|
577
|
+
emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
|
603
578
|
end
|
604
579
|
};
|
605
580
|
|
@@ -607,31 +582,31 @@
|
|
607
582
|
# Quantifiers
|
608
583
|
# ------------------------------------------------------------------------
|
609
584
|
zero_or_one {
|
610
|
-
case text =
|
611
|
-
when '?' ; emit(:quantifier, :zero_or_one, text
|
612
|
-
when '??'; emit(:quantifier, :zero_or_one_reluctant, text
|
613
|
-
when '?+'; emit(:quantifier, :zero_or_one_possessive, text
|
585
|
+
case text = copy(data, ts, te)
|
586
|
+
when '?' ; emit(:quantifier, :zero_or_one, text)
|
587
|
+
when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
|
588
|
+
when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
|
614
589
|
end
|
615
590
|
};
|
616
591
|
|
617
592
|
zero_or_more {
|
618
|
-
case text =
|
619
|
-
when '*' ; emit(:quantifier, :zero_or_more, text
|
620
|
-
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text
|
621
|
-
when '*+'; emit(:quantifier, :zero_or_more_possessive, text
|
593
|
+
case text = copy(data, ts, te)
|
594
|
+
when '*' ; emit(:quantifier, :zero_or_more, text)
|
595
|
+
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
|
596
|
+
when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
|
622
597
|
end
|
623
598
|
};
|
624
599
|
|
625
600
|
one_or_more {
|
626
|
-
case text =
|
627
|
-
when '+' ; emit(:quantifier, :one_or_more, text
|
628
|
-
when '+?'; emit(:quantifier, :one_or_more_reluctant, text
|
629
|
-
when '++'; emit(:quantifier, :one_or_more_possessive, text
|
601
|
+
case text = copy(data, ts, te)
|
602
|
+
when '+' ; emit(:quantifier, :one_or_more, text)
|
603
|
+
when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
|
604
|
+
when '++'; emit(:quantifier, :one_or_more_possessive, text)
|
630
605
|
end
|
631
606
|
};
|
632
607
|
|
633
608
|
quantifier_interval {
|
634
|
-
emit(:quantifier, :interval,
|
609
|
+
emit(:quantifier, :interval, copy(data, ts, te))
|
635
610
|
};
|
636
611
|
|
637
612
|
# Catch unmatched curly braces as literals
|
@@ -647,7 +622,7 @@
|
|
647
622
|
|
648
623
|
comment {
|
649
624
|
if free_spacing
|
650
|
-
emit(:free_space, :comment,
|
625
|
+
emit(:free_space, :comment, copy(data, ts, te))
|
651
626
|
else
|
652
627
|
# consume only the pound sign (#) and backtrack to do regular scanning
|
653
628
|
append_literal(data, ts, ts + 1)
|
@@ -657,7 +632,7 @@
|
|
657
632
|
|
658
633
|
space+ {
|
659
634
|
if free_spacing
|
660
|
-
emit(:free_space, :whitespace,
|
635
|
+
emit(:free_space, :whitespace, copy(data, ts, te))
|
661
636
|
else
|
662
637
|
append_literal(data, ts, te)
|
663
638
|
end
|
@@ -666,11 +641,7 @@
|
|
666
641
|
# Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
|
667
642
|
# except meta characters.
|
668
643
|
# ------------------------------------------------------------------------
|
669
|
-
(ascii_print -- space)+
|
670
|
-
ascii_nonprint+ |
|
671
|
-
utf8_2_byte+ |
|
672
|
-
utf8_3_byte+ |
|
673
|
-
utf8_4_byte+ {
|
644
|
+
(ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
|
674
645
|
append_literal(data, ts, te)
|
675
646
|
};
|
676
647
|
|
@@ -680,12 +651,14 @@
|
|
680
651
|
# THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY
|
681
652
|
# This file was generated from lib/regexp_parser/scanner/scanner.rl
|
682
653
|
|
654
|
+
require 'regexp_parser/error'
|
655
|
+
|
683
656
|
class Regexp::Scanner
|
684
657
|
# General scanner error (catch all)
|
685
|
-
class ScannerError <
|
658
|
+
class ScannerError < Regexp::Parser::Error; end
|
686
659
|
|
687
660
|
# Base for all scanner validation errors
|
688
|
-
class ValidationError <
|
661
|
+
class ValidationError < Regexp::Parser::Error
|
689
662
|
def initialize(reason)
|
690
663
|
super reason
|
691
664
|
end
|
@@ -760,6 +733,7 @@ class Regexp::Scanner
|
|
760
733
|
self.set_depth = 0
|
761
734
|
self.group_depth = 0
|
762
735
|
self.conditional_stack = []
|
736
|
+
self.char_pos = 0
|
763
737
|
|
764
738
|
%% write data;
|
765
739
|
%% write init;
|
@@ -769,7 +743,7 @@ class Regexp::Scanner
|
|
769
743
|
testEof = testEof
|
770
744
|
|
771
745
|
if cs == re_scanner_error
|
772
|
-
text =
|
746
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
773
747
|
raise ScannerError.new("Scan error at '#{text}'")
|
774
748
|
end
|
775
749
|
|
@@ -786,33 +760,39 @@ class Regexp::Scanner
|
|
786
760
|
|
787
761
|
# lazy-load property maps when first needed
|
788
762
|
require 'yaml'
|
789
|
-
PROP_MAPS_DIR = File.expand_path('../scanner/properties', __FILE__)
|
790
763
|
|
791
764
|
def self.short_prop_map
|
792
|
-
@short_prop_map ||= YAML.load_file("#{
|
765
|
+
@short_prop_map ||= YAML.load_file("#{__dir__}/scanner/properties/short.yml")
|
793
766
|
end
|
794
767
|
|
795
768
|
def self.long_prop_map
|
796
|
-
@long_prop_map ||= YAML.load_file("#{
|
769
|
+
@long_prop_map ||= YAML.load_file("#{__dir__}/scanner/properties/long.yml")
|
797
770
|
end
|
798
771
|
|
799
772
|
# Emits an array with the details of the scanned pattern
|
800
|
-
def emit(type, token, text
|
773
|
+
def emit(type, token, text)
|
801
774
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
802
775
|
|
803
776
|
emit_literal if literal
|
804
777
|
|
778
|
+
# Ragel runs with byte-based indices (ts, te). These are of little value to
|
779
|
+
# end-users, so we keep track of char-based indices and emit those instead.
|
780
|
+
ts_char_pos = char_pos
|
781
|
+
te_char_pos = char_pos + text.length
|
782
|
+
|
805
783
|
if block
|
806
|
-
block.call type, token, text,
|
784
|
+
block.call type, token, text, ts_char_pos, te_char_pos
|
807
785
|
end
|
808
786
|
|
809
|
-
tokens << [type, token, text,
|
787
|
+
tokens << [type, token, text, ts_char_pos, te_char_pos]
|
788
|
+
|
789
|
+
self.char_pos = te_char_pos
|
810
790
|
end
|
811
791
|
|
812
792
|
private
|
813
793
|
|
814
794
|
attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
|
815
|
-
:group_depth, :set_depth, :conditional_stack
|
795
|
+
:group_depth, :set_depth, :conditional_stack, :char_pos
|
816
796
|
|
817
797
|
def free_spacing?(input_object, options)
|
818
798
|
if options && !input_object.is_a?(String)
|
@@ -835,36 +815,25 @@ class Regexp::Scanner
|
|
835
815
|
end
|
836
816
|
|
837
817
|
# Copy from ts to te from data as text
|
838
|
-
def copy(data,
|
839
|
-
data[
|
840
|
-
end
|
841
|
-
|
842
|
-
# Copy from ts to te from data as text, returning an array with the text
|
843
|
-
# and the offsets used to copy it.
|
844
|
-
def text(data, ts, te, soff = 0)
|
845
|
-
[copy(data, ts-soff..te-1), ts-soff, te]
|
818
|
+
def copy(data, ts, te)
|
819
|
+
data[ts...te].pack('c*').force_encoding('utf-8')
|
846
820
|
end
|
847
821
|
|
848
822
|
# Appends one or more characters to the literal buffer, to be emitted later
|
849
|
-
# by a call to emit_literal.
|
823
|
+
# by a call to emit_literal.
|
850
824
|
def append_literal(data, ts, te)
|
851
825
|
self.literal = literal || []
|
852
|
-
literal <<
|
826
|
+
literal << copy(data, ts, te)
|
853
827
|
end
|
854
828
|
|
855
|
-
# Emits the literal run collected by calls to the append_literal method
|
856
|
-
# using the total start (ts) and end (te) offsets of the run.
|
829
|
+
# Emits the literal run collected by calls to the append_literal method.
|
857
830
|
def emit_literal
|
858
|
-
|
859
|
-
text = literal.map {|t| t[0]}.join
|
860
|
-
|
861
|
-
text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
|
862
|
-
|
831
|
+
text = literal.join
|
863
832
|
self.literal = nil
|
864
|
-
emit(:literal, :literal, text
|
833
|
+
emit(:literal, :literal, text)
|
865
834
|
end
|
866
835
|
|
867
|
-
def emit_options(text
|
836
|
+
def emit_options(text)
|
868
837
|
token = nil
|
869
838
|
|
870
839
|
# Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
|
@@ -890,14 +859,14 @@ class Regexp::Scanner
|
|
890
859
|
token = :options_switch
|
891
860
|
end
|
892
861
|
|
893
|
-
emit(:group, token, text
|
862
|
+
emit(:group, token, text)
|
894
863
|
end
|
895
864
|
|
896
865
|
def emit_meta_control_sequence(data, ts, te, token)
|
897
866
|
if data.last < 0x00 || data.last > 0x7F
|
898
867
|
validation_error(:sequence, 'escape', token.to_s)
|
899
868
|
end
|
900
|
-
emit(:escape, token,
|
869
|
+
emit(:escape, token, copy(data, ts-1, te))
|
901
870
|
end
|
902
871
|
|
903
872
|
# Centralizes and unifies the handling of validation related
|