regexp_parser 1.7.1 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +83 -0
- data/README.md +23 -11
- data/lib/regexp_parser/expression.rb +10 -19
- data/lib/regexp_parser/expression/classes/group.rb +17 -2
- data/lib/regexp_parser/expression/classes/root.rb +4 -16
- data/lib/regexp_parser/expression/quantifier.rb +9 -0
- data/lib/regexp_parser/expression/sequence.rb +0 -10
- data/lib/regexp_parser/lexer.rb +6 -6
- data/lib/regexp_parser/parser.rb +45 -12
- data/lib/regexp_parser/scanner.rb +1264 -1280
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +195 -194
- data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
- data/lib/regexp_parser/version.rb +1 -1
- data/regexp_parser.gemspec +1 -1
- data/spec/expression/base_spec.rb +10 -0
- data/spec/expression/to_s_spec.rb +16 -0
- data/spec/lexer/literals_spec.rb +24 -49
- data/spec/parser/escapes_spec.rb +1 -1
- data/spec/parser/options_spec.rb +28 -0
- data/spec/parser/quantifiers_spec.rb +15 -0
- data/spec/parser/set/ranges_spec.rb +3 -3
- data/spec/scanner/escapes_spec.rb +11 -0
- data/spec/scanner/free_space_spec.rb +32 -0
- data/spec/scanner/groups_spec.rb +10 -1
- data/spec/scanner/literals_spec.rb +28 -38
- data/spec/scanner/options_spec.rb +36 -0
- data/spec/scanner/quantifiers_spec.rb +18 -13
- data/spec/scanner/sets_spec.rb +8 -2
- metadata +60 -60
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
@@ -10,17 +10,17 @@
|
|
10
10
|
# --------------------------------------------------------------------------
|
11
11
|
char_type := |*
|
12
12
|
char_type_char {
|
13
|
-
case text =
|
14
|
-
when '\d'; emit(:type, :digit, text
|
15
|
-
when '\D'; emit(:type, :nondigit, text
|
16
|
-
when '\h'; emit(:type, :hex, text
|
17
|
-
when '\H'; emit(:type, :nonhex, text
|
18
|
-
when '\s'; emit(:type, :space, text
|
19
|
-
when '\S'; emit(:type, :nonspace, text
|
20
|
-
when '\w'; emit(:type, :word, text
|
21
|
-
when '\W'; emit(:type, :nonword, text
|
22
|
-
when '\R'; emit(:type, :linebreak, text
|
23
|
-
when '\X'; emit(:type, :xgrapheme, text
|
13
|
+
case text = copy(data, ts-1, te)
|
14
|
+
when '\d'; emit(:type, :digit, text)
|
15
|
+
when '\D'; emit(:type, :nondigit, text)
|
16
|
+
when '\h'; emit(:type, :hex, text)
|
17
|
+
when '\H'; emit(:type, :nonhex, text)
|
18
|
+
when '\s'; emit(:type, :space, text)
|
19
|
+
when '\S'; emit(:type, :nonspace, text)
|
20
|
+
when '\w'; emit(:type, :word, text)
|
21
|
+
when '\W'; emit(:type, :nonword, text)
|
22
|
+
when '\R'; emit(:type, :linebreak, text)
|
23
|
+
when '\X'; emit(:type, :xgrapheme, text)
|
24
24
|
end
|
25
25
|
fret;
|
26
26
|
};
|
@@ -14,7 +14,7 @@
|
|
14
14
|
unicode_property := |*
|
15
15
|
|
16
16
|
property_sequence < eof(premature_property_end) {
|
17
|
-
text =
|
17
|
+
text = copy(data, ts-1, te)
|
18
18
|
type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
|
19
19
|
|
20
20
|
name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
|
@@ -22,7 +22,7 @@
|
|
22
22
|
token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
|
23
23
|
raise UnknownUnicodePropertyError.new(name) unless token
|
24
24
|
|
25
|
-
self.emit(type, token.to_sym, text
|
25
|
+
self.emit(type, token.to_sym, text)
|
26
26
|
|
27
27
|
fret;
|
28
28
|
};
|
@@ -3,6 +3,11 @@
|
|
3
3
|
include re_char_type "char_type.rl";
|
4
4
|
include re_property "property.rl";
|
5
5
|
|
6
|
+
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
7
|
+
utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
|
8
|
+
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
9
|
+
utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
|
10
|
+
|
6
11
|
dot = '.';
|
7
12
|
backslash = '\\';
|
8
13
|
alternation = '|';
|
@@ -21,7 +26,7 @@
|
|
21
26
|
set_close = ']';
|
22
27
|
brackets = set_open | set_close;
|
23
28
|
|
24
|
-
comment = ('#' . [^\n]* . '\n');
|
29
|
+
comment = ('#' . [^\n]* . '\n'?);
|
25
30
|
|
26
31
|
class_name_posix = 'alnum' | 'alpha' | 'blank' |
|
27
32
|
'cntrl' | 'digit' | 'graph' |
|
@@ -90,18 +95,19 @@
|
|
90
95
|
group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
|
91
96
|
|
92
97
|
group_ref = [gk];
|
93
|
-
|
94
|
-
|
95
|
-
group_number = '-'? . [1-9] .
|
98
|
+
group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
|
99
|
+
group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
|
100
|
+
group_number = '-'? . [1-9] . [0-9]*;
|
96
101
|
group_level = [+\-] . [0-9]+;
|
97
102
|
|
98
|
-
group_name = ('<' .
|
103
|
+
group_name = ('<' . group_name_id_ab? . '>') |
|
104
|
+
("'" . group_name_id_sq? . "'");
|
99
105
|
group_lookup = group_name | group_number;
|
100
106
|
|
101
107
|
group_named = ('?' . group_name );
|
102
108
|
|
103
|
-
group_name_ref = group_ref . (('<' .
|
104
|
-
("'" .
|
109
|
+
group_name_ref = group_ref . (('<' . group_name_id_ab? . group_level? '>') |
|
110
|
+
("'" . group_name_id_sq? . group_level? "'"));
|
105
111
|
|
106
112
|
group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
|
107
113
|
("'" . group_number . group_level? "'"));
|
@@ -120,28 +126,24 @@
|
|
120
126
|
|
121
127
|
literal_delimiters = ']' | '}';
|
122
128
|
|
123
|
-
ascii_print = ((0x20..0x7e) - meta_char);
|
129
|
+
ascii_print = ((0x20..0x7e) - meta_char - '#');
|
124
130
|
ascii_nonprint = (0x01..0x1f | 0x7f);
|
125
131
|
|
126
|
-
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
127
|
-
utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
|
128
|
-
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
129
|
-
|
130
132
|
non_literal_escape = char_type_char | anchor_char | escaped_ascii |
|
131
|
-
|
133
|
+
keep_mark | [xucCM];
|
132
134
|
|
133
135
|
non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
|
134
136
|
multi_codepoint_char_type | [0-9cCM];
|
135
137
|
|
136
138
|
# EOF error, used where it can be detected
|
137
139
|
action premature_end_error {
|
138
|
-
text =
|
140
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
139
141
|
raise PrematureEndError.new( text )
|
140
142
|
}
|
141
143
|
|
142
144
|
# Invalid sequence error, used from sequences, like escapes and sets
|
143
145
|
action invalid_sequence_error {
|
144
|
-
text =
|
146
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
145
147
|
validation_error(:sequence, 'sequence', text)
|
146
148
|
}
|
147
149
|
|
@@ -156,7 +158,7 @@
|
|
156
158
|
# --------------------------------------------------------------------------
|
157
159
|
character_set := |*
|
158
160
|
set_close > (set_meta, 2) @set_closed {
|
159
|
-
emit(:set, :close,
|
161
|
+
emit(:set, :close, copy(data, ts, te))
|
160
162
|
if in_set?
|
161
163
|
fret;
|
162
164
|
else
|
@@ -165,8 +167,8 @@
|
|
165
167
|
};
|
166
168
|
|
167
169
|
'-]' @set_closed { # special case, emits two tokens
|
168
|
-
emit(:literal, :literal, copy(data, ts
|
169
|
-
emit(:set, :close, copy(data, ts+1
|
170
|
+
emit(:literal, :literal, copy(data, ts, te-1))
|
171
|
+
emit(:set, :close, copy(data, ts+1, te))
|
170
172
|
if in_set?
|
171
173
|
fret;
|
172
174
|
else
|
@@ -175,33 +177,33 @@
|
|
175
177
|
};
|
176
178
|
|
177
179
|
'-&&' { # special case, emits two tokens
|
178
|
-
emit(:literal, :literal, '-'
|
179
|
-
emit(:set, :intersection, '&&'
|
180
|
+
emit(:literal, :literal, '-')
|
181
|
+
emit(:set, :intersection, '&&')
|
180
182
|
};
|
181
183
|
|
182
184
|
'^' {
|
183
|
-
text =
|
185
|
+
text = copy(data, ts, te)
|
184
186
|
if tokens.last[1] == :open
|
185
|
-
emit(:set, :negate, text
|
187
|
+
emit(:set, :negate, text)
|
186
188
|
else
|
187
|
-
emit(:literal, :literal, text
|
189
|
+
emit(:literal, :literal, text)
|
188
190
|
end
|
189
191
|
};
|
190
192
|
|
191
193
|
'-' {
|
192
|
-
text =
|
194
|
+
text = copy(data, ts, te)
|
193
195
|
# ranges cant start with a subset or intersection/negation/range operator
|
194
196
|
if tokens.last[0] == :set
|
195
|
-
emit(:literal, :literal, text
|
197
|
+
emit(:literal, :literal, text)
|
196
198
|
else
|
197
|
-
emit(:set, :range, text
|
199
|
+
emit(:set, :range, text)
|
198
200
|
end
|
199
201
|
};
|
200
202
|
|
201
203
|
# Unlike ranges, intersections can start or end at set boundaries, whereupon
|
202
204
|
# they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
|
203
205
|
'&&' {
|
204
|
-
emit(:set, :intersection,
|
206
|
+
emit(:set, :intersection, copy(data, ts, te))
|
205
207
|
};
|
206
208
|
|
207
209
|
backslash {
|
@@ -209,12 +211,12 @@
|
|
209
211
|
};
|
210
212
|
|
211
213
|
set_open >(open_bracket, 1) >set_opened {
|
212
|
-
emit(:set, :open,
|
214
|
+
emit(:set, :open, copy(data, ts, te))
|
213
215
|
fcall character_set;
|
214
216
|
};
|
215
217
|
|
216
218
|
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
217
|
-
text =
|
219
|
+
text = copy(data, ts, te)
|
218
220
|
|
219
221
|
type = :posixclass
|
220
222
|
class_name = text[2..-3]
|
@@ -223,29 +225,24 @@
|
|
223
225
|
type = :nonposixclass
|
224
226
|
end
|
225
227
|
|
226
|
-
emit(type, class_name.to_sym, text
|
228
|
+
emit(type, class_name.to_sym, text)
|
227
229
|
};
|
228
230
|
|
229
231
|
collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
230
|
-
emit(:set, :collation,
|
232
|
+
emit(:set, :collation, copy(data, ts, te))
|
231
233
|
};
|
232
234
|
|
233
235
|
character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
234
|
-
emit(:set, :equivalent,
|
236
|
+
emit(:set, :equivalent, copy(data, ts, te))
|
235
237
|
};
|
236
238
|
|
237
239
|
meta_char > (set_meta, 1) {
|
238
|
-
emit(:literal, :literal,
|
240
|
+
emit(:literal, :literal, copy(data, ts, te))
|
239
241
|
};
|
240
242
|
|
241
|
-
any
|
242
|
-
|
243
|
-
|
244
|
-
utf8_3_byte |
|
245
|
-
utf8_4_byte {
|
246
|
-
char, *rest = *text(data, ts, te)
|
247
|
-
char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
|
248
|
-
emit(:literal, :literal, char, *rest)
|
243
|
+
any | ascii_nonprint | utf8_multibyte {
|
244
|
+
text = copy(data, ts, te)
|
245
|
+
emit(:literal, :literal, text)
|
249
246
|
};
|
250
247
|
*|;
|
251
248
|
|
@@ -253,7 +250,7 @@
|
|
253
250
|
# --------------------------------------------------------------------------
|
254
251
|
set_escape_sequence := |*
|
255
252
|
non_set_escape > (escaped_set_alpha, 2) {
|
256
|
-
emit(:escape, :literal,
|
253
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
257
254
|
fret;
|
258
255
|
};
|
259
256
|
|
@@ -269,33 +266,33 @@
|
|
269
266
|
# --------------------------------------------------------------------------
|
270
267
|
escape_sequence := |*
|
271
268
|
[1-9] {
|
272
|
-
text =
|
273
|
-
emit(:backref, :number, text
|
269
|
+
text = copy(data, ts-1, te)
|
270
|
+
emit(:backref, :number, text)
|
274
271
|
fret;
|
275
272
|
};
|
276
273
|
|
277
274
|
octal_sequence {
|
278
|
-
emit(:escape, :octal,
|
275
|
+
emit(:escape, :octal, copy(data, ts-1, te))
|
279
276
|
fret;
|
280
277
|
};
|
281
278
|
|
282
279
|
meta_char {
|
283
|
-
case text =
|
284
|
-
when '\.'; emit(:escape, :dot, text
|
285
|
-
when '\|'; emit(:escape, :alternation, text
|
286
|
-
when '\^'; emit(:escape, :bol, text
|
287
|
-
when '\$'; emit(:escape, :eol, text
|
288
|
-
when '\?'; emit(:escape, :zero_or_one, text
|
289
|
-
when '\*'; emit(:escape, :zero_or_more, text
|
290
|
-
when '\+'; emit(:escape, :one_or_more, text
|
291
|
-
when '\('; emit(:escape, :group_open, text
|
292
|
-
when '\)'; emit(:escape, :group_close, text
|
293
|
-
when '\{'; emit(:escape, :interval_open, text
|
294
|
-
when '\}'; emit(:escape, :interval_close, text
|
295
|
-
when '\['; emit(:escape, :set_open, text
|
296
|
-
when '\]'; emit(:escape, :set_close, text
|
280
|
+
case text = copy(data, ts-1, te)
|
281
|
+
when '\.'; emit(:escape, :dot, text)
|
282
|
+
when '\|'; emit(:escape, :alternation, text)
|
283
|
+
when '\^'; emit(:escape, :bol, text)
|
284
|
+
when '\$'; emit(:escape, :eol, text)
|
285
|
+
when '\?'; emit(:escape, :zero_or_one, text)
|
286
|
+
when '\*'; emit(:escape, :zero_or_more, text)
|
287
|
+
when '\+'; emit(:escape, :one_or_more, text)
|
288
|
+
when '\('; emit(:escape, :group_open, text)
|
289
|
+
when '\)'; emit(:escape, :group_close, text)
|
290
|
+
when '\{'; emit(:escape, :interval_open, text)
|
291
|
+
when '\}'; emit(:escape, :interval_close, text)
|
292
|
+
when '\['; emit(:escape, :set_open, text)
|
293
|
+
when '\]'; emit(:escape, :set_close, text)
|
297
294
|
when "\\\\";
|
298
|
-
emit(:escape, :backslash, text
|
295
|
+
emit(:escape, :backslash, text)
|
299
296
|
end
|
300
297
|
fret;
|
301
298
|
};
|
@@ -303,31 +300,31 @@
|
|
303
300
|
escaped_ascii > (escaped_alpha, 7) {
|
304
301
|
# \b is emitted as backspace only when inside a character set, otherwise
|
305
302
|
# it is a word boundary anchor. A syntax might "normalize" it if needed.
|
306
|
-
case text =
|
307
|
-
when '\a'; emit(:escape, :bell, text
|
308
|
-
when '\b'; emit(:escape, :backspace, text
|
309
|
-
when '\e'; emit(:escape, :escape, text
|
310
|
-
when '\f'; emit(:escape, :form_feed, text
|
311
|
-
when '\n'; emit(:escape, :newline, text
|
312
|
-
when '\r'; emit(:escape, :carriage, text
|
313
|
-
when '\t'; emit(:escape, :tab, text
|
314
|
-
when '\v'; emit(:escape, :vertical_tab, text
|
303
|
+
case text = copy(data, ts-1, te)
|
304
|
+
when '\a'; emit(:escape, :bell, text)
|
305
|
+
when '\b'; emit(:escape, :backspace, text)
|
306
|
+
when '\e'; emit(:escape, :escape, text)
|
307
|
+
when '\f'; emit(:escape, :form_feed, text)
|
308
|
+
when '\n'; emit(:escape, :newline, text)
|
309
|
+
when '\r'; emit(:escape, :carriage, text)
|
310
|
+
when '\t'; emit(:escape, :tab, text)
|
311
|
+
when '\v'; emit(:escape, :vertical_tab, text)
|
315
312
|
end
|
316
313
|
fret;
|
317
314
|
};
|
318
315
|
|
319
316
|
codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
|
320
|
-
text =
|
317
|
+
text = copy(data, ts-1, te)
|
321
318
|
if text[2].chr == '{'
|
322
|
-
emit(:escape, :codepoint_list, text
|
319
|
+
emit(:escape, :codepoint_list, text)
|
323
320
|
else
|
324
|
-
emit(:escape, :codepoint, text
|
321
|
+
emit(:escape, :codepoint, text)
|
325
322
|
end
|
326
323
|
fret;
|
327
324
|
};
|
328
325
|
|
329
|
-
hex_sequence > (escaped_alpha, 5)
|
330
|
-
emit(:escape, :hex,
|
326
|
+
hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
|
327
|
+
emit(:escape, :hex, copy(data, ts-1, te))
|
331
328
|
fret;
|
332
329
|
};
|
333
330
|
|
@@ -357,8 +354,8 @@
|
|
357
354
|
fcall unicode_property;
|
358
355
|
};
|
359
356
|
|
360
|
-
(any -- non_literal_escape) > (escaped_alpha, 1)
|
361
|
-
emit(:escape, :literal,
|
357
|
+
(any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
|
358
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
362
359
|
fret;
|
363
360
|
};
|
364
361
|
*|;
|
@@ -368,9 +365,9 @@
|
|
368
365
|
# --------------------------------------------------------------------------
|
369
366
|
conditional_expression := |*
|
370
367
|
group_lookup . ')' {
|
371
|
-
text =
|
372
|
-
emit(:conditional, :condition, text
|
373
|
-
emit(:conditional, :condition_close, ')'
|
368
|
+
text = copy(data, ts, te-1)
|
369
|
+
emit(:conditional, :condition, text)
|
370
|
+
emit(:conditional, :condition_close, ')')
|
374
371
|
};
|
375
372
|
|
376
373
|
any {
|
@@ -387,39 +384,39 @@
|
|
387
384
|
# Meta characters
|
388
385
|
# ------------------------------------------------------------------------
|
389
386
|
dot {
|
390
|
-
emit(:meta, :dot,
|
387
|
+
emit(:meta, :dot, copy(data, ts, te))
|
391
388
|
};
|
392
389
|
|
393
390
|
alternation {
|
394
391
|
if conditional_stack.last == group_depth
|
395
|
-
emit(:conditional, :separator,
|
392
|
+
emit(:conditional, :separator, copy(data, ts, te))
|
396
393
|
else
|
397
|
-
emit(:meta, :alternation,
|
394
|
+
emit(:meta, :alternation, copy(data, ts, te))
|
398
395
|
end
|
399
396
|
};
|
400
397
|
|
401
398
|
# Anchors
|
402
399
|
# ------------------------------------------------------------------------
|
403
400
|
beginning_of_line {
|
404
|
-
emit(:anchor, :bol,
|
401
|
+
emit(:anchor, :bol, copy(data, ts, te))
|
405
402
|
};
|
406
403
|
|
407
404
|
end_of_line {
|
408
|
-
emit(:anchor, :eol,
|
405
|
+
emit(:anchor, :eol, copy(data, ts, te))
|
409
406
|
};
|
410
407
|
|
411
408
|
backslash . keep_mark > (backslashed, 4) {
|
412
|
-
emit(:keep, :mark,
|
409
|
+
emit(:keep, :mark, copy(data, ts, te))
|
413
410
|
};
|
414
411
|
|
415
412
|
backslash . anchor_char > (backslashed, 3) {
|
416
|
-
case text =
|
417
|
-
when '\\A'; emit(:anchor, :bos, text
|
418
|
-
when '\\z'; emit(:anchor, :eos, text
|
419
|
-
when '\\Z'; emit(:anchor, :eos_ob_eol, text
|
420
|
-
when '\\b'; emit(:anchor, :word_boundary, text
|
421
|
-
when '\\B'; emit(:anchor, :nonword_boundary, text
|
422
|
-
when '\\G'; emit(:anchor, :match_start, text
|
413
|
+
case text = copy(data, ts, te)
|
414
|
+
when '\\A'; emit(:anchor, :bos, text)
|
415
|
+
when '\\z'; emit(:anchor, :eos, text)
|
416
|
+
when '\\Z'; emit(:anchor, :eos_ob_eol, text)
|
417
|
+
when '\\b'; emit(:anchor, :word_boundary, text)
|
418
|
+
when '\\B'; emit(:anchor, :nonword_boundary, text)
|
419
|
+
when '\\G'; emit(:anchor, :match_start, text)
|
423
420
|
end
|
424
421
|
};
|
425
422
|
|
@@ -430,7 +427,7 @@
|
|
430
427
|
# Character sets
|
431
428
|
# ------------------------------------------------------------------------
|
432
429
|
set_open >set_opened {
|
433
|
-
emit(:set, :open,
|
430
|
+
emit(:set, :open, copy(data, ts, te))
|
434
431
|
fcall character_set;
|
435
432
|
};
|
436
433
|
|
@@ -439,12 +436,12 @@
|
|
439
436
|
# (?(condition)Y|N) conditional expression
|
440
437
|
# ------------------------------------------------------------------------
|
441
438
|
conditional {
|
442
|
-
text =
|
439
|
+
text = copy(data, ts, te)
|
443
440
|
|
444
441
|
conditional_stack << group_depth
|
445
442
|
|
446
|
-
emit(:conditional, :open, text[0..-2]
|
447
|
-
emit(:conditional, :condition_open, '('
|
443
|
+
emit(:conditional, :open, text[0..-2])
|
444
|
+
emit(:conditional, :condition_open, '(')
|
448
445
|
fcall conditional_expression;
|
449
446
|
};
|
450
447
|
|
@@ -455,7 +452,7 @@
|
|
455
452
|
# correct closing count.
|
456
453
|
# ------------------------------------------------------------------------
|
457
454
|
group_open . group_comment $group_closed {
|
458
|
-
emit(:group, :comment,
|
455
|
+
emit(:group, :comment, copy(data, ts, te))
|
459
456
|
};
|
460
457
|
|
461
458
|
# Expression options:
|
@@ -470,11 +467,11 @@
|
|
470
467
|
# (?imxdau-imx:subexp) option on/off for subexp
|
471
468
|
# ------------------------------------------------------------------------
|
472
469
|
group_open . group_options >group_opened {
|
473
|
-
text =
|
470
|
+
text = copy(data, ts, te)
|
474
471
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
475
472
|
raise InvalidGroupOption.new($1 || "-#{$2}", text)
|
476
473
|
end
|
477
|
-
emit_options(text
|
474
|
+
emit_options(text)
|
478
475
|
};
|
479
476
|
|
480
477
|
# Assertions
|
@@ -484,11 +481,11 @@
|
|
484
481
|
# (?<!subexp) negative look-behind
|
485
482
|
# ------------------------------------------------------------------------
|
486
483
|
group_open . assertion_type >group_opened {
|
487
|
-
case text =
|
488
|
-
when '(?='; emit(:assertion, :lookahead, text
|
489
|
-
when '(?!'; emit(:assertion, :nlookahead, text
|
490
|
-
when '(?<='; emit(:assertion, :lookbehind, text
|
491
|
-
when '(?<!'; emit(:assertion, :nlookbehind, text
|
484
|
+
case text = copy(data, ts, te)
|
485
|
+
when '(?='; emit(:assertion, :lookahead, text)
|
486
|
+
when '(?!'; emit(:assertion, :nlookahead, text)
|
487
|
+
when '(?<='; emit(:assertion, :lookbehind, text)
|
488
|
+
when '(?<!'; emit(:assertion, :nlookbehind, text)
|
492
489
|
end
|
493
490
|
};
|
494
491
|
|
@@ -501,32 +498,32 @@
|
|
501
498
|
# (subexp) captured group
|
502
499
|
# ------------------------------------------------------------------------
|
503
500
|
group_open . group_type >group_opened {
|
504
|
-
case text =
|
505
|
-
when '(?:'; emit(:group, :passive, text
|
506
|
-
when '(?>'; emit(:group, :atomic, text
|
507
|
-
when '(?~'; emit(:group, :absence, text
|
501
|
+
case text = copy(data, ts, te)
|
502
|
+
when '(?:'; emit(:group, :passive, text)
|
503
|
+
when '(?>'; emit(:group, :atomic, text)
|
504
|
+
when '(?~'; emit(:group, :absence, text)
|
508
505
|
|
509
506
|
when /^\(\?(?:<>|'')/
|
510
507
|
validation_error(:group, 'named group', 'name is empty')
|
511
508
|
|
512
|
-
when /^\(
|
513
|
-
emit(:group, :named_ab, text
|
509
|
+
when /^\(\?<[^>]+>/
|
510
|
+
emit(:group, :named_ab, text)
|
514
511
|
|
515
|
-
when /^\(\?'
|
516
|
-
emit(:group, :named_sq, text
|
512
|
+
when /^\(\?'[^']+'/
|
513
|
+
emit(:group, :named_sq, text)
|
517
514
|
|
518
515
|
end
|
519
516
|
};
|
520
517
|
|
521
518
|
group_open @group_opened {
|
522
|
-
text =
|
523
|
-
emit(:group, :capture, text
|
519
|
+
text = copy(data, ts, te)
|
520
|
+
emit(:group, :capture, text)
|
524
521
|
};
|
525
522
|
|
526
523
|
group_close @group_closed {
|
527
524
|
if conditional_stack.last == group_depth + 1
|
528
525
|
conditional_stack.pop
|
529
|
-
emit(:conditional, :close,
|
526
|
+
emit(:conditional, :close, copy(data, ts, te))
|
530
527
|
else
|
531
528
|
if spacing_stack.length > 1 &&
|
532
529
|
spacing_stack.last[:depth] == group_depth + 1
|
@@ -534,7 +531,7 @@
|
|
534
531
|
self.free_spacing = spacing_stack.last[:free_spacing]
|
535
532
|
end
|
536
533
|
|
537
|
-
emit(:group, :close,
|
534
|
+
emit(:group, :close, copy(data, ts, te))
|
538
535
|
end
|
539
536
|
};
|
540
537
|
|
@@ -542,63 +539,65 @@
|
|
542
539
|
# Group backreference, named and numbered
|
543
540
|
# ------------------------------------------------------------------------
|
544
541
|
backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
|
545
|
-
case text =
|
542
|
+
case text = copy(data, ts, te)
|
546
543
|
when /^\\([gk])(<>|'')/ # angle brackets
|
547
544
|
validation_error(:backref, 'ref/call', 'ref ID is empty')
|
548
545
|
|
549
|
-
|
546
|
+
# TODO: finer quirks of choosing recursive or non-recursive refs/calls.
|
547
|
+
# e.g.: `a-1` is a valid group id: 'aa'[/(?<a-1>a)\g<a-1>/] # => 'aa'
|
548
|
+
when /^\\([gk])<[^\p{digit}+\->][^>+\-]*>/ # angle-brackets
|
550
549
|
if $1 == 'k'
|
551
|
-
emit(:backref, :name_ref_ab, text
|
550
|
+
emit(:backref, :name_ref_ab, text)
|
552
551
|
else
|
553
|
-
emit(:backref, :name_call_ab, text
|
552
|
+
emit(:backref, :name_call_ab, text)
|
554
553
|
end
|
555
554
|
|
556
|
-
when /^\\([gk])'[^\
|
555
|
+
when /^\\([gk])'[^\p{digit}+\-'][^'+\-]*'/ # single quotes
|
557
556
|
if $1 == 'k'
|
558
|
-
emit(:backref, :name_ref_sq, text
|
557
|
+
emit(:backref, :name_ref_sq, text)
|
559
558
|
else
|
560
|
-
emit(:backref, :name_call_sq, text
|
559
|
+
emit(:backref, :name_call_sq, text)
|
561
560
|
end
|
562
561
|
|
563
562
|
when /^\\([gk])<\d+>/ # angle-brackets
|
564
563
|
if $1 == 'k'
|
565
|
-
emit(:backref, :number_ref_ab, text
|
564
|
+
emit(:backref, :number_ref_ab, text)
|
566
565
|
else
|
567
|
-
emit(:backref, :number_call_ab, text
|
566
|
+
emit(:backref, :number_call_ab, text)
|
568
567
|
end
|
569
568
|
|
570
569
|
when /^\\([gk])'\d+'/ # single quotes
|
571
570
|
if $1 == 'k'
|
572
|
-
emit(:backref, :number_ref_sq, text
|
571
|
+
emit(:backref, :number_ref_sq, text)
|
573
572
|
else
|
574
|
-
emit(:backref, :number_call_sq, text
|
573
|
+
emit(:backref, :number_call_sq, text)
|
575
574
|
end
|
576
575
|
|
577
576
|
when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
|
578
577
|
if $1 == 'k'
|
579
|
-
emit(:backref, :number_rel_ref_ab, text
|
578
|
+
emit(:backref, :number_rel_ref_ab, text)
|
580
579
|
else
|
581
|
-
emit(:backref, :number_rel_call_ab, text
|
580
|
+
emit(:backref, :number_rel_call_ab, text)
|
582
581
|
end
|
583
582
|
|
584
583
|
when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
|
585
584
|
if $1 == 'k'
|
586
|
-
emit(:backref, :number_rel_ref_sq, text
|
585
|
+
emit(:backref, :number_rel_ref_sq, text)
|
587
586
|
else
|
588
|
-
emit(:backref, :number_rel_call_sq, text
|
587
|
+
emit(:backref, :number_rel_call_sq, text)
|
589
588
|
end
|
590
589
|
|
591
|
-
when /^\\k<[^\
|
592
|
-
emit(:backref, :name_recursion_ref_ab, text
|
590
|
+
when /^\\k<[^\p{digit}+\->][^>]*[+\-]\d+>/ # angle-brackets
|
591
|
+
emit(:backref, :name_recursion_ref_ab, text)
|
593
592
|
|
594
|
-
when /^\\k'[^\
|
595
|
-
emit(:backref, :name_recursion_ref_sq, text
|
593
|
+
when /^\\k'[^\p{digit}+\-'][^']*[+\-]\d+'/ # single-quotes
|
594
|
+
emit(:backref, :name_recursion_ref_sq, text)
|
596
595
|
|
597
596
|
when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
|
598
|
-
emit(:backref, :number_recursion_ref_ab, text
|
597
|
+
emit(:backref, :number_recursion_ref_ab, text)
|
599
598
|
|
600
599
|
when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
|
601
|
-
emit(:backref, :number_recursion_ref_sq, text
|
600
|
+
emit(:backref, :number_recursion_ref_sq, text)
|
602
601
|
|
603
602
|
end
|
604
603
|
};
|
@@ -607,31 +606,31 @@
|
|
607
606
|
# Quantifiers
|
608
607
|
# ------------------------------------------------------------------------
|
609
608
|
zero_or_one {
|
610
|
-
case text =
|
611
|
-
when '?' ; emit(:quantifier, :zero_or_one, text
|
612
|
-
when '??'; emit(:quantifier, :zero_or_one_reluctant, text
|
613
|
-
when '?+'; emit(:quantifier, :zero_or_one_possessive, text
|
609
|
+
case text = copy(data, ts, te)
|
610
|
+
when '?' ; emit(:quantifier, :zero_or_one, text)
|
611
|
+
when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
|
612
|
+
when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
|
614
613
|
end
|
615
614
|
};
|
616
615
|
|
617
616
|
zero_or_more {
|
618
|
-
case text =
|
619
|
-
when '*' ; emit(:quantifier, :zero_or_more, text
|
620
|
-
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text
|
621
|
-
when '*+'; emit(:quantifier, :zero_or_more_possessive, text
|
617
|
+
case text = copy(data, ts, te)
|
618
|
+
when '*' ; emit(:quantifier, :zero_or_more, text)
|
619
|
+
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
|
620
|
+
when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
|
622
621
|
end
|
623
622
|
};
|
624
623
|
|
625
624
|
one_or_more {
|
626
|
-
case text =
|
627
|
-
when '+' ; emit(:quantifier, :one_or_more, text
|
628
|
-
when '+?'; emit(:quantifier, :one_or_more_reluctant, text
|
629
|
-
when '++'; emit(:quantifier, :one_or_more_possessive, text
|
625
|
+
case text = copy(data, ts, te)
|
626
|
+
when '+' ; emit(:quantifier, :one_or_more, text)
|
627
|
+
when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
|
628
|
+
when '++'; emit(:quantifier, :one_or_more_possessive, text)
|
630
629
|
end
|
631
630
|
};
|
632
631
|
|
633
632
|
quantifier_interval {
|
634
|
-
emit(:quantifier, :interval,
|
633
|
+
emit(:quantifier, :interval, copy(data, ts, te))
|
635
634
|
};
|
636
635
|
|
637
636
|
# Catch unmatched curly braces as literals
|
@@ -647,15 +646,17 @@
|
|
647
646
|
|
648
647
|
comment {
|
649
648
|
if free_spacing
|
650
|
-
emit(:free_space, :comment,
|
649
|
+
emit(:free_space, :comment, copy(data, ts, te))
|
651
650
|
else
|
652
|
-
|
651
|
+
# consume only the pound sign (#) and backtrack to do regular scanning
|
652
|
+
append_literal(data, ts, ts + 1)
|
653
|
+
fexec ts + 1;
|
653
654
|
end
|
654
655
|
};
|
655
656
|
|
656
657
|
space+ {
|
657
658
|
if free_spacing
|
658
|
-
emit(:free_space, :whitespace,
|
659
|
+
emit(:free_space, :whitespace, copy(data, ts, te))
|
659
660
|
else
|
660
661
|
append_literal(data, ts, te)
|
661
662
|
end
|
@@ -664,11 +665,7 @@
|
|
664
665
|
# Literal: any run of ASCII (pritable or non-printable), and/or UTF-8,
|
665
666
|
# except meta characters.
|
666
667
|
# ------------------------------------------------------------------------
|
667
|
-
(ascii_print -- space)+
|
668
|
-
ascii_nonprint+ |
|
669
|
-
utf8_2_byte+ |
|
670
|
-
utf8_3_byte+ |
|
671
|
-
utf8_4_byte+ {
|
668
|
+
(ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
|
672
669
|
append_literal(data, ts, te)
|
673
670
|
};
|
674
671
|
|
@@ -737,21 +734,16 @@ class Regexp::Scanner
|
|
737
734
|
#
|
738
735
|
# This method may raise errors if a syntax error is encountered.
|
739
736
|
# --------------------------------------------------------------------------
|
740
|
-
def self.scan(input_object, &block)
|
741
|
-
new.scan(input_object, &block)
|
737
|
+
def self.scan(input_object, options: nil, &block)
|
738
|
+
new.scan(input_object, options: options, &block)
|
742
739
|
end
|
743
740
|
|
744
|
-
def scan(input_object, &block)
|
741
|
+
def scan(input_object, options: nil, &block)
|
745
742
|
self.literal = nil
|
746
743
|
stack = []
|
747
744
|
|
748
|
-
|
749
|
-
|
750
|
-
self.free_spacing = (input_object.options & Regexp::EXTENDED != 0)
|
751
|
-
else
|
752
|
-
input = input_object
|
753
|
-
self.free_spacing = false
|
754
|
-
end
|
745
|
+
input = input_object.is_a?(Regexp) ? input_object.source : input_object
|
746
|
+
self.free_spacing = free_spacing?(input_object, options)
|
755
747
|
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
756
748
|
|
757
749
|
data = input.unpack("c*") if input.is_a?(String)
|
@@ -763,6 +755,7 @@ class Regexp::Scanner
|
|
763
755
|
self.set_depth = 0
|
764
756
|
self.group_depth = 0
|
765
757
|
self.conditional_stack = []
|
758
|
+
self.char_pos = 0
|
766
759
|
|
767
760
|
%% write data;
|
768
761
|
%% write init;
|
@@ -772,7 +765,7 @@ class Regexp::Scanner
|
|
772
765
|
testEof = testEof
|
773
766
|
|
774
767
|
if cs == re_scanner_error
|
775
|
-
text =
|
768
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
776
769
|
raise ScannerError.new("Scan error at '#{text}'")
|
777
770
|
end
|
778
771
|
|
@@ -800,22 +793,41 @@ class Regexp::Scanner
|
|
800
793
|
end
|
801
794
|
|
802
795
|
# Emits an array with the details of the scanned pattern
|
803
|
-
def emit(type, token, text
|
796
|
+
def emit(type, token, text)
|
804
797
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
805
798
|
|
806
799
|
emit_literal if literal
|
807
800
|
|
801
|
+
# Ragel runs with byte-based indices (ts, te). These are of little value to
|
802
|
+
# end-users, so we keep track of char-based indices and emit those instead.
|
803
|
+
ts_char_pos = char_pos
|
804
|
+
te_char_pos = char_pos + text.length
|
805
|
+
|
808
806
|
if block
|
809
|
-
block.call type, token, text,
|
807
|
+
block.call type, token, text, ts_char_pos, te_char_pos
|
810
808
|
end
|
811
809
|
|
812
|
-
tokens << [type, token, text,
|
810
|
+
tokens << [type, token, text, ts_char_pos, te_char_pos]
|
811
|
+
|
812
|
+
self.char_pos = te_char_pos
|
813
813
|
end
|
814
814
|
|
815
815
|
private
|
816
816
|
|
817
817
|
attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
|
818
|
-
:group_depth, :set_depth, :conditional_stack
|
818
|
+
:group_depth, :set_depth, :conditional_stack, :char_pos
|
819
|
+
|
820
|
+
def free_spacing?(input_object, options)
|
821
|
+
if options && !input_object.is_a?(String)
|
822
|
+
raise ArgumentError, 'options cannot be supplied unless scanning a String'
|
823
|
+
end
|
824
|
+
|
825
|
+
options = input_object.options if input_object.is_a?(::Regexp)
|
826
|
+
|
827
|
+
return false unless options
|
828
|
+
|
829
|
+
options & Regexp::EXTENDED != 0
|
830
|
+
end
|
819
831
|
|
820
832
|
def in_group?
|
821
833
|
group_depth > 0
|
@@ -826,36 +838,25 @@ class Regexp::Scanner
|
|
826
838
|
end
|
827
839
|
|
828
840
|
# Copy from ts to te from data as text
|
829
|
-
def copy(data,
|
830
|
-
data[
|
831
|
-
end
|
832
|
-
|
833
|
-
# Copy from ts to te from data as text, returning an array with the text
|
834
|
-
# and the offsets used to copy it.
|
835
|
-
def text(data, ts, te, soff = 0)
|
836
|
-
[copy(data, ts-soff..te-1), ts-soff, te]
|
841
|
+
def copy(data, ts, te)
|
842
|
+
data[ts...te].pack('c*').force_encoding('utf-8')
|
837
843
|
end
|
838
844
|
|
839
845
|
# Appends one or more characters to the literal buffer, to be emitted later
|
840
|
-
# by a call to emit_literal.
|
846
|
+
# by a call to emit_literal.
|
841
847
|
def append_literal(data, ts, te)
|
842
848
|
self.literal = literal || []
|
843
|
-
literal <<
|
849
|
+
literal << copy(data, ts, te)
|
844
850
|
end
|
845
851
|
|
846
|
-
# Emits the literal run collected by calls to the append_literal method
|
847
|
-
# using the total start (ts) and end (te) offsets of the run.
|
852
|
+
# Emits the literal run collected by calls to the append_literal method.
|
848
853
|
def emit_literal
|
849
|
-
|
850
|
-
text = literal.map {|t| t[0]}.join
|
851
|
-
|
852
|
-
text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
|
853
|
-
|
854
|
+
text = literal.join
|
854
855
|
self.literal = nil
|
855
|
-
emit(:literal, :literal, text
|
856
|
+
emit(:literal, :literal, text)
|
856
857
|
end
|
857
858
|
|
858
|
-
def emit_options(text
|
859
|
+
def emit_options(text)
|
859
860
|
token = nil
|
860
861
|
|
861
862
|
# Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
|
@@ -881,14 +882,14 @@ class Regexp::Scanner
|
|
881
882
|
token = :options_switch
|
882
883
|
end
|
883
884
|
|
884
|
-
emit(:group, token, text
|
885
|
+
emit(:group, token, text)
|
885
886
|
end
|
886
887
|
|
887
888
|
def emit_meta_control_sequence(data, ts, te, token)
|
888
889
|
if data.last < 0x00 || data.last > 0x7F
|
889
890
|
validation_error(:sequence, 'escape', token.to_s)
|
890
891
|
end
|
891
|
-
emit(:escape, token,
|
892
|
+
emit(:escape, token, copy(data, ts-1, te))
|
892
893
|
end
|
893
894
|
|
894
895
|
# Centralizes and unifies the handling of validation related
|