regexp_parser 1.8.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +34 -0
- data/lib/regexp_parser/expression.rb +4 -17
- data/lib/regexp_parser/expression/classes/group.rb +17 -2
- data/lib/regexp_parser/expression/classes/root.rb +4 -16
- data/lib/regexp_parser/expression/quantifier.rb +9 -0
- data/lib/regexp_parser/expression/sequence.rb +0 -10
- data/lib/regexp_parser/lexer.rb +2 -2
- data/lib/regexp_parser/parser.rb +27 -0
- data/lib/regexp_parser/scanner.rb +901 -820
- data/lib/regexp_parser/scanner/char_type.rl +11 -11
- data/lib/regexp_parser/scanner/property.rl +2 -2
- data/lib/regexp_parser/scanner/scanner.rl +152 -153
- data/lib/regexp_parser/version.rb +1 -1
- data/spec/expression/base_spec.rb +10 -0
- data/spec/expression/to_s_spec.rb +16 -0
- data/spec/lexer/literals_spec.rb +24 -49
- data/spec/parser/escapes_spec.rb +1 -1
- data/spec/parser/quantifiers_spec.rb +15 -0
- data/spec/parser/set/ranges_spec.rb +3 -3
- data/spec/scanner/escapes_spec.rb +6 -0
- data/spec/scanner/literals_spec.rb +28 -38
- data/spec/scanner/quantifiers_spec.rb +18 -13
- data/spec/scanner/sets_spec.rb +8 -2
- metadata +2 -6
- data/spec/expression/root_spec.rb +0 -9
- data/spec/expression/sequence_spec.rb +0 -9
@@ -10,17 +10,17 @@
|
|
10
10
|
# --------------------------------------------------------------------------
|
11
11
|
char_type := |*
|
12
12
|
char_type_char {
|
13
|
-
case text =
|
14
|
-
when '\d'; emit(:type, :digit, text
|
15
|
-
when '\D'; emit(:type, :nondigit, text
|
16
|
-
when '\h'; emit(:type, :hex, text
|
17
|
-
when '\H'; emit(:type, :nonhex, text
|
18
|
-
when '\s'; emit(:type, :space, text
|
19
|
-
when '\S'; emit(:type, :nonspace, text
|
20
|
-
when '\w'; emit(:type, :word, text
|
21
|
-
when '\W'; emit(:type, :nonword, text
|
22
|
-
when '\R'; emit(:type, :linebreak, text
|
23
|
-
when '\X'; emit(:type, :xgrapheme, text
|
13
|
+
case text = copy(data, ts-1, te)
|
14
|
+
when '\d'; emit(:type, :digit, text)
|
15
|
+
when '\D'; emit(:type, :nondigit, text)
|
16
|
+
when '\h'; emit(:type, :hex, text)
|
17
|
+
when '\H'; emit(:type, :nonhex, text)
|
18
|
+
when '\s'; emit(:type, :space, text)
|
19
|
+
when '\S'; emit(:type, :nonspace, text)
|
20
|
+
when '\w'; emit(:type, :word, text)
|
21
|
+
when '\W'; emit(:type, :nonword, text)
|
22
|
+
when '\R'; emit(:type, :linebreak, text)
|
23
|
+
when '\X'; emit(:type, :xgrapheme, text)
|
24
24
|
end
|
25
25
|
fret;
|
26
26
|
};
|
@@ -14,7 +14,7 @@
|
|
14
14
|
unicode_property := |*
|
15
15
|
|
16
16
|
property_sequence < eof(premature_property_end) {
|
17
|
-
text =
|
17
|
+
text = copy(data, ts-1, te)
|
18
18
|
type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
|
19
19
|
|
20
20
|
name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
|
@@ -22,7 +22,7 @@
|
|
22
22
|
token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
|
23
23
|
raise UnknownUnicodePropertyError.new(name) unless token
|
24
24
|
|
25
|
-
self.emit(type, token.to_sym, text
|
25
|
+
self.emit(type, token.to_sym, text)
|
26
26
|
|
27
27
|
fret;
|
28
28
|
};
|
@@ -135,13 +135,13 @@
|
|
135
135
|
|
136
136
|
# EOF error, used where it can be detected
|
137
137
|
action premature_end_error {
|
138
|
-
text =
|
138
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
139
139
|
raise PrematureEndError.new( text )
|
140
140
|
}
|
141
141
|
|
142
142
|
# Invalid sequence error, used from sequences, like escapes and sets
|
143
143
|
action invalid_sequence_error {
|
144
|
-
text =
|
144
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
145
145
|
validation_error(:sequence, 'sequence', text)
|
146
146
|
}
|
147
147
|
|
@@ -156,7 +156,7 @@
|
|
156
156
|
# --------------------------------------------------------------------------
|
157
157
|
character_set := |*
|
158
158
|
set_close > (set_meta, 2) @set_closed {
|
159
|
-
emit(:set, :close,
|
159
|
+
emit(:set, :close, copy(data, ts, te))
|
160
160
|
if in_set?
|
161
161
|
fret;
|
162
162
|
else
|
@@ -165,8 +165,8 @@
|
|
165
165
|
};
|
166
166
|
|
167
167
|
'-]' @set_closed { # special case, emits two tokens
|
168
|
-
emit(:literal, :literal, copy(data, ts
|
169
|
-
emit(:set, :close, copy(data, ts+1
|
168
|
+
emit(:literal, :literal, copy(data, ts, te-1))
|
169
|
+
emit(:set, :close, copy(data, ts+1, te))
|
170
170
|
if in_set?
|
171
171
|
fret;
|
172
172
|
else
|
@@ -175,33 +175,33 @@
|
|
175
175
|
};
|
176
176
|
|
177
177
|
'-&&' { # special case, emits two tokens
|
178
|
-
emit(:literal, :literal, '-'
|
179
|
-
emit(:set, :intersection, '&&'
|
178
|
+
emit(:literal, :literal, '-')
|
179
|
+
emit(:set, :intersection, '&&')
|
180
180
|
};
|
181
181
|
|
182
182
|
'^' {
|
183
|
-
text =
|
183
|
+
text = copy(data, ts, te)
|
184
184
|
if tokens.last[1] == :open
|
185
|
-
emit(:set, :negate, text
|
185
|
+
emit(:set, :negate, text)
|
186
186
|
else
|
187
|
-
emit(:literal, :literal, text
|
187
|
+
emit(:literal, :literal, text)
|
188
188
|
end
|
189
189
|
};
|
190
190
|
|
191
191
|
'-' {
|
192
|
-
text =
|
192
|
+
text = copy(data, ts, te)
|
193
193
|
# ranges cant start with a subset or intersection/negation/range operator
|
194
194
|
if tokens.last[0] == :set
|
195
|
-
emit(:literal, :literal, text
|
195
|
+
emit(:literal, :literal, text)
|
196
196
|
else
|
197
|
-
emit(:set, :range, text
|
197
|
+
emit(:set, :range, text)
|
198
198
|
end
|
199
199
|
};
|
200
200
|
|
201
201
|
# Unlike ranges, intersections can start or end at set boundaries, whereupon
|
202
202
|
# they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
|
203
203
|
'&&' {
|
204
|
-
emit(:set, :intersection,
|
204
|
+
emit(:set, :intersection, copy(data, ts, te))
|
205
205
|
};
|
206
206
|
|
207
207
|
backslash {
|
@@ -209,12 +209,12 @@
|
|
209
209
|
};
|
210
210
|
|
211
211
|
set_open >(open_bracket, 1) >set_opened {
|
212
|
-
emit(:set, :open,
|
212
|
+
emit(:set, :open, copy(data, ts, te))
|
213
213
|
fcall character_set;
|
214
214
|
};
|
215
215
|
|
216
216
|
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
217
|
-
text =
|
217
|
+
text = copy(data, ts, te)
|
218
218
|
|
219
219
|
type = :posixclass
|
220
220
|
class_name = text[2..-3]
|
@@ -223,19 +223,19 @@
|
|
223
223
|
type = :nonposixclass
|
224
224
|
end
|
225
225
|
|
226
|
-
emit(type, class_name.to_sym, text
|
226
|
+
emit(type, class_name.to_sym, text)
|
227
227
|
};
|
228
228
|
|
229
229
|
collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
230
|
-
emit(:set, :collation,
|
230
|
+
emit(:set, :collation, copy(data, ts, te))
|
231
231
|
};
|
232
232
|
|
233
233
|
character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
234
|
-
emit(:set, :equivalent,
|
234
|
+
emit(:set, :equivalent, copy(data, ts, te))
|
235
235
|
};
|
236
236
|
|
237
237
|
meta_char > (set_meta, 1) {
|
238
|
-
emit(:literal, :literal,
|
238
|
+
emit(:literal, :literal, copy(data, ts, te))
|
239
239
|
};
|
240
240
|
|
241
241
|
any |
|
@@ -243,9 +243,8 @@
|
|
243
243
|
utf8_2_byte |
|
244
244
|
utf8_3_byte |
|
245
245
|
utf8_4_byte {
|
246
|
-
|
247
|
-
|
248
|
-
emit(:literal, :literal, char, *rest)
|
246
|
+
text = copy(data, ts, te)
|
247
|
+
emit(:literal, :literal, text)
|
249
248
|
};
|
250
249
|
*|;
|
251
250
|
|
@@ -253,7 +252,7 @@
|
|
253
252
|
# --------------------------------------------------------------------------
|
254
253
|
set_escape_sequence := |*
|
255
254
|
non_set_escape > (escaped_set_alpha, 2) {
|
256
|
-
emit(:escape, :literal,
|
255
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
257
256
|
fret;
|
258
257
|
};
|
259
258
|
|
@@ -269,33 +268,33 @@
|
|
269
268
|
# --------------------------------------------------------------------------
|
270
269
|
escape_sequence := |*
|
271
270
|
[1-9] {
|
272
|
-
text =
|
273
|
-
emit(:backref, :number, text
|
271
|
+
text = copy(data, ts-1, te)
|
272
|
+
emit(:backref, :number, text)
|
274
273
|
fret;
|
275
274
|
};
|
276
275
|
|
277
276
|
octal_sequence {
|
278
|
-
emit(:escape, :octal,
|
277
|
+
emit(:escape, :octal, copy(data, ts-1, te))
|
279
278
|
fret;
|
280
279
|
};
|
281
280
|
|
282
281
|
meta_char {
|
283
|
-
case text =
|
284
|
-
when '\.'; emit(:escape, :dot, text
|
285
|
-
when '\|'; emit(:escape, :alternation, text
|
286
|
-
when '\^'; emit(:escape, :bol, text
|
287
|
-
when '\$'; emit(:escape, :eol, text
|
288
|
-
when '\?'; emit(:escape, :zero_or_one, text
|
289
|
-
when '\*'; emit(:escape, :zero_or_more, text
|
290
|
-
when '\+'; emit(:escape, :one_or_more, text
|
291
|
-
when '\('; emit(:escape, :group_open, text
|
292
|
-
when '\)'; emit(:escape, :group_close, text
|
293
|
-
when '\{'; emit(:escape, :interval_open, text
|
294
|
-
when '\}'; emit(:escape, :interval_close, text
|
295
|
-
when '\['; emit(:escape, :set_open, text
|
296
|
-
when '\]'; emit(:escape, :set_close, text
|
282
|
+
case text = copy(data, ts-1, te)
|
283
|
+
when '\.'; emit(:escape, :dot, text)
|
284
|
+
when '\|'; emit(:escape, :alternation, text)
|
285
|
+
when '\^'; emit(:escape, :bol, text)
|
286
|
+
when '\$'; emit(:escape, :eol, text)
|
287
|
+
when '\?'; emit(:escape, :zero_or_one, text)
|
288
|
+
when '\*'; emit(:escape, :zero_or_more, text)
|
289
|
+
when '\+'; emit(:escape, :one_or_more, text)
|
290
|
+
when '\('; emit(:escape, :group_open, text)
|
291
|
+
when '\)'; emit(:escape, :group_close, text)
|
292
|
+
when '\{'; emit(:escape, :interval_open, text)
|
293
|
+
when '\}'; emit(:escape, :interval_close, text)
|
294
|
+
when '\['; emit(:escape, :set_open, text)
|
295
|
+
when '\]'; emit(:escape, :set_close, text)
|
297
296
|
when "\\\\";
|
298
|
-
emit(:escape, :backslash, text
|
297
|
+
emit(:escape, :backslash, text)
|
299
298
|
end
|
300
299
|
fret;
|
301
300
|
};
|
@@ -303,31 +302,31 @@
|
|
303
302
|
escaped_ascii > (escaped_alpha, 7) {
|
304
303
|
# \b is emitted as backspace only when inside a character set, otherwise
|
305
304
|
# it is a word boundary anchor. A syntax might "normalize" it if needed.
|
306
|
-
case text =
|
307
|
-
when '\a'; emit(:escape, :bell, text
|
308
|
-
when '\b'; emit(:escape, :backspace, text
|
309
|
-
when '\e'; emit(:escape, :escape, text
|
310
|
-
when '\f'; emit(:escape, :form_feed, text
|
311
|
-
when '\n'; emit(:escape, :newline, text
|
312
|
-
when '\r'; emit(:escape, :carriage, text
|
313
|
-
when '\t'; emit(:escape, :tab, text
|
314
|
-
when '\v'; emit(:escape, :vertical_tab, text
|
305
|
+
case text = copy(data, ts-1, te)
|
306
|
+
when '\a'; emit(:escape, :bell, text)
|
307
|
+
when '\b'; emit(:escape, :backspace, text)
|
308
|
+
when '\e'; emit(:escape, :escape, text)
|
309
|
+
when '\f'; emit(:escape, :form_feed, text)
|
310
|
+
when '\n'; emit(:escape, :newline, text)
|
311
|
+
when '\r'; emit(:escape, :carriage, text)
|
312
|
+
when '\t'; emit(:escape, :tab, text)
|
313
|
+
when '\v'; emit(:escape, :vertical_tab, text)
|
315
314
|
end
|
316
315
|
fret;
|
317
316
|
};
|
318
317
|
|
319
318
|
codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
|
320
|
-
text =
|
319
|
+
text = copy(data, ts-1, te)
|
321
320
|
if text[2].chr == '{'
|
322
|
-
emit(:escape, :codepoint_list, text
|
321
|
+
emit(:escape, :codepoint_list, text)
|
323
322
|
else
|
324
|
-
emit(:escape, :codepoint, text
|
323
|
+
emit(:escape, :codepoint, text)
|
325
324
|
end
|
326
325
|
fret;
|
327
326
|
};
|
328
327
|
|
329
328
|
hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
|
330
|
-
emit(:escape, :hex,
|
329
|
+
emit(:escape, :hex, copy(data, ts-1, te))
|
331
330
|
fret;
|
332
331
|
};
|
333
332
|
|
@@ -357,8 +356,11 @@
|
|
357
356
|
fcall unicode_property;
|
358
357
|
};
|
359
358
|
|
360
|
-
(any -- non_literal_escape)
|
361
|
-
|
359
|
+
(any -- non_literal_escape) |
|
360
|
+
utf8_2_byte |
|
361
|
+
utf8_3_byte |
|
362
|
+
utf8_4_byte > (escaped_alpha, 1) {
|
363
|
+
emit(:escape, :literal, copy(data, ts-1, te))
|
362
364
|
fret;
|
363
365
|
};
|
364
366
|
*|;
|
@@ -368,9 +370,9 @@
|
|
368
370
|
# --------------------------------------------------------------------------
|
369
371
|
conditional_expression := |*
|
370
372
|
group_lookup . ')' {
|
371
|
-
text =
|
372
|
-
emit(:conditional, :condition, text
|
373
|
-
emit(:conditional, :condition_close, ')'
|
373
|
+
text = copy(data, ts, te-1)
|
374
|
+
emit(:conditional, :condition, text)
|
375
|
+
emit(:conditional, :condition_close, ')')
|
374
376
|
};
|
375
377
|
|
376
378
|
any {
|
@@ -387,39 +389,39 @@
|
|
387
389
|
# Meta characters
|
388
390
|
# ------------------------------------------------------------------------
|
389
391
|
dot {
|
390
|
-
emit(:meta, :dot,
|
392
|
+
emit(:meta, :dot, copy(data, ts, te))
|
391
393
|
};
|
392
394
|
|
393
395
|
alternation {
|
394
396
|
if conditional_stack.last == group_depth
|
395
|
-
emit(:conditional, :separator,
|
397
|
+
emit(:conditional, :separator, copy(data, ts, te))
|
396
398
|
else
|
397
|
-
emit(:meta, :alternation,
|
399
|
+
emit(:meta, :alternation, copy(data, ts, te))
|
398
400
|
end
|
399
401
|
};
|
400
402
|
|
401
403
|
# Anchors
|
402
404
|
# ------------------------------------------------------------------------
|
403
405
|
beginning_of_line {
|
404
|
-
emit(:anchor, :bol,
|
406
|
+
emit(:anchor, :bol, copy(data, ts, te))
|
405
407
|
};
|
406
408
|
|
407
409
|
end_of_line {
|
408
|
-
emit(:anchor, :eol,
|
410
|
+
emit(:anchor, :eol, copy(data, ts, te))
|
409
411
|
};
|
410
412
|
|
411
413
|
backslash . keep_mark > (backslashed, 4) {
|
412
|
-
emit(:keep, :mark,
|
414
|
+
emit(:keep, :mark, copy(data, ts, te))
|
413
415
|
};
|
414
416
|
|
415
417
|
backslash . anchor_char > (backslashed, 3) {
|
416
|
-
case text =
|
417
|
-
when '\\A'; emit(:anchor, :bos, text
|
418
|
-
when '\\z'; emit(:anchor, :eos, text
|
419
|
-
when '\\Z'; emit(:anchor, :eos_ob_eol, text
|
420
|
-
when '\\b'; emit(:anchor, :word_boundary, text
|
421
|
-
when '\\B'; emit(:anchor, :nonword_boundary, text
|
422
|
-
when '\\G'; emit(:anchor, :match_start, text
|
418
|
+
case text = copy(data, ts, te)
|
419
|
+
when '\\A'; emit(:anchor, :bos, text)
|
420
|
+
when '\\z'; emit(:anchor, :eos, text)
|
421
|
+
when '\\Z'; emit(:anchor, :eos_ob_eol, text)
|
422
|
+
when '\\b'; emit(:anchor, :word_boundary, text)
|
423
|
+
when '\\B'; emit(:anchor, :nonword_boundary, text)
|
424
|
+
when '\\G'; emit(:anchor, :match_start, text)
|
423
425
|
end
|
424
426
|
};
|
425
427
|
|
@@ -430,7 +432,7 @@
|
|
430
432
|
# Character sets
|
431
433
|
# ------------------------------------------------------------------------
|
432
434
|
set_open >set_opened {
|
433
|
-
emit(:set, :open,
|
435
|
+
emit(:set, :open, copy(data, ts, te))
|
434
436
|
fcall character_set;
|
435
437
|
};
|
436
438
|
|
@@ -439,12 +441,12 @@
|
|
439
441
|
# (?(condition)Y|N) conditional expression
|
440
442
|
# ------------------------------------------------------------------------
|
441
443
|
conditional {
|
442
|
-
text =
|
444
|
+
text = copy(data, ts, te)
|
443
445
|
|
444
446
|
conditional_stack << group_depth
|
445
447
|
|
446
|
-
emit(:conditional, :open, text[0..-2]
|
447
|
-
emit(:conditional, :condition_open, '('
|
448
|
+
emit(:conditional, :open, text[0..-2])
|
449
|
+
emit(:conditional, :condition_open, '(')
|
448
450
|
fcall conditional_expression;
|
449
451
|
};
|
450
452
|
|
@@ -455,7 +457,7 @@
|
|
455
457
|
# correct closing count.
|
456
458
|
# ------------------------------------------------------------------------
|
457
459
|
group_open . group_comment $group_closed {
|
458
|
-
emit(:group, :comment,
|
460
|
+
emit(:group, :comment, copy(data, ts, te))
|
459
461
|
};
|
460
462
|
|
461
463
|
# Expression options:
|
@@ -470,11 +472,11 @@
|
|
470
472
|
# (?imxdau-imx:subexp) option on/off for subexp
|
471
473
|
# ------------------------------------------------------------------------
|
472
474
|
group_open . group_options >group_opened {
|
473
|
-
text =
|
475
|
+
text = copy(data, ts, te)
|
474
476
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
475
477
|
raise InvalidGroupOption.new($1 || "-#{$2}", text)
|
476
478
|
end
|
477
|
-
emit_options(text
|
479
|
+
emit_options(text)
|
478
480
|
};
|
479
481
|
|
480
482
|
# Assertions
|
@@ -484,11 +486,11 @@
|
|
484
486
|
# (?<!subexp) negative look-behind
|
485
487
|
# ------------------------------------------------------------------------
|
486
488
|
group_open . assertion_type >group_opened {
|
487
|
-
case text =
|
488
|
-
when '(?='; emit(:assertion, :lookahead, text
|
489
|
-
when '(?!'; emit(:assertion, :nlookahead, text
|
490
|
-
when '(?<='; emit(:assertion, :lookbehind, text
|
491
|
-
when '(?<!'; emit(:assertion, :nlookbehind, text
|
489
|
+
case text = copy(data, ts, te)
|
490
|
+
when '(?='; emit(:assertion, :lookahead, text)
|
491
|
+
when '(?!'; emit(:assertion, :nlookahead, text)
|
492
|
+
when '(?<='; emit(:assertion, :lookbehind, text)
|
493
|
+
when '(?<!'; emit(:assertion, :nlookbehind, text)
|
492
494
|
end
|
493
495
|
};
|
494
496
|
|
@@ -501,32 +503,32 @@
|
|
501
503
|
# (subexp) captured group
|
502
504
|
# ------------------------------------------------------------------------
|
503
505
|
group_open . group_type >group_opened {
|
504
|
-
case text =
|
505
|
-
when '(?:'; emit(:group, :passive, text
|
506
|
-
when '(?>'; emit(:group, :atomic, text
|
507
|
-
when '(?~'; emit(:group, :absence, text
|
506
|
+
case text = copy(data, ts, te)
|
507
|
+
when '(?:'; emit(:group, :passive, text)
|
508
|
+
when '(?>'; emit(:group, :atomic, text)
|
509
|
+
when '(?~'; emit(:group, :absence, text)
|
508
510
|
|
509
511
|
when /^\(\?(?:<>|'')/
|
510
512
|
validation_error(:group, 'named group', 'name is empty')
|
511
513
|
|
512
514
|
when /^\(\?<\w*>/
|
513
|
-
emit(:group, :named_ab, text
|
515
|
+
emit(:group, :named_ab, text)
|
514
516
|
|
515
517
|
when /^\(\?'\w*'/
|
516
|
-
emit(:group, :named_sq, text
|
518
|
+
emit(:group, :named_sq, text)
|
517
519
|
|
518
520
|
end
|
519
521
|
};
|
520
522
|
|
521
523
|
group_open @group_opened {
|
522
|
-
text =
|
523
|
-
emit(:group, :capture, text
|
524
|
+
text = copy(data, ts, te)
|
525
|
+
emit(:group, :capture, text)
|
524
526
|
};
|
525
527
|
|
526
528
|
group_close @group_closed {
|
527
529
|
if conditional_stack.last == group_depth + 1
|
528
530
|
conditional_stack.pop
|
529
|
-
emit(:conditional, :close,
|
531
|
+
emit(:conditional, :close, copy(data, ts, te))
|
530
532
|
else
|
531
533
|
if spacing_stack.length > 1 &&
|
532
534
|
spacing_stack.last[:depth] == group_depth + 1
|
@@ -534,7 +536,7 @@
|
|
534
536
|
self.free_spacing = spacing_stack.last[:free_spacing]
|
535
537
|
end
|
536
538
|
|
537
|
-
emit(:group, :close,
|
539
|
+
emit(:group, :close, copy(data, ts, te))
|
538
540
|
end
|
539
541
|
};
|
540
542
|
|
@@ -542,63 +544,63 @@
|
|
542
544
|
# Group backreference, named and numbered
|
543
545
|
# ------------------------------------------------------------------------
|
544
546
|
backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
|
545
|
-
case text =
|
547
|
+
case text = copy(data, ts, te)
|
546
548
|
when /^\\([gk])(<>|'')/ # angle brackets
|
547
549
|
validation_error(:backref, 'ref/call', 'ref ID is empty')
|
548
550
|
|
549
551
|
when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
|
550
552
|
if $1 == 'k'
|
551
|
-
emit(:backref, :name_ref_ab, text
|
553
|
+
emit(:backref, :name_ref_ab, text)
|
552
554
|
else
|
553
|
-
emit(:backref, :name_call_ab, text
|
555
|
+
emit(:backref, :name_call_ab, text)
|
554
556
|
end
|
555
557
|
|
556
558
|
when /^\\([gk])'[^\d+-]\w*'/ #single quotes
|
557
559
|
if $1 == 'k'
|
558
|
-
emit(:backref, :name_ref_sq, text
|
560
|
+
emit(:backref, :name_ref_sq, text)
|
559
561
|
else
|
560
|
-
emit(:backref, :name_call_sq, text
|
562
|
+
emit(:backref, :name_call_sq, text)
|
561
563
|
end
|
562
564
|
|
563
565
|
when /^\\([gk])<\d+>/ # angle-brackets
|
564
566
|
if $1 == 'k'
|
565
|
-
emit(:backref, :number_ref_ab, text
|
567
|
+
emit(:backref, :number_ref_ab, text)
|
566
568
|
else
|
567
|
-
emit(:backref, :number_call_ab, text
|
569
|
+
emit(:backref, :number_call_ab, text)
|
568
570
|
end
|
569
571
|
|
570
572
|
when /^\\([gk])'\d+'/ # single quotes
|
571
573
|
if $1 == 'k'
|
572
|
-
emit(:backref, :number_ref_sq, text
|
574
|
+
emit(:backref, :number_ref_sq, text)
|
573
575
|
else
|
574
|
-
emit(:backref, :number_call_sq, text
|
576
|
+
emit(:backref, :number_call_sq, text)
|
575
577
|
end
|
576
578
|
|
577
579
|
when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
|
578
580
|
if $1 == 'k'
|
579
|
-
emit(:backref, :number_rel_ref_ab, text
|
581
|
+
emit(:backref, :number_rel_ref_ab, text)
|
580
582
|
else
|
581
|
-
emit(:backref, :number_rel_call_ab, text
|
583
|
+
emit(:backref, :number_rel_call_ab, text)
|
582
584
|
end
|
583
585
|
|
584
586
|
when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
|
585
587
|
if $1 == 'k'
|
586
|
-
emit(:backref, :number_rel_ref_sq, text
|
588
|
+
emit(:backref, :number_rel_ref_sq, text)
|
587
589
|
else
|
588
|
-
emit(:backref, :number_rel_call_sq, text
|
590
|
+
emit(:backref, :number_rel_call_sq, text)
|
589
591
|
end
|
590
592
|
|
591
593
|
when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
|
592
|
-
emit(:backref, :name_recursion_ref_ab, text
|
594
|
+
emit(:backref, :name_recursion_ref_ab, text)
|
593
595
|
|
594
596
|
when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
|
595
|
-
emit(:backref, :name_recursion_ref_sq, text
|
597
|
+
emit(:backref, :name_recursion_ref_sq, text)
|
596
598
|
|
597
599
|
when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
|
598
|
-
emit(:backref, :number_recursion_ref_ab, text
|
600
|
+
emit(:backref, :number_recursion_ref_ab, text)
|
599
601
|
|
600
602
|
when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
|
601
|
-
emit(:backref, :number_recursion_ref_sq, text
|
603
|
+
emit(:backref, :number_recursion_ref_sq, text)
|
602
604
|
|
603
605
|
end
|
604
606
|
};
|
@@ -607,31 +609,31 @@
|
|
607
609
|
# Quantifiers
|
608
610
|
# ------------------------------------------------------------------------
|
609
611
|
zero_or_one {
|
610
|
-
case text =
|
611
|
-
when '?' ; emit(:quantifier, :zero_or_one, text
|
612
|
-
when '??'; emit(:quantifier, :zero_or_one_reluctant, text
|
613
|
-
when '?+'; emit(:quantifier, :zero_or_one_possessive, text
|
612
|
+
case text = copy(data, ts, te)
|
613
|
+
when '?' ; emit(:quantifier, :zero_or_one, text)
|
614
|
+
when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
|
615
|
+
when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
|
614
616
|
end
|
615
617
|
};
|
616
618
|
|
617
619
|
zero_or_more {
|
618
|
-
case text =
|
619
|
-
when '*' ; emit(:quantifier, :zero_or_more, text
|
620
|
-
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text
|
621
|
-
when '*+'; emit(:quantifier, :zero_or_more_possessive, text
|
620
|
+
case text = copy(data, ts, te)
|
621
|
+
when '*' ; emit(:quantifier, :zero_or_more, text)
|
622
|
+
when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
|
623
|
+
when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
|
622
624
|
end
|
623
625
|
};
|
624
626
|
|
625
627
|
one_or_more {
|
626
|
-
case text =
|
627
|
-
when '+' ; emit(:quantifier, :one_or_more, text
|
628
|
-
when '+?'; emit(:quantifier, :one_or_more_reluctant, text
|
629
|
-
when '++'; emit(:quantifier, :one_or_more_possessive, text
|
628
|
+
case text = copy(data, ts, te)
|
629
|
+
when '+' ; emit(:quantifier, :one_or_more, text)
|
630
|
+
when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
|
631
|
+
when '++'; emit(:quantifier, :one_or_more_possessive, text)
|
630
632
|
end
|
631
633
|
};
|
632
634
|
|
633
635
|
quantifier_interval {
|
634
|
-
emit(:quantifier, :interval,
|
636
|
+
emit(:quantifier, :interval, copy(data, ts, te))
|
635
637
|
};
|
636
638
|
|
637
639
|
# Catch unmatched curly braces as literals
|
@@ -647,7 +649,7 @@
|
|
647
649
|
|
648
650
|
comment {
|
649
651
|
if free_spacing
|
650
|
-
emit(:free_space, :comment,
|
652
|
+
emit(:free_space, :comment, copy(data, ts, te))
|
651
653
|
else
|
652
654
|
# consume only the pound sign (#) and backtrack to do regular scanning
|
653
655
|
append_literal(data, ts, ts + 1)
|
@@ -657,7 +659,7 @@
|
|
657
659
|
|
658
660
|
space+ {
|
659
661
|
if free_spacing
|
660
|
-
emit(:free_space, :whitespace,
|
662
|
+
emit(:free_space, :whitespace, copy(data, ts, te))
|
661
663
|
else
|
662
664
|
append_literal(data, ts, te)
|
663
665
|
end
|
@@ -760,6 +762,7 @@ class Regexp::Scanner
|
|
760
762
|
self.set_depth = 0
|
761
763
|
self.group_depth = 0
|
762
764
|
self.conditional_stack = []
|
765
|
+
self.char_pos = 0
|
763
766
|
|
764
767
|
%% write data;
|
765
768
|
%% write init;
|
@@ -769,7 +772,7 @@ class Regexp::Scanner
|
|
769
772
|
testEof = testEof
|
770
773
|
|
771
774
|
if cs == re_scanner_error
|
772
|
-
text =
|
775
|
+
text = copy(data, ts ? ts-1 : 0, -1)
|
773
776
|
raise ScannerError.new("Scan error at '#{text}'")
|
774
777
|
end
|
775
778
|
|
@@ -797,22 +800,29 @@ class Regexp::Scanner
|
|
797
800
|
end
|
798
801
|
|
799
802
|
# Emits an array with the details of the scanned pattern
|
800
|
-
def emit(type, token, text
|
803
|
+
def emit(type, token, text)
|
801
804
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
802
805
|
|
803
806
|
emit_literal if literal
|
804
807
|
|
808
|
+
# Ragel runs with byte-based indices (ts, te). These are of little value to
|
809
|
+
# end-users, so we keep track of char-based indices and emit those instead.
|
810
|
+
ts_char_pos = char_pos
|
811
|
+
te_char_pos = char_pos + text.length
|
812
|
+
|
805
813
|
if block
|
806
|
-
block.call type, token, text,
|
814
|
+
block.call type, token, text, ts_char_pos, te_char_pos
|
807
815
|
end
|
808
816
|
|
809
|
-
tokens << [type, token, text,
|
817
|
+
tokens << [type, token, text, ts_char_pos, te_char_pos]
|
818
|
+
|
819
|
+
self.char_pos = te_char_pos
|
810
820
|
end
|
811
821
|
|
812
822
|
private
|
813
823
|
|
814
824
|
attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
|
815
|
-
:group_depth, :set_depth, :conditional_stack
|
825
|
+
:group_depth, :set_depth, :conditional_stack, :char_pos
|
816
826
|
|
817
827
|
def free_spacing?(input_object, options)
|
818
828
|
if options && !input_object.is_a?(String)
|
@@ -835,36 +845,25 @@ class Regexp::Scanner
|
|
835
845
|
end
|
836
846
|
|
837
847
|
# Copy from ts to te from data as text
|
838
|
-
def copy(data,
|
839
|
-
data[
|
840
|
-
end
|
841
|
-
|
842
|
-
# Copy from ts to te from data as text, returning an array with the text
|
843
|
-
# and the offsets used to copy it.
|
844
|
-
def text(data, ts, te, soff = 0)
|
845
|
-
[copy(data, ts-soff..te-1), ts-soff, te]
|
848
|
+
def copy(data, ts, te)
|
849
|
+
data[ts...te].pack('c*').force_encoding('utf-8')
|
846
850
|
end
|
847
851
|
|
848
852
|
# Appends one or more characters to the literal buffer, to be emitted later
|
849
|
-
# by a call to emit_literal.
|
853
|
+
# by a call to emit_literal.
|
850
854
|
def append_literal(data, ts, te)
|
851
855
|
self.literal = literal || []
|
852
|
-
literal <<
|
856
|
+
literal << copy(data, ts, te)
|
853
857
|
end
|
854
858
|
|
855
|
-
# Emits the literal run collected by calls to the append_literal method
|
856
|
-
# using the total start (ts) and end (te) offsets of the run.
|
859
|
+
# Emits the literal run collected by calls to the append_literal method.
|
857
860
|
def emit_literal
|
858
|
-
|
859
|
-
text = literal.map {|t| t[0]}.join
|
860
|
-
|
861
|
-
text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
|
862
|
-
|
861
|
+
text = literal.join
|
863
862
|
self.literal = nil
|
864
|
-
emit(:literal, :literal, text
|
863
|
+
emit(:literal, :literal, text)
|
865
864
|
end
|
866
865
|
|
867
|
-
def emit_options(text
|
866
|
+
def emit_options(text)
|
868
867
|
token = nil
|
869
868
|
|
870
869
|
# Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
|
@@ -890,14 +889,14 @@ class Regexp::Scanner
|
|
890
889
|
token = :options_switch
|
891
890
|
end
|
892
891
|
|
893
|
-
emit(:group, token, text
|
892
|
+
emit(:group, token, text)
|
894
893
|
end
|
895
894
|
|
896
895
|
def emit_meta_control_sequence(data, ts, te, token)
|
897
896
|
if data.last < 0x00 || data.last > 0x7F
|
898
897
|
validation_error(:sequence, 'escape', token.to_s)
|
899
898
|
end
|
900
|
-
emit(:escape, token,
|
899
|
+
emit(:escape, token, copy(data, ts-1, te))
|
901
900
|
end
|
902
901
|
|
903
902
|
# Centralizes and unifies the handling of validation related
|