regexp_parser 0.5.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +242 -0
- data/Gemfile +1 -0
- data/README.md +21 -17
- data/Rakefile +31 -0
- data/lib/regexp_parser/expression.rb +11 -9
- data/lib/regexp_parser/expression/classes/alternation.rb +5 -28
- data/lib/regexp_parser/expression/classes/backref.rb +21 -16
- data/lib/regexp_parser/expression/classes/escape.rb +81 -10
- data/lib/regexp_parser/expression/classes/group.rb +20 -20
- data/lib/regexp_parser/expression/classes/{character_class.rb → posix_class.rb} +2 -2
- data/lib/regexp_parser/expression/classes/property.rb +6 -0
- data/lib/regexp_parser/expression/classes/set.rb +10 -93
- data/lib/regexp_parser/expression/classes/set/intersection.rb +9 -0
- data/lib/regexp_parser/expression/classes/set/range.rb +23 -0
- data/lib/regexp_parser/expression/methods/strfregexp.rb +6 -4
- data/lib/regexp_parser/expression/methods/tests.rb +4 -14
- data/lib/regexp_parser/expression/methods/traverse.rb +1 -1
- data/lib/regexp_parser/expression/quantifier.rb +3 -4
- data/lib/regexp_parser/expression/sequence_operation.rb +34 -0
- data/lib/regexp_parser/expression/subexpression.rb +6 -10
- data/lib/regexp_parser/lexer.rb +13 -17
- data/lib/regexp_parser/parser.rb +170 -116
- data/lib/regexp_parser/scanner.rb +952 -2431
- data/lib/regexp_parser/scanner/char_type.rl +31 -0
- data/lib/regexp_parser/scanner/properties/long.yml +561 -0
- data/lib/regexp_parser/scanner/properties/short.yml +225 -0
- data/lib/regexp_parser/scanner/property.rl +7 -806
- data/lib/regexp_parser/scanner/scanner.rl +112 -154
- data/lib/regexp_parser/syntax/base.rb +4 -4
- data/lib/regexp_parser/syntax/tokens.rb +1 -0
- data/lib/regexp_parser/syntax/tokens/backref.rb +2 -2
- data/lib/regexp_parser/syntax/tokens/character_set.rb +3 -38
- data/lib/regexp_parser/syntax/tokens/escape.rb +2 -3
- data/lib/regexp_parser/syntax/tokens/group.rb +5 -4
- data/lib/regexp_parser/syntax/tokens/{character_class.rb → posix_class.rb} +5 -5
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +519 -266
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -4
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +4 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +0 -2
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- data/regexp_parser.gemspec +2 -1
- data/test/expression/test_base.rb +2 -1
- data/test/expression/test_clone.rb +0 -57
- data/test/expression/test_set.rb +31 -8
- data/test/expression/test_strfregexp.rb +13 -4
- data/test/expression/test_subexpression.rb +25 -0
- data/test/expression/test_traverse.rb +25 -25
- data/test/helpers.rb +1 -0
- data/test/lexer/test_all.rb +1 -1
- data/test/lexer/test_conditionals.rb +9 -7
- data/test/lexer/test_nesting.rb +39 -21
- data/test/lexer/test_refcalls.rb +4 -4
- data/test/parser/set/test_intersections.rb +127 -0
- data/test/parser/set/test_ranges.rb +111 -0
- data/test/parser/test_all.rb +4 -1
- data/test/parser/test_escapes.rb +41 -9
- data/test/parser/test_groups.rb +22 -3
- data/test/parser/test_posix_classes.rb +27 -0
- data/test/parser/test_properties.rb +17 -290
- data/test/parser/test_refcalls.rb +66 -26
- data/test/parser/test_sets.rb +132 -129
- data/test/scanner/test_all.rb +1 -7
- data/test/scanner/test_conditionals.rb +16 -16
- data/test/scanner/test_errors.rb +0 -30
- data/test/scanner/test_escapes.rb +1 -2
- data/test/scanner/test_free_space.rb +28 -28
- data/test/scanner/test_groups.rb +35 -35
- data/test/scanner/test_meta.rb +1 -1
- data/test/scanner/test_properties.rb +87 -114
- data/test/scanner/test_refcalls.rb +18 -18
- data/test/scanner/test_scripts.rb +19 -351
- data/test/scanner/test_sets.rb +87 -60
- data/test/scanner/test_unicode_blocks.rb +4 -105
- data/test/support/warning_extractor.rb +1 -1
- data/test/syntax/test_syntax.rb +7 -0
- data/test/syntax/versions/test_1.8.rb +2 -4
- metadata +17 -7
- data/ChangeLog +0 -325
- data/test/scanner/test_emojis.rb +0 -31
@@ -1,6 +1,7 @@
|
|
1
1
|
%%{
|
2
2
|
machine re_scanner;
|
3
|
-
include
|
3
|
+
include re_char_type "char_type.rl";
|
4
|
+
include re_property "property.rl";
|
4
5
|
|
5
6
|
dot = '.';
|
6
7
|
backslash = '\\';
|
@@ -35,25 +36,17 @@
|
|
35
36
|
collating_sequence = '[.' . (alpha | [\-])+ . '.]';
|
36
37
|
character_equivalent = '[=' . alpha . '=]';
|
37
38
|
|
38
|
-
char_type = [dDhHsSwWRX];
|
39
|
-
|
40
39
|
line_anchor = beginning_of_line | end_of_line;
|
41
40
|
anchor_char = [AbBzZG];
|
42
41
|
|
43
|
-
escaped_ascii = [
|
42
|
+
escaped_ascii = [abefnrtv];
|
44
43
|
octal_sequence = [0-7]{1,3};
|
45
44
|
|
46
45
|
hex_sequence = 'x' . xdigit{1,2};
|
47
46
|
hex_sequence_err = 'x' . [^0-9a-fA-F{];
|
48
|
-
wide_hex_sequence = 'x' . '{' . xdigit{1,8} . '}';
|
49
|
-
|
50
|
-
hex_or_not = (xdigit|[^0-9a-fA-F}]); # note closing curly at end
|
51
|
-
|
52
|
-
wide_hex_seq_invalid = 'x' . '{' . hex_or_not{1,9};
|
53
|
-
wide_hex_seq_empty = 'x' . '{' . (space+)? . '}';
|
54
47
|
|
55
48
|
codepoint_single = 'u' . xdigit{4};
|
56
|
-
codepoint_list = 'u{' . xdigit{1,
|
49
|
+
codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
|
57
50
|
codepoint_sequence = codepoint_single | codepoint_list;
|
58
51
|
|
59
52
|
control_sequence = ('c' | 'C-') . (backslash . 'M-')?;
|
@@ -110,6 +103,7 @@
|
|
110
103
|
|
111
104
|
group_type = group_atomic | group_passive | group_absence | group_named;
|
112
105
|
|
106
|
+
keep_mark = 'K';
|
113
107
|
|
114
108
|
assertion_type = assertion_lookahead | assertion_nlookahead |
|
115
109
|
assertion_lookbehind | assertion_nlookbehind;
|
@@ -119,16 +113,18 @@
|
|
119
113
|
curlies | parantheses | brackets |
|
120
114
|
line_anchor | quantifier_greedy;
|
121
115
|
|
122
|
-
ascii_print = ((0x20..0x7e) - meta_char)
|
123
|
-
ascii_nonprint = (0x01..0x1f | 0x7f)
|
116
|
+
ascii_print = ((0x20..0x7e) - meta_char);
|
117
|
+
ascii_nonprint = (0x01..0x1f | 0x7f);
|
118
|
+
|
119
|
+
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
120
|
+
utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
|
121
|
+
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
124
122
|
|
125
|
-
|
126
|
-
|
127
|
-
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf)+;
|
128
|
-
utf8_byte_sequence = utf8_2_byte | utf8_3_byte | utf8_4_byte;
|
123
|
+
non_literal_escape = char_type_char | anchor_char | escaped_ascii |
|
124
|
+
group_ref | keep_mark | [xucCM];
|
129
125
|
|
130
|
-
|
131
|
-
|
126
|
+
non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
|
127
|
+
multi_codepoint_char_type | [0-9cCM];
|
132
128
|
|
133
129
|
# EOF error, used where it can be detected
|
134
130
|
action premature_end_error {
|
@@ -150,11 +146,11 @@
|
|
150
146
|
# closing bracket of the set.
|
151
147
|
# --------------------------------------------------------------------------
|
152
148
|
character_set := |*
|
153
|
-
|
154
|
-
|
155
|
-
|
149
|
+
set_close > (set_meta, 2) {
|
150
|
+
set_depth -= 1
|
151
|
+
in_set = set_depth > 0 ? true : false
|
156
152
|
|
157
|
-
emit(
|
153
|
+
emit(:set, :close, *text(data, ts, te))
|
158
154
|
|
159
155
|
if set_depth == 0
|
160
156
|
fgoto main;
|
@@ -164,11 +160,11 @@
|
|
164
160
|
};
|
165
161
|
|
166
162
|
'-]' { # special case, emits two tokens
|
167
|
-
|
168
|
-
|
163
|
+
set_depth -= 1
|
164
|
+
in_set = set_depth > 0 ? true : false
|
169
165
|
|
170
|
-
emit(
|
171
|
-
emit(
|
166
|
+
emit(:literal, :literal, copy(data, ts..te-2), ts, te)
|
167
|
+
emit(:set, :close, copy(data, ts+1..te-1), ts, te)
|
172
168
|
|
173
169
|
if set_depth == 0
|
174
170
|
fgoto main;
|
@@ -177,59 +173,70 @@
|
|
177
173
|
end
|
178
174
|
};
|
179
175
|
|
176
|
+
'-&&' { # special case, emits two tokens
|
177
|
+
emit(:literal, :literal, '-', ts, te)
|
178
|
+
emit(:set, :intersection, '&&', ts, te)
|
179
|
+
};
|
180
|
+
|
180
181
|
'^' {
|
181
182
|
text = text(data, ts, te).first
|
182
183
|
if tokens.last[1] == :open
|
183
|
-
emit(
|
184
|
+
emit(:set, :negate, text, ts, te)
|
184
185
|
else
|
185
|
-
emit(
|
186
|
+
emit(:literal, :literal, text, ts, te)
|
186
187
|
end
|
187
188
|
};
|
188
189
|
|
189
|
-
|
190
|
-
|
190
|
+
'-' {
|
191
|
+
text = text(data, ts, te).first
|
192
|
+
# ranges cant start with a subset or intersection/negation/range operator
|
193
|
+
if tokens.last[0] == :set
|
194
|
+
emit(:literal, :literal, text, ts, te)
|
195
|
+
else
|
196
|
+
emit(:set, :range, text, ts, te)
|
197
|
+
end
|
191
198
|
};
|
192
199
|
|
200
|
+
# Unlike ranges, intersections can start or end at set boundaries, whereupon
|
201
|
+
# they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
|
193
202
|
'&&' {
|
194
|
-
emit(
|
203
|
+
emit(:set, :intersection, *text(data, ts, te))
|
195
204
|
};
|
196
205
|
|
197
|
-
|
206
|
+
backslash {
|
198
207
|
fcall set_escape_sequence;
|
199
208
|
};
|
200
209
|
|
201
|
-
|
202
|
-
set_depth += 1
|
203
|
-
set_type = set_depth > 1 ? :subset : :set
|
210
|
+
set_open >(open_bracket, 1) {
|
211
|
+
set_depth += 1
|
204
212
|
|
205
|
-
emit(
|
213
|
+
emit(:set, :open, *text(data, ts, te))
|
206
214
|
fcall character_set;
|
207
215
|
};
|
208
216
|
|
209
217
|
class_posix >(open_bracket, 1) @eof(premature_end_error) {
|
210
218
|
text = text(data, ts, te).first
|
211
219
|
|
220
|
+
type = :posixclass
|
212
221
|
class_name = text[2..-3]
|
213
222
|
if class_name[0].chr == '^'
|
214
|
-
class_name =
|
223
|
+
class_name = class_name[1..-1]
|
224
|
+
type = :nonposixclass
|
215
225
|
end
|
216
226
|
|
217
|
-
|
218
|
-
emit(set_type, token_sym, text, ts, te)
|
227
|
+
emit(type, class_name.to_sym, text, ts, te)
|
219
228
|
};
|
220
229
|
|
221
230
|
collating_sequence >(open_bracket, 1) @eof(premature_end_error) {
|
222
|
-
emit(
|
231
|
+
emit(:set, :collation, *text(data, ts, te))
|
223
232
|
};
|
224
233
|
|
225
234
|
character_equivalent >(open_bracket, 1) @eof(premature_end_error) {
|
226
|
-
emit(
|
235
|
+
emit(:set, :equivalent, *text(data, ts, te))
|
227
236
|
};
|
228
237
|
|
229
|
-
|
230
|
-
|
231
|
-
meta_char -- ']' {
|
232
|
-
emit(set_type, :member, *text(data, ts, te))
|
238
|
+
meta_char > (set_meta, 1) {
|
239
|
+
emit(:literal, :literal, *text(data, ts, te))
|
233
240
|
};
|
234
241
|
|
235
242
|
any |
|
@@ -237,63 +244,24 @@
|
|
237
244
|
utf8_2_byte |
|
238
245
|
utf8_3_byte |
|
239
246
|
utf8_4_byte {
|
240
|
-
|
247
|
+
char, *rest = *text(data, ts, te)
|
248
|
+
char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
|
249
|
+
emit(:literal, :literal, char, *rest)
|
241
250
|
};
|
242
251
|
*|;
|
243
252
|
|
244
253
|
# set escapes scanner
|
245
254
|
# --------------------------------------------------------------------------
|
246
255
|
set_escape_sequence := |*
|
247
|
-
|
248
|
-
emit(
|
249
|
-
fret;
|
250
|
-
};
|
251
|
-
|
252
|
-
char_type > (escaped_set_alpha, 4) {
|
253
|
-
case text = text(data, ts, te, 1).first
|
254
|
-
when '\d'; emit(set_type, :type_digit, text, ts-1, te)
|
255
|
-
when '\D'; emit(set_type, :type_nondigit, text, ts-1, te)
|
256
|
-
when '\h'; emit(set_type, :type_hex, text, ts-1, te)
|
257
|
-
when '\H'; emit(set_type, :type_nonhex, text, ts-1, te)
|
258
|
-
when '\s'; emit(set_type, :type_space, text, ts-1, te)
|
259
|
-
when '\S'; emit(set_type, :type_nonspace, text, ts-1, te)
|
260
|
-
when '\w'; emit(set_type, :type_word, text, ts-1, te)
|
261
|
-
when '\W'; emit(set_type, :type_nonword, text, ts-1, te)
|
262
|
-
when '\R'; emit(set_type, :type_linebreak, text, ts-1, te)
|
263
|
-
when '\X'; emit(set_type, :type_xgrapheme, text, ts-1, te)
|
264
|
-
end
|
265
|
-
fret;
|
266
|
-
};
|
267
|
-
|
268
|
-
hex_sequence . '-\\' . hex_sequence {
|
269
|
-
emit(set_type, :range_hex, *text(data, ts, te, 1))
|
270
|
-
fret;
|
271
|
-
};
|
272
|
-
|
273
|
-
hex_sequence {
|
274
|
-
emit(set_type, :member_hex, *text(data, ts, te, 1))
|
275
|
-
fret;
|
276
|
-
};
|
277
|
-
|
278
|
-
meta_char | [\\\]\-\,] {
|
279
|
-
emit(set_type, :escape, *text(data, ts, te, 1))
|
256
|
+
non_set_escape > (escaped_set_alpha, 2) {
|
257
|
+
emit(:escape, :literal, *text(data, ts, te, 1))
|
280
258
|
fret;
|
281
259
|
};
|
282
260
|
|
283
|
-
|
261
|
+
any > (escaped_set_alpha, 1) {
|
284
262
|
fhold;
|
285
263
|
fnext character_set;
|
286
|
-
fcall
|
287
|
-
};
|
288
|
-
|
289
|
-
# special case exclusion of escaped dash, could be cleaner.
|
290
|
-
(ascii_print - char_type -- [\-}]) > (escaped_set_alpha, 1) |
|
291
|
-
ascii_nonprint |
|
292
|
-
utf8_2_byte |
|
293
|
-
utf8_3_byte |
|
294
|
-
utf8_4_byte {
|
295
|
-
emit(set_type, :escape, *text(data, ts, te, 1))
|
296
|
-
fret;
|
264
|
+
fcall escape_sequence;
|
297
265
|
};
|
298
266
|
*|;
|
299
267
|
|
@@ -338,11 +306,11 @@
|
|
338
306
|
# it is a word boundary anchor. A syntax might "normalize" it if needed.
|
339
307
|
case text = text(data, ts, te, 1).first
|
340
308
|
when '\a'; emit(:escape, :bell, text, ts-1, te)
|
309
|
+
when '\b'; emit(:escape, :backspace, text, ts-1, te)
|
341
310
|
when '\e'; emit(:escape, :escape, text, ts-1, te)
|
342
311
|
when '\f'; emit(:escape, :form_feed, text, ts-1, te)
|
343
312
|
when '\n'; emit(:escape, :newline, text, ts-1, te)
|
344
313
|
when '\r'; emit(:escape, :carriage, text, ts-1, te)
|
345
|
-
when '\s'; emit(:escape, :space, text, ts-1, te)
|
346
314
|
when '\t'; emit(:escape, :tab, text, ts-1, te)
|
347
315
|
when '\v'; emit(:escape, :vertical_tab, text, ts-1, te)
|
348
316
|
end
|
@@ -364,20 +332,10 @@
|
|
364
332
|
fret;
|
365
333
|
};
|
366
334
|
|
367
|
-
wide_hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
|
368
|
-
emit(:escape, :hex_wide, *text(data, ts, te, 1))
|
369
|
-
fret;
|
370
|
-
};
|
371
|
-
|
372
335
|
hex_sequence_err @invalid_sequence_error {
|
373
336
|
fret;
|
374
337
|
};
|
375
338
|
|
376
|
-
(wide_hex_seq_invalid | wide_hex_seq_empty) {
|
377
|
-
raise InvalidSequenceError.new("wide hex sequence")
|
378
|
-
fret;
|
379
|
-
};
|
380
|
-
|
381
339
|
control_sequence >(escaped_alpha, 4) $eof(premature_end_error) {
|
382
340
|
if data[te]
|
383
341
|
c = data[te].chr
|
@@ -408,9 +366,15 @@
|
|
408
366
|
fret;
|
409
367
|
};
|
410
368
|
|
369
|
+
char_type_char > (escaped_alpha, 2) {
|
370
|
+
fhold;
|
371
|
+
fnext *(in_set ? fentry(character_set) : fentry(main));
|
372
|
+
fcall char_type;
|
373
|
+
};
|
374
|
+
|
411
375
|
property_char > (escaped_alpha, 2) {
|
412
376
|
fhold;
|
413
|
-
fnext main;
|
377
|
+
fnext *(in_set ? fentry(character_set) : fentry(main));
|
414
378
|
fcall unicode_property;
|
415
379
|
};
|
416
380
|
|
@@ -466,7 +430,7 @@
|
|
466
430
|
emit(:anchor, :eol, *text(data, ts, te))
|
467
431
|
};
|
468
432
|
|
469
|
-
backslash .
|
433
|
+
backslash . keep_mark > (backslashed, 4) {
|
470
434
|
emit(:keep, :mark, *text(data, ts, te))
|
471
435
|
};
|
472
436
|
|
@@ -484,38 +448,13 @@
|
|
484
448
|
end
|
485
449
|
};
|
486
450
|
|
487
|
-
# Character types
|
488
|
-
# \d, \D digit, non-digit
|
489
|
-
# \h, \H hex, non-hex
|
490
|
-
# \s, \S space, non-space
|
491
|
-
# \w, \W word, non-word
|
492
|
-
# ------------------------------------------------------------------------
|
493
|
-
backslash . char_type > (backslashed, 2) {
|
494
|
-
case text = text(data, ts, te).first
|
495
|
-
when '\\d'; emit(:type, :digit, text, ts, te)
|
496
|
-
when '\\D'; emit(:type, :nondigit, text, ts, te)
|
497
|
-
when '\\h'; emit(:type, :hex, text, ts, te)
|
498
|
-
when '\\H'; emit(:type, :nonhex, text, ts, te)
|
499
|
-
when '\\s'; emit(:type, :space, text, ts, te)
|
500
|
-
when '\\S'; emit(:type, :nonspace, text, ts, te)
|
501
|
-
when '\\w'; emit(:type, :word, text, ts, te)
|
502
|
-
when '\\W'; emit(:type, :nonword, text, ts, te)
|
503
|
-
when '\\R'; emit(:type, :linebreak, text, ts, te)
|
504
|
-
when '\\X'; emit(:type, :xgrapheme, text, ts, te)
|
505
|
-
else
|
506
|
-
raise ScannerError.new(
|
507
|
-
"Unexpected character in type at #{text} (char #{ts})")
|
508
|
-
end
|
509
|
-
};
|
510
|
-
|
511
|
-
|
512
451
|
# Character sets
|
513
452
|
# ------------------------------------------------------------------------
|
514
453
|
set_open {
|
515
|
-
set_depth += 1
|
516
|
-
|
454
|
+
set_depth += 1
|
455
|
+
in_set = true
|
517
456
|
|
518
|
-
emit(
|
457
|
+
emit(:set, :open, *text(data, ts, te))
|
519
458
|
fcall character_set;
|
520
459
|
};
|
521
460
|
|
@@ -645,57 +584,57 @@
|
|
645
584
|
|
646
585
|
when /^\\([gk])<[^\d-](\w+)?>/ # angle-brackets
|
647
586
|
if $1 == 'k'
|
648
|
-
emit(:backref, :name_ref_ab,
|
587
|
+
emit(:backref, :name_ref_ab, text, ts, te)
|
649
588
|
else
|
650
|
-
emit(:backref, :name_call_ab,
|
589
|
+
emit(:backref, :name_call_ab, text, ts, te)
|
651
590
|
end
|
652
591
|
|
653
592
|
when /^\\([gk])'[^\d-](\w+)?'/ #single quotes
|
654
593
|
if $1 == 'k'
|
655
|
-
emit(:backref, :name_ref_sq,
|
594
|
+
emit(:backref, :name_ref_sq, text, ts, te)
|
656
595
|
else
|
657
|
-
emit(:backref, :name_call_sq,
|
596
|
+
emit(:backref, :name_call_sq, text, ts, te)
|
658
597
|
end
|
659
598
|
|
660
599
|
when /^\\([gk])<\d+>/ # angle-brackets
|
661
600
|
if $1 == 'k'
|
662
|
-
emit(:backref, :number_ref_ab,
|
601
|
+
emit(:backref, :number_ref_ab, text, ts, te)
|
663
602
|
else
|
664
|
-
emit(:backref, :number_call_ab,
|
603
|
+
emit(:backref, :number_call_ab, text, ts, te)
|
665
604
|
end
|
666
605
|
|
667
606
|
when /^\\([gk])'\d+'/ # single quotes
|
668
607
|
if $1 == 'k'
|
669
|
-
emit(:backref, :number_ref_sq,
|
608
|
+
emit(:backref, :number_ref_sq, text, ts, te)
|
670
609
|
else
|
671
|
-
emit(:backref, :number_call_sq,
|
610
|
+
emit(:backref, :number_call_sq, text, ts, te)
|
672
611
|
end
|
673
612
|
|
674
613
|
when /^\\([gk])<-\d+>/ # angle-brackets
|
675
614
|
if $1 == 'k'
|
676
|
-
emit(:backref, :number_rel_ref_ab,
|
615
|
+
emit(:backref, :number_rel_ref_ab, text, ts, te)
|
677
616
|
else
|
678
|
-
emit(:backref, :number_rel_call_ab,
|
617
|
+
emit(:backref, :number_rel_call_ab, text, ts, te)
|
679
618
|
end
|
680
619
|
|
681
620
|
when /^\\([gk])'-\d+'/ # single quotes
|
682
621
|
if $1 == 'k'
|
683
|
-
emit(:backref, :number_rel_ref_sq,
|
622
|
+
emit(:backref, :number_rel_ref_sq, text, ts, te)
|
684
623
|
else
|
685
|
-
emit(:backref, :number_rel_call_sq,
|
624
|
+
emit(:backref, :number_rel_call_sq, text, ts, te)
|
686
625
|
end
|
687
626
|
|
688
627
|
when /^\\k<[^\d-](\w+)?[+\-]\d+>/ # angle-brackets
|
689
|
-
emit(:backref, :
|
628
|
+
emit(:backref, :name_recursion_ref_ab, text, ts, te)
|
690
629
|
|
691
630
|
when /^\\k'[^\d-](\w+)?[+\-]\d+'/ # single-quotes
|
692
|
-
emit(:backref, :
|
631
|
+
emit(:backref, :name_recursion_ref_sq, text, ts, te)
|
693
632
|
|
694
|
-
when /^\\([gk])
|
695
|
-
emit(:backref, :
|
633
|
+
when /^\\([gk])<-?\d+[+\-]\d+>/ # angle-brackets
|
634
|
+
emit(:backref, :number_recursion_ref_ab, text, ts, te)
|
696
635
|
|
697
|
-
when /^\\([gk])'
|
698
|
-
emit(:backref, :
|
636
|
+
when /^\\([gk])'-?\d+[+\-]\d+'/ # single-quotes
|
637
|
+
emit(:backref, :number_recursion_ref_sq, text, ts, te)
|
699
638
|
|
700
639
|
else
|
701
640
|
raise ScannerError.new(
|
@@ -859,8 +798,11 @@ class Regexp::Scanner
|
|
859
798
|
self.group_depth = 0
|
860
799
|
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
861
800
|
|
862
|
-
in_set
|
863
|
-
|
801
|
+
in_set = false
|
802
|
+
set_depth = 0
|
803
|
+
in_conditional = false
|
804
|
+
conditional_depth = 0
|
805
|
+
conditional_stack = []
|
864
806
|
|
865
807
|
%% write data;
|
866
808
|
%% write init;
|
@@ -882,6 +824,18 @@ class Regexp::Scanner
|
|
882
824
|
tokens
|
883
825
|
end
|
884
826
|
|
827
|
+
# lazy-load property maps when first needed
|
828
|
+
require 'yaml'
|
829
|
+
PROP_MAPS_DIR = File.expand_path('../scanner/properties', __FILE__)
|
830
|
+
|
831
|
+
def self.short_prop_map
|
832
|
+
@short_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/short.yml")
|
833
|
+
end
|
834
|
+
|
835
|
+
def self.long_prop_map
|
836
|
+
@long_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/long.yml")
|
837
|
+
end
|
838
|
+
|
885
839
|
# Emits an array with the details of the scanned pattern
|
886
840
|
def emit(type, token, text, ts, te)
|
887
841
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
@@ -986,6 +940,8 @@ class Regexp::Scanner
|
|
986
940
|
end
|
987
941
|
|
988
942
|
def emit_options(text, ts, te)
|
943
|
+
token = nil
|
944
|
+
|
989
945
|
if text =~ /\(\?([mixdau]*)-?([mix]*)(:)?/
|
990
946
|
positive, negative, group_local = $1, $2, $3
|
991
947
|
|
@@ -1001,13 +957,15 @@ class Regexp::Scanner
|
|
1001
957
|
|
1002
958
|
if group_local
|
1003
959
|
spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
|
960
|
+
token = :options
|
1004
961
|
else
|
1005
962
|
# switch for parent group level
|
1006
963
|
spacing_stack.last[:free_spacing] = free_spacing
|
964
|
+
token = :options_switch
|
1007
965
|
end
|
1008
966
|
end
|
1009
967
|
|
1010
|
-
emit(:group,
|
968
|
+
emit(:group, token, text, ts, te)
|
1011
969
|
end
|
1012
970
|
|
1013
971
|
# Centralizes and unifies the handling of validation related
|