regexp_parser 0.5.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +242 -0
- data/Gemfile +1 -0
- data/README.md +21 -17
- data/Rakefile +31 -0
- data/lib/regexp_parser/expression.rb +11 -9
- data/lib/regexp_parser/expression/classes/alternation.rb +5 -28
- data/lib/regexp_parser/expression/classes/backref.rb +21 -16
- data/lib/regexp_parser/expression/classes/escape.rb +81 -10
- data/lib/regexp_parser/expression/classes/group.rb +20 -20
- data/lib/regexp_parser/expression/classes/{character_class.rb → posix_class.rb} +2 -2
- data/lib/regexp_parser/expression/classes/property.rb +6 -0
- data/lib/regexp_parser/expression/classes/set.rb +10 -93
- data/lib/regexp_parser/expression/classes/set/intersection.rb +9 -0
- data/lib/regexp_parser/expression/classes/set/range.rb +23 -0
- data/lib/regexp_parser/expression/methods/strfregexp.rb +6 -4
- data/lib/regexp_parser/expression/methods/tests.rb +4 -14
- data/lib/regexp_parser/expression/methods/traverse.rb +1 -1
- data/lib/regexp_parser/expression/quantifier.rb +3 -4
- data/lib/regexp_parser/expression/sequence_operation.rb +34 -0
- data/lib/regexp_parser/expression/subexpression.rb +6 -10
- data/lib/regexp_parser/lexer.rb +13 -17
- data/lib/regexp_parser/parser.rb +170 -116
- data/lib/regexp_parser/scanner.rb +952 -2431
- data/lib/regexp_parser/scanner/char_type.rl +31 -0
- data/lib/regexp_parser/scanner/properties/long.yml +561 -0
- data/lib/regexp_parser/scanner/properties/short.yml +225 -0
- data/lib/regexp_parser/scanner/property.rl +7 -806
- data/lib/regexp_parser/scanner/scanner.rl +112 -154
- data/lib/regexp_parser/syntax/base.rb +4 -4
- data/lib/regexp_parser/syntax/tokens.rb +1 -0
- data/lib/regexp_parser/syntax/tokens/backref.rb +2 -2
- data/lib/regexp_parser/syntax/tokens/character_set.rb +3 -38
- data/lib/regexp_parser/syntax/tokens/escape.rb +2 -3
- data/lib/regexp_parser/syntax/tokens/group.rb +5 -4
- data/lib/regexp_parser/syntax/tokens/{character_class.rb → posix_class.rb} +5 -5
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +519 -266
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -4
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +4 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +0 -2
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- data/regexp_parser.gemspec +2 -1
- data/test/expression/test_base.rb +2 -1
- data/test/expression/test_clone.rb +0 -57
- data/test/expression/test_set.rb +31 -8
- data/test/expression/test_strfregexp.rb +13 -4
- data/test/expression/test_subexpression.rb +25 -0
- data/test/expression/test_traverse.rb +25 -25
- data/test/helpers.rb +1 -0
- data/test/lexer/test_all.rb +1 -1
- data/test/lexer/test_conditionals.rb +9 -7
- data/test/lexer/test_nesting.rb +39 -21
- data/test/lexer/test_refcalls.rb +4 -4
- data/test/parser/set/test_intersections.rb +127 -0
- data/test/parser/set/test_ranges.rb +111 -0
- data/test/parser/test_all.rb +4 -1
- data/test/parser/test_escapes.rb +41 -9
- data/test/parser/test_groups.rb +22 -3
- data/test/parser/test_posix_classes.rb +27 -0
- data/test/parser/test_properties.rb +17 -290
- data/test/parser/test_refcalls.rb +66 -26
- data/test/parser/test_sets.rb +132 -129
- data/test/scanner/test_all.rb +1 -7
- data/test/scanner/test_conditionals.rb +16 -16
- data/test/scanner/test_errors.rb +0 -30
- data/test/scanner/test_escapes.rb +1 -2
- data/test/scanner/test_free_space.rb +28 -28
- data/test/scanner/test_groups.rb +35 -35
- data/test/scanner/test_meta.rb +1 -1
- data/test/scanner/test_properties.rb +87 -114
- data/test/scanner/test_refcalls.rb +18 -18
- data/test/scanner/test_scripts.rb +19 -351
- data/test/scanner/test_sets.rb +87 -60
- data/test/scanner/test_unicode_blocks.rb +4 -105
- data/test/support/warning_extractor.rb +1 -1
- data/test/syntax/test_syntax.rb +7 -0
- data/test/syntax/versions/test_1.8.rb +2 -4
- metadata +17 -7
- data/ChangeLog +0 -325
- data/test/scanner/test_emojis.rb +0 -31
@@ -1,6 +1,7 @@
|
|
1
1
|
%%{
|
2
2
|
machine re_scanner;
|
3
|
-
include
|
3
|
+
include re_char_type "char_type.rl";
|
4
|
+
include re_property "property.rl";
|
4
5
|
|
5
6
|
dot = '.';
|
6
7
|
backslash = '\\';
|
@@ -35,25 +36,17 @@
|
|
35
36
|
collating_sequence = '[.' . (alpha | [\-])+ . '.]';
|
36
37
|
character_equivalent = '[=' . alpha . '=]';
|
37
38
|
|
38
|
-
char_type = [dDhHsSwWRX];
|
39
|
-
|
40
39
|
line_anchor = beginning_of_line | end_of_line;
|
41
40
|
anchor_char = [AbBzZG];
|
42
41
|
|
43
|
-
escaped_ascii = [
|
42
|
+
escaped_ascii = [abefnrtv];
|
44
43
|
octal_sequence = [0-7]{1,3};
|
45
44
|
|
46
45
|
hex_sequence = 'x' . xdigit{1,2};
|
47
46
|
hex_sequence_err = 'x' . [^0-9a-fA-F{];
|
48
|
-
wide_hex_sequence = 'x' . '{' . xdigit{1,8} . '}';
|
49
|
-
|
50
|
-
hex_or_not = (xdigit|[^0-9a-fA-F}]); # note closing curly at end
|
51
|
-
|
52
|
-
wide_hex_seq_invalid = 'x' . '{' . hex_or_not{1,9};
|
53
|
-
wide_hex_seq_empty = 'x' . '{' . (space+)? . '}';
|
54
47
|
|
55
48
|
codepoint_single = 'u' . xdigit{4};
|
56
|
-
codepoint_list = 'u{' . xdigit{1,
|
49
|
+
codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
|
57
50
|
codepoint_sequence = codepoint_single | codepoint_list;
|
58
51
|
|
59
52
|
control_sequence = ('c' | 'C-') . (backslash . 'M-')?;
|
@@ -110,6 +103,7 @@
|
|
110
103
|
|
111
104
|
group_type = group_atomic | group_passive | group_absence | group_named;
|
112
105
|
|
106
|
+
keep_mark = 'K';
|
113
107
|
|
114
108
|
assertion_type = assertion_lookahead | assertion_nlookahead |
|
115
109
|
assertion_lookbehind | assertion_nlookbehind;
|
@@ -119,16 +113,18 @@
|
|
119
113
|
curlies | parantheses | brackets |
|
120
114
|
line_anchor | quantifier_greedy;
|
121
115
|
|
122
|
-
ascii_print = ((0x20..0x7e) - meta_char)
|
123
|
-
ascii_nonprint = (0x01..0x1f | 0x7f)
|
116
|
+
ascii_print = ((0x20..0x7e) - meta_char);
|
117
|
+
ascii_nonprint = (0x01..0x1f | 0x7f);
|
118
|
+
|
119
|
+
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
120
|
+
utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
|
121
|
+
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
124
122
|
|
125
|
-
|
126
|
-
|
127
|
-
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf)+;
|
128
|
-
utf8_byte_sequence = utf8_2_byte | utf8_3_byte | utf8_4_byte;
|
123
|
+
non_literal_escape = char_type_char | anchor_char | escaped_ascii |
|
124
|
+
group_ref | keep_mark | [xucCM];
|
129
125
|
|
130
|
-
|
131
|
-
|
126
|
+
non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
|
127
|
+
multi_codepoint_char_type | [0-9cCM];
|
132
128
|
|
133
129
|
# EOF error, used where it can be detected
|
134
130
|
action premature_end_error {
|
@@ -150,11 +146,11 @@
|
|
150
146
|
# closing bracket of the set.
|
151
147
|
# --------------------------------------------------------------------------
|
152
148
|
character_set := |*
|
153
|
-
|
154
|
-
|
155
|
-
|
149
|
+
set_close > (set_meta, 2) {
|
150
|
+
set_depth -= 1
|
151
|
+
in_set = set_depth > 0 ? true : false
|
156
152
|
|
157
|
-
emit(
|
153
|
+
emit(:set, :close, *text(data, ts, te))
|
158
154
|
|
159
155
|
if set_depth == 0
|
160
156
|
fgoto main;
|
@@ -164,11 +160,11 @@
|
|
164
160
|
};
|
165
161
|
|
166
162
|
'-]' { # special case, emits two tokens
|
167
|
-
|
168
|
-
|
163
|
+
set_depth -= 1
|
164
|
+
in_set = set_depth > 0 ? true : false
|
169
165
|
|
170
|
-
emit(
|
171
|
-
emit(
|
166
|
+
emit(:literal, :literal, copy(data, ts..te-2), ts, te)
|
167
|
+
emit(:set, :close, copy(data, ts+1..te-1), ts, te)
|
172
168
|
|
173
169
|
if set_depth == 0
|
174
170
|
fgoto main;
|
@@ -177,59 +173,70 @@
|
|
177
173
|
end
|
178
174
|
};
|
179
175
|
|
176
|
+
'-&&' { # special case, emits two tokens
|
177
|
+
emit(:literal, :literal, '-', ts, te)
|
178
|
+
emit(:set, :intersection, '&&', ts, te)
|
179
|
+
};
|
180
|
+
|
180
181
|
'^' {
|
181
182
|
text = text(data, ts, te).first
|
182
183
|
if tokens.last[1] == :open
|
183
|
-
emit(
|
184
|
+
emit(:set, :negate, text, ts, te)
|
184
185
|
else
|
185
|
-
emit(
|
186
|
+
emit(:literal, :literal, text, ts, te)
|
186
187
|
end
|
187
188
|
};
|
188
189
|
|
189
|
-
|
190
|
-
|
190
|
+
'-' {
|
191
|
+
text = text(data, ts, te).first
|
192
|
+
# ranges cant start with a subset or intersection/negation/range operator
|
193
|
+
if tokens.last[0] == :set
|
194
|
+
emit(:literal, :literal, text, ts, te)
|
195
|
+
else
|
196
|
+
emit(:set, :range, text, ts, te)
|
197
|
+
end
|
191
198
|
};
|
192
199
|
|
200
|
+
# Unlike ranges, intersections can start or end at set boundaries, whereupon
|
201
|
+
# they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
|
193
202
|
'&&' {
|
194
|
-
emit(
|
203
|
+
emit(:set, :intersection, *text(data, ts, te))
|
195
204
|
};
|
196
205
|
|
197
|
-
|
206
|
+
backslash {
|
198
207
|
fcall set_escape_sequence;
|
199
208
|
};
|
200
209
|
|
201
|
-
|
202
|
-
set_depth += 1
|
203
|
-
set_type = set_depth > 1 ? :subset : :set
|
210
|
+
set_open >(open_bracket, 1) {
|
211
|
+
set_depth += 1
|
204
212
|
|
205
|
-
emit(
|
213
|
+
emit(:set, :open, *text(data, ts, te))
|
206
214
|
fcall character_set;
|
207
215
|
};
|
208
216
|
|
209
217
|
class_posix >(open_bracket, 1) @eof(premature_end_error) {
|
210
218
|
text = text(data, ts, te).first
|
211
219
|
|
220
|
+
type = :posixclass
|
212
221
|
class_name = text[2..-3]
|
213
222
|
if class_name[0].chr == '^'
|
214
|
-
class_name =
|
223
|
+
class_name = class_name[1..-1]
|
224
|
+
type = :nonposixclass
|
215
225
|
end
|
216
226
|
|
217
|
-
|
218
|
-
emit(set_type, token_sym, text, ts, te)
|
227
|
+
emit(type, class_name.to_sym, text, ts, te)
|
219
228
|
};
|
220
229
|
|
221
230
|
collating_sequence >(open_bracket, 1) @eof(premature_end_error) {
|
222
|
-
emit(
|
231
|
+
emit(:set, :collation, *text(data, ts, te))
|
223
232
|
};
|
224
233
|
|
225
234
|
character_equivalent >(open_bracket, 1) @eof(premature_end_error) {
|
226
|
-
emit(
|
235
|
+
emit(:set, :equivalent, *text(data, ts, te))
|
227
236
|
};
|
228
237
|
|
229
|
-
|
230
|
-
|
231
|
-
meta_char -- ']' {
|
232
|
-
emit(set_type, :member, *text(data, ts, te))
|
238
|
+
meta_char > (set_meta, 1) {
|
239
|
+
emit(:literal, :literal, *text(data, ts, te))
|
233
240
|
};
|
234
241
|
|
235
242
|
any |
|
@@ -237,63 +244,24 @@
|
|
237
244
|
utf8_2_byte |
|
238
245
|
utf8_3_byte |
|
239
246
|
utf8_4_byte {
|
240
|
-
|
247
|
+
char, *rest = *text(data, ts, te)
|
248
|
+
char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
|
249
|
+
emit(:literal, :literal, char, *rest)
|
241
250
|
};
|
242
251
|
*|;
|
243
252
|
|
244
253
|
# set escapes scanner
|
245
254
|
# --------------------------------------------------------------------------
|
246
255
|
set_escape_sequence := |*
|
247
|
-
|
248
|
-
emit(
|
249
|
-
fret;
|
250
|
-
};
|
251
|
-
|
252
|
-
char_type > (escaped_set_alpha, 4) {
|
253
|
-
case text = text(data, ts, te, 1).first
|
254
|
-
when '\d'; emit(set_type, :type_digit, text, ts-1, te)
|
255
|
-
when '\D'; emit(set_type, :type_nondigit, text, ts-1, te)
|
256
|
-
when '\h'; emit(set_type, :type_hex, text, ts-1, te)
|
257
|
-
when '\H'; emit(set_type, :type_nonhex, text, ts-1, te)
|
258
|
-
when '\s'; emit(set_type, :type_space, text, ts-1, te)
|
259
|
-
when '\S'; emit(set_type, :type_nonspace, text, ts-1, te)
|
260
|
-
when '\w'; emit(set_type, :type_word, text, ts-1, te)
|
261
|
-
when '\W'; emit(set_type, :type_nonword, text, ts-1, te)
|
262
|
-
when '\R'; emit(set_type, :type_linebreak, text, ts-1, te)
|
263
|
-
when '\X'; emit(set_type, :type_xgrapheme, text, ts-1, te)
|
264
|
-
end
|
265
|
-
fret;
|
266
|
-
};
|
267
|
-
|
268
|
-
hex_sequence . '-\\' . hex_sequence {
|
269
|
-
emit(set_type, :range_hex, *text(data, ts, te, 1))
|
270
|
-
fret;
|
271
|
-
};
|
272
|
-
|
273
|
-
hex_sequence {
|
274
|
-
emit(set_type, :member_hex, *text(data, ts, te, 1))
|
275
|
-
fret;
|
276
|
-
};
|
277
|
-
|
278
|
-
meta_char | [\\\]\-\,] {
|
279
|
-
emit(set_type, :escape, *text(data, ts, te, 1))
|
256
|
+
non_set_escape > (escaped_set_alpha, 2) {
|
257
|
+
emit(:escape, :literal, *text(data, ts, te, 1))
|
280
258
|
fret;
|
281
259
|
};
|
282
260
|
|
283
|
-
|
261
|
+
any > (escaped_set_alpha, 1) {
|
284
262
|
fhold;
|
285
263
|
fnext character_set;
|
286
|
-
fcall
|
287
|
-
};
|
288
|
-
|
289
|
-
# special case exclusion of escaped dash, could be cleaner.
|
290
|
-
(ascii_print - char_type -- [\-}]) > (escaped_set_alpha, 1) |
|
291
|
-
ascii_nonprint |
|
292
|
-
utf8_2_byte |
|
293
|
-
utf8_3_byte |
|
294
|
-
utf8_4_byte {
|
295
|
-
emit(set_type, :escape, *text(data, ts, te, 1))
|
296
|
-
fret;
|
264
|
+
fcall escape_sequence;
|
297
265
|
};
|
298
266
|
*|;
|
299
267
|
|
@@ -338,11 +306,11 @@
|
|
338
306
|
# it is a word boundary anchor. A syntax might "normalize" it if needed.
|
339
307
|
case text = text(data, ts, te, 1).first
|
340
308
|
when '\a'; emit(:escape, :bell, text, ts-1, te)
|
309
|
+
when '\b'; emit(:escape, :backspace, text, ts-1, te)
|
341
310
|
when '\e'; emit(:escape, :escape, text, ts-1, te)
|
342
311
|
when '\f'; emit(:escape, :form_feed, text, ts-1, te)
|
343
312
|
when '\n'; emit(:escape, :newline, text, ts-1, te)
|
344
313
|
when '\r'; emit(:escape, :carriage, text, ts-1, te)
|
345
|
-
when '\s'; emit(:escape, :space, text, ts-1, te)
|
346
314
|
when '\t'; emit(:escape, :tab, text, ts-1, te)
|
347
315
|
when '\v'; emit(:escape, :vertical_tab, text, ts-1, te)
|
348
316
|
end
|
@@ -364,20 +332,10 @@
|
|
364
332
|
fret;
|
365
333
|
};
|
366
334
|
|
367
|
-
wide_hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
|
368
|
-
emit(:escape, :hex_wide, *text(data, ts, te, 1))
|
369
|
-
fret;
|
370
|
-
};
|
371
|
-
|
372
335
|
hex_sequence_err @invalid_sequence_error {
|
373
336
|
fret;
|
374
337
|
};
|
375
338
|
|
376
|
-
(wide_hex_seq_invalid | wide_hex_seq_empty) {
|
377
|
-
raise InvalidSequenceError.new("wide hex sequence")
|
378
|
-
fret;
|
379
|
-
};
|
380
|
-
|
381
339
|
control_sequence >(escaped_alpha, 4) $eof(premature_end_error) {
|
382
340
|
if data[te]
|
383
341
|
c = data[te].chr
|
@@ -408,9 +366,15 @@
|
|
408
366
|
fret;
|
409
367
|
};
|
410
368
|
|
369
|
+
char_type_char > (escaped_alpha, 2) {
|
370
|
+
fhold;
|
371
|
+
fnext *(in_set ? fentry(character_set) : fentry(main));
|
372
|
+
fcall char_type;
|
373
|
+
};
|
374
|
+
|
411
375
|
property_char > (escaped_alpha, 2) {
|
412
376
|
fhold;
|
413
|
-
fnext main;
|
377
|
+
fnext *(in_set ? fentry(character_set) : fentry(main));
|
414
378
|
fcall unicode_property;
|
415
379
|
};
|
416
380
|
|
@@ -466,7 +430,7 @@
|
|
466
430
|
emit(:anchor, :eol, *text(data, ts, te))
|
467
431
|
};
|
468
432
|
|
469
|
-
backslash .
|
433
|
+
backslash . keep_mark > (backslashed, 4) {
|
470
434
|
emit(:keep, :mark, *text(data, ts, te))
|
471
435
|
};
|
472
436
|
|
@@ -484,38 +448,13 @@
|
|
484
448
|
end
|
485
449
|
};
|
486
450
|
|
487
|
-
# Character types
|
488
|
-
# \d, \D digit, non-digit
|
489
|
-
# \h, \H hex, non-hex
|
490
|
-
# \s, \S space, non-space
|
491
|
-
# \w, \W word, non-word
|
492
|
-
# ------------------------------------------------------------------------
|
493
|
-
backslash . char_type > (backslashed, 2) {
|
494
|
-
case text = text(data, ts, te).first
|
495
|
-
when '\\d'; emit(:type, :digit, text, ts, te)
|
496
|
-
when '\\D'; emit(:type, :nondigit, text, ts, te)
|
497
|
-
when '\\h'; emit(:type, :hex, text, ts, te)
|
498
|
-
when '\\H'; emit(:type, :nonhex, text, ts, te)
|
499
|
-
when '\\s'; emit(:type, :space, text, ts, te)
|
500
|
-
when '\\S'; emit(:type, :nonspace, text, ts, te)
|
501
|
-
when '\\w'; emit(:type, :word, text, ts, te)
|
502
|
-
when '\\W'; emit(:type, :nonword, text, ts, te)
|
503
|
-
when '\\R'; emit(:type, :linebreak, text, ts, te)
|
504
|
-
when '\\X'; emit(:type, :xgrapheme, text, ts, te)
|
505
|
-
else
|
506
|
-
raise ScannerError.new(
|
507
|
-
"Unexpected character in type at #{text} (char #{ts})")
|
508
|
-
end
|
509
|
-
};
|
510
|
-
|
511
|
-
|
512
451
|
# Character sets
|
513
452
|
# ------------------------------------------------------------------------
|
514
453
|
set_open {
|
515
|
-
set_depth += 1
|
516
|
-
|
454
|
+
set_depth += 1
|
455
|
+
in_set = true
|
517
456
|
|
518
|
-
emit(
|
457
|
+
emit(:set, :open, *text(data, ts, te))
|
519
458
|
fcall character_set;
|
520
459
|
};
|
521
460
|
|
@@ -645,57 +584,57 @@
|
|
645
584
|
|
646
585
|
when /^\\([gk])<[^\d-](\w+)?>/ # angle-brackets
|
647
586
|
if $1 == 'k'
|
648
|
-
emit(:backref, :name_ref_ab,
|
587
|
+
emit(:backref, :name_ref_ab, text, ts, te)
|
649
588
|
else
|
650
|
-
emit(:backref, :name_call_ab,
|
589
|
+
emit(:backref, :name_call_ab, text, ts, te)
|
651
590
|
end
|
652
591
|
|
653
592
|
when /^\\([gk])'[^\d-](\w+)?'/ #single quotes
|
654
593
|
if $1 == 'k'
|
655
|
-
emit(:backref, :name_ref_sq,
|
594
|
+
emit(:backref, :name_ref_sq, text, ts, te)
|
656
595
|
else
|
657
|
-
emit(:backref, :name_call_sq,
|
596
|
+
emit(:backref, :name_call_sq, text, ts, te)
|
658
597
|
end
|
659
598
|
|
660
599
|
when /^\\([gk])<\d+>/ # angle-brackets
|
661
600
|
if $1 == 'k'
|
662
|
-
emit(:backref, :number_ref_ab,
|
601
|
+
emit(:backref, :number_ref_ab, text, ts, te)
|
663
602
|
else
|
664
|
-
emit(:backref, :number_call_ab,
|
603
|
+
emit(:backref, :number_call_ab, text, ts, te)
|
665
604
|
end
|
666
605
|
|
667
606
|
when /^\\([gk])'\d+'/ # single quotes
|
668
607
|
if $1 == 'k'
|
669
|
-
emit(:backref, :number_ref_sq,
|
608
|
+
emit(:backref, :number_ref_sq, text, ts, te)
|
670
609
|
else
|
671
|
-
emit(:backref, :number_call_sq,
|
610
|
+
emit(:backref, :number_call_sq, text, ts, te)
|
672
611
|
end
|
673
612
|
|
674
613
|
when /^\\([gk])<-\d+>/ # angle-brackets
|
675
614
|
if $1 == 'k'
|
676
|
-
emit(:backref, :number_rel_ref_ab,
|
615
|
+
emit(:backref, :number_rel_ref_ab, text, ts, te)
|
677
616
|
else
|
678
|
-
emit(:backref, :number_rel_call_ab,
|
617
|
+
emit(:backref, :number_rel_call_ab, text, ts, te)
|
679
618
|
end
|
680
619
|
|
681
620
|
when /^\\([gk])'-\d+'/ # single quotes
|
682
621
|
if $1 == 'k'
|
683
|
-
emit(:backref, :number_rel_ref_sq,
|
622
|
+
emit(:backref, :number_rel_ref_sq, text, ts, te)
|
684
623
|
else
|
685
|
-
emit(:backref, :number_rel_call_sq,
|
624
|
+
emit(:backref, :number_rel_call_sq, text, ts, te)
|
686
625
|
end
|
687
626
|
|
688
627
|
when /^\\k<[^\d-](\w+)?[+\-]\d+>/ # angle-brackets
|
689
|
-
emit(:backref, :
|
628
|
+
emit(:backref, :name_recursion_ref_ab, text, ts, te)
|
690
629
|
|
691
630
|
when /^\\k'[^\d-](\w+)?[+\-]\d+'/ # single-quotes
|
692
|
-
emit(:backref, :
|
631
|
+
emit(:backref, :name_recursion_ref_sq, text, ts, te)
|
693
632
|
|
694
|
-
when /^\\([gk])
|
695
|
-
emit(:backref, :
|
633
|
+
when /^\\([gk])<-?\d+[+\-]\d+>/ # angle-brackets
|
634
|
+
emit(:backref, :number_recursion_ref_ab, text, ts, te)
|
696
635
|
|
697
|
-
when /^\\([gk])'
|
698
|
-
emit(:backref, :
|
636
|
+
when /^\\([gk])'-?\d+[+\-]\d+'/ # single-quotes
|
637
|
+
emit(:backref, :number_recursion_ref_sq, text, ts, te)
|
699
638
|
|
700
639
|
else
|
701
640
|
raise ScannerError.new(
|
@@ -859,8 +798,11 @@ class Regexp::Scanner
|
|
859
798
|
self.group_depth = 0
|
860
799
|
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
|
861
800
|
|
862
|
-
in_set
|
863
|
-
|
801
|
+
in_set = false
|
802
|
+
set_depth = 0
|
803
|
+
in_conditional = false
|
804
|
+
conditional_depth = 0
|
805
|
+
conditional_stack = []
|
864
806
|
|
865
807
|
%% write data;
|
866
808
|
%% write init;
|
@@ -882,6 +824,18 @@ class Regexp::Scanner
|
|
882
824
|
tokens
|
883
825
|
end
|
884
826
|
|
827
|
+
# lazy-load property maps when first needed
|
828
|
+
require 'yaml'
|
829
|
+
PROP_MAPS_DIR = File.expand_path('../scanner/properties', __FILE__)
|
830
|
+
|
831
|
+
def self.short_prop_map
|
832
|
+
@short_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/short.yml")
|
833
|
+
end
|
834
|
+
|
835
|
+
def self.long_prop_map
|
836
|
+
@long_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/long.yml")
|
837
|
+
end
|
838
|
+
|
885
839
|
# Emits an array with the details of the scanned pattern
|
886
840
|
def emit(type, token, text, ts, te)
|
887
841
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
@@ -986,6 +940,8 @@ class Regexp::Scanner
|
|
986
940
|
end
|
987
941
|
|
988
942
|
def emit_options(text, ts, te)
|
943
|
+
token = nil
|
944
|
+
|
989
945
|
if text =~ /\(\?([mixdau]*)-?([mix]*)(:)?/
|
990
946
|
positive, negative, group_local = $1, $2, $3
|
991
947
|
|
@@ -1001,13 +957,15 @@ class Regexp::Scanner
|
|
1001
957
|
|
1002
958
|
if group_local
|
1003
959
|
spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
|
960
|
+
token = :options
|
1004
961
|
else
|
1005
962
|
# switch for parent group level
|
1006
963
|
spacing_stack.last[:free_spacing] = free_spacing
|
964
|
+
token = :options_switch
|
1007
965
|
end
|
1008
966
|
end
|
1009
967
|
|
1010
|
-
emit(:group,
|
968
|
+
emit(:group, token, text, ts, te)
|
1011
969
|
end
|
1012
970
|
|
1013
971
|
# Centralizes and unifies the handling of validation related
|