regexp_parser 2.0.0 → 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/README.md +1 -4
- data/lib/regexp_parser/scanner.rb +998 -1095
- data/lib/regexp_parser/scanner/scanner.rl +24 -31
- data/lib/regexp_parser/version.rb +1 -1
- data/spec/scanner/escapes_spec.rb +1 -0
- data/spec/scanner/groups_spec.rb +10 -1
- metadata +3 -3
@@ -3,6 +3,11 @@
|
|
3
3
|
include re_char_type "char_type.rl";
|
4
4
|
include re_property "property.rl";
|
5
5
|
|
6
|
+
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
7
|
+
utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
|
8
|
+
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
9
|
+
utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
|
10
|
+
|
6
11
|
dot = '.';
|
7
12
|
backslash = '\\';
|
8
13
|
alternation = '|';
|
@@ -90,18 +95,19 @@
|
|
90
95
|
group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
|
91
96
|
|
92
97
|
group_ref = [gk];
|
93
|
-
|
94
|
-
|
95
|
-
group_number = '-'? . [1-9] .
|
98
|
+
group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
|
99
|
+
group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
|
100
|
+
group_number = '-'? . [1-9] . [0-9]*;
|
96
101
|
group_level = [+\-] . [0-9]+;
|
97
102
|
|
98
|
-
group_name = ('<' .
|
103
|
+
group_name = ('<' . group_name_id_ab? . '>') |
|
104
|
+
("'" . group_name_id_sq? . "'");
|
99
105
|
group_lookup = group_name | group_number;
|
100
106
|
|
101
107
|
group_named = ('?' . group_name );
|
102
108
|
|
103
|
-
group_name_ref = group_ref . (('<' .
|
104
|
-
("'" .
|
109
|
+
group_name_ref = group_ref . (('<' . group_name_id_ab? . group_level? '>') |
|
110
|
+
("'" . group_name_id_sq? . group_level? "'"));
|
105
111
|
|
106
112
|
group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
|
107
113
|
("'" . group_number . group_level? "'"));
|
@@ -123,10 +129,6 @@
|
|
123
129
|
ascii_print = ((0x20..0x7e) - meta_char - '#');
|
124
130
|
ascii_nonprint = (0x01..0x1f | 0x7f);
|
125
131
|
|
126
|
-
utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
|
127
|
-
utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
|
128
|
-
utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
|
129
|
-
|
130
132
|
non_literal_escape = char_type_char | anchor_char | escaped_ascii |
|
131
133
|
keep_mark | [xucCM];
|
132
134
|
|
@@ -238,11 +240,7 @@
|
|
238
240
|
emit(:literal, :literal, copy(data, ts, te))
|
239
241
|
};
|
240
242
|
|
241
|
-
any
|
242
|
-
ascii_nonprint |
|
243
|
-
utf8_2_byte |
|
244
|
-
utf8_3_byte |
|
245
|
-
utf8_4_byte {
|
243
|
+
any | ascii_nonprint | utf8_multibyte {
|
246
244
|
text = copy(data, ts, te)
|
247
245
|
emit(:literal, :literal, text)
|
248
246
|
};
|
@@ -325,7 +323,7 @@
|
|
325
323
|
fret;
|
326
324
|
};
|
327
325
|
|
328
|
-
hex_sequence > (escaped_alpha, 5)
|
326
|
+
hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
|
329
327
|
emit(:escape, :hex, copy(data, ts-1, te))
|
330
328
|
fret;
|
331
329
|
};
|
@@ -356,10 +354,7 @@
|
|
356
354
|
fcall unicode_property;
|
357
355
|
};
|
358
356
|
|
359
|
-
(any -- non_literal_escape) |
|
360
|
-
utf8_2_byte |
|
361
|
-
utf8_3_byte |
|
362
|
-
utf8_4_byte > (escaped_alpha, 1) {
|
357
|
+
(any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
|
363
358
|
emit(:escape, :literal, copy(data, ts-1, te))
|
364
359
|
fret;
|
365
360
|
};
|
@@ -511,10 +506,10 @@
|
|
511
506
|
when /^\(\?(?:<>|'')/
|
512
507
|
validation_error(:group, 'named group', 'name is empty')
|
513
508
|
|
514
|
-
when /^\(
|
509
|
+
when /^\(\?<[^>]+>/
|
515
510
|
emit(:group, :named_ab, text)
|
516
511
|
|
517
|
-
when /^\(\?'
|
512
|
+
when /^\(\?'[^']+'/
|
518
513
|
emit(:group, :named_sq, text)
|
519
514
|
|
520
515
|
end
|
@@ -548,14 +543,16 @@
|
|
548
543
|
when /^\\([gk])(<>|'')/ # angle brackets
|
549
544
|
validation_error(:backref, 'ref/call', 'ref ID is empty')
|
550
545
|
|
551
|
-
|
546
|
+
# TODO: finer quirks of choosing recursive or non-recursive refs/calls.
|
547
|
+
# e.g.: `a-1` is a valid group id: 'aa'[/(?<a-1>a)\g<a-1>/] # => 'aa'
|
548
|
+
when /^\\([gk])<[^\p{digit}+\->][^>+\-]*>/ # angle-brackets
|
552
549
|
if $1 == 'k'
|
553
550
|
emit(:backref, :name_ref_ab, text)
|
554
551
|
else
|
555
552
|
emit(:backref, :name_call_ab, text)
|
556
553
|
end
|
557
554
|
|
558
|
-
when /^\\([gk])'[^\
|
555
|
+
when /^\\([gk])'[^\p{digit}+\-'][^'+\-]*'/ # single quotes
|
559
556
|
if $1 == 'k'
|
560
557
|
emit(:backref, :name_ref_sq, text)
|
561
558
|
else
|
@@ -590,10 +587,10 @@
|
|
590
587
|
emit(:backref, :number_rel_call_sq, text)
|
591
588
|
end
|
592
589
|
|
593
|
-
when /^\\k<[^\
|
590
|
+
when /^\\k<[^\p{digit}+\->][^>]*[+\-]\d+>/ # angle-brackets
|
594
591
|
emit(:backref, :name_recursion_ref_ab, text)
|
595
592
|
|
596
|
-
when /^\\k'[^\
|
593
|
+
when /^\\k'[^\p{digit}+\-'][^']*[+\-]\d+'/ # single-quotes
|
597
594
|
emit(:backref, :name_recursion_ref_sq, text)
|
598
595
|
|
599
596
|
when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
|
@@ -668,11 +665,7 @@
|
|
668
665
|
# Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
|
669
666
|
# except meta characters.
|
670
667
|
# ------------------------------------------------------------------------
|
671
|
-
(ascii_print -- space)+
|
672
|
-
ascii_nonprint+ |
|
673
|
-
utf8_2_byte+ |
|
674
|
-
utf8_3_byte+ |
|
675
|
-
utf8_4_byte+ {
|
668
|
+
(ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
|
676
669
|
append_literal(data, ts, te)
|
677
670
|
};
|
678
671
|
|
@@ -27,6 +27,7 @@ RSpec.describe('Escape scanning') do
|
|
27
27
|
include_examples 'scan', 'a\0124', 1 => [:escape, :octal, '\012', 1, 5]
|
28
28
|
include_examples 'scan', '\712+7', 0 => [:escape, :octal, '\712', 0, 4]
|
29
29
|
|
30
|
+
include_examples 'scan', 'a\xA', 1 => [:escape, :hex, '\xA', 1, 4]
|
30
31
|
include_examples 'scan', 'a\x24c', 1 => [:escape, :hex, '\x24', 1, 5]
|
31
32
|
include_examples 'scan', 'a\x0640c', 1 => [:escape, :hex, '\x06', 1, 5]
|
32
33
|
|
data/spec/scanner/groups_spec.rb
CHANGED
@@ -5,11 +5,20 @@ RSpec.describe('Group scanning') do
|
|
5
5
|
include_examples 'scan', '(?>abc)', 0 => [:group, :atomic, '(?>', 0, 3]
|
6
6
|
include_examples 'scan', '(abc)', 0 => [:group, :capture, '(', 0, 1]
|
7
7
|
|
8
|
+
# Named groups
|
9
|
+
# only names that start with a hyphen or digit (ascii or other) are invalid
|
8
10
|
include_examples 'scan', '(?<name>abc)', 0 => [:group, :named_ab, '(?<name>', 0, 8]
|
9
11
|
include_examples 'scan', "(?'name'abc)", 0 => [:group, :named_sq, "(?'name'", 0, 8]
|
10
|
-
|
11
12
|
include_examples 'scan', '(?<name_1>abc)', 0 => [:group, :named_ab, '(?<name_1>', 0,10]
|
12
13
|
include_examples 'scan', "(?'name_1'abc)", 0 => [:group, :named_sq, "(?'name_1'", 0,10]
|
14
|
+
include_examples 'scan', '(?<name-1>abc)', 0 => [:group, :named_ab, '(?<name-1>', 0,10]
|
15
|
+
include_examples 'scan', "(?'name-1'abc)", 0 => [:group, :named_sq, "(?'name-1'", 0,10]
|
16
|
+
include_examples 'scan', "(?<name'1>abc)", 0 => [:group, :named_ab, "(?<name'1>", 0,10]
|
17
|
+
include_examples 'scan', "(?'name>1'abc)", 0 => [:group, :named_sq, "(?'name>1'", 0,10]
|
18
|
+
include_examples 'scan', '(?<üüuuüü>abc)', 0 => [:group, :named_ab, '(?<üüuuüü>', 0,10]
|
19
|
+
include_examples 'scan', "(?'üüuuüü'abc)", 0 => [:group, :named_sq, "(?'üüuuüü'", 0,10]
|
20
|
+
include_examples 'scan', "(?<😋1234😋>abc)", 0 => [:group, :named_ab, "(?<😋1234😋>", 0,10]
|
21
|
+
include_examples 'scan', "(?'😋1234😋'abc)", 0 => [:group, :named_sq, "(?'😋1234😋'", 0,10]
|
13
22
|
|
14
23
|
include_examples 'scan', '(?:abc)', 0 => [:group, :passive, '(?:', 0, 3]
|
15
24
|
include_examples 'scan', '(?:)', 0 => [:group, :passive, '(?:', 0, 3]
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: regexp_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.
|
4
|
+
version: 2.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ammar Ali
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-12-20 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: A library for tokenizing, lexing, and parsing Ruby regular expressions.
|
14
14
|
email:
|
@@ -183,7 +183,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
183
183
|
- !ruby/object:Gem::Version
|
184
184
|
version: '0'
|
185
185
|
requirements: []
|
186
|
-
rubygems_version: 3.2.0
|
186
|
+
rubygems_version: 3.2.0
|
187
187
|
signing_key:
|
188
188
|
specification_version: 4
|
189
189
|
summary: Scanner, lexer, parser for ruby's regular expressions
|