regexp_parser 2.0.0 → 2.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -3,6 +3,11 @@
3
3
  include re_char_type "char_type.rl";
4
4
  include re_property "property.rl";
5
5
 
6
+ utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
7
+ utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
8
+ utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
9
+ utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
10
+
6
11
  dot = '.';
7
12
  backslash = '\\';
8
13
  alternation = '|';
@@ -90,18 +95,19 @@
90
95
  group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
91
96
 
92
97
  group_ref = [gk];
93
- group_name_char = (alnum | '_');
94
- group_name_id = (group_name_char . (group_name_char+)?)?;
95
- group_number = '-'? . [1-9] . ([0-9]+)?;
98
+ group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
99
+ group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
100
+ group_number = '-'? . [1-9] . [0-9]*;
96
101
  group_level = [+\-] . [0-9]+;
97
102
 
98
- group_name = ('<' . group_name_id . '>') | ("'" . group_name_id . "'");
103
+ group_name = ('<' . group_name_id_ab? . '>') |
104
+ ("'" . group_name_id_sq? . "'");
99
105
  group_lookup = group_name | group_number;
100
106
 
101
107
  group_named = ('?' . group_name );
102
108
 
103
- group_name_ref = group_ref . (('<' . group_name_id . group_level? '>') |
104
- ("'" . group_name_id . group_level? "'"));
109
+ group_name_ref = group_ref . (('<' . group_name_id_ab? . group_level? '>') |
110
+ ("'" . group_name_id_sq? . group_level? "'"));
105
111
 
106
112
  group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
107
113
  ("'" . group_number . group_level? "'"));
@@ -123,10 +129,6 @@
123
129
  ascii_print = ((0x20..0x7e) - meta_char - '#');
124
130
  ascii_nonprint = (0x01..0x1f | 0x7f);
125
131
 
126
- utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
127
- utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
128
- utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
129
-
130
132
  non_literal_escape = char_type_char | anchor_char | escaped_ascii |
131
133
  keep_mark | [xucCM];
132
134
 
@@ -238,11 +240,7 @@
238
240
  emit(:literal, :literal, copy(data, ts, te))
239
241
  };
240
242
 
241
- any |
242
- ascii_nonprint |
243
- utf8_2_byte |
244
- utf8_3_byte |
245
- utf8_4_byte {
243
+ any | ascii_nonprint | utf8_multibyte {
246
244
  text = copy(data, ts, te)
247
245
  emit(:literal, :literal, text)
248
246
  };
@@ -325,7 +323,7 @@
325
323
  fret;
326
324
  };
327
325
 
328
- hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
326
+ hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
329
327
  emit(:escape, :hex, copy(data, ts-1, te))
330
328
  fret;
331
329
  };
@@ -356,10 +354,7 @@
356
354
  fcall unicode_property;
357
355
  };
358
356
 
359
- (any -- non_literal_escape) |
360
- utf8_2_byte |
361
- utf8_3_byte |
362
- utf8_4_byte > (escaped_alpha, 1) {
357
+ (any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
363
358
  emit(:escape, :literal, copy(data, ts-1, te))
364
359
  fret;
365
360
  };
@@ -511,10 +506,10 @@
511
506
  when /^\(\?(?:<>|'')/
512
507
  validation_error(:group, 'named group', 'name is empty')
513
508
 
514
- when /^\(\?<\w*>/
509
+ when /^\(\?<[^>]+>/
515
510
  emit(:group, :named_ab, text)
516
511
 
517
- when /^\(\?'\w*'/
512
+ when /^\(\?'[^']+'/
518
513
  emit(:group, :named_sq, text)
519
514
 
520
515
  end
@@ -548,14 +543,16 @@
548
543
  when /^\\([gk])(<>|'')/ # empty angle brackets or empty single quotes
549
544
  validation_error(:backref, 'ref/call', 'ref ID is empty')
550
545
 
551
- when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
546
+ # TODO: finer quirks of choosing recursive or non-recursive refs/calls.
547
+ # e.g.: `a-1` is a valid group id: 'aa'[/(?<a-1>a)\g<a-1>/] # => 'aa'
548
+ when /^\\([gk])<[^\p{digit}+\->][^>+\-]*>/ # angle-brackets
552
549
  if $1 == 'k'
553
550
  emit(:backref, :name_ref_ab, text)
554
551
  else
555
552
  emit(:backref, :name_call_ab, text)
556
553
  end
557
554
 
558
- when /^\\([gk])'[^\d+-]\w*'/ #single quotes
555
+ when /^\\([gk])'[^\p{digit}+\-'][^'+\-]*'/ # single quotes
559
556
  if $1 == 'k'
560
557
  emit(:backref, :name_ref_sq, text)
561
558
  else
@@ -590,10 +587,10 @@
590
587
  emit(:backref, :number_rel_call_sq, text)
591
588
  end
592
589
 
593
- when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
590
+ when /^\\k<[^\p{digit}+\->][^>]*[+\-]\d+>/ # angle-brackets
594
591
  emit(:backref, :name_recursion_ref_ab, text)
595
592
 
596
- when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
593
+ when /^\\k'[^\p{digit}+\-'][^']*[+\-]\d+'/ # single-quotes
597
594
  emit(:backref, :name_recursion_ref_sq, text)
598
595
 
599
596
  when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
@@ -668,11 +665,7 @@
668
665
  # Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
669
666
  # except meta characters.
670
667
  # ------------------------------------------------------------------------
671
- (ascii_print -- space)+ |
672
- ascii_nonprint+ |
673
- utf8_2_byte+ |
674
- utf8_3_byte+ |
675
- utf8_4_byte+ {
668
+ (ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
676
669
  append_literal(data, ts, te)
677
670
  };
678
671
 
@@ -1,5 +1,5 @@
1
1
  class Regexp
2
2
  class Parser
3
- VERSION = '2.0.0'
3
+ VERSION = '2.0.1'
4
4
  end
5
5
  end
@@ -27,6 +27,7 @@ RSpec.describe('Escape scanning') do
27
27
  include_examples 'scan', 'a\0124', 1 => [:escape, :octal, '\012', 1, 5]
28
28
  include_examples 'scan', '\712+7', 0 => [:escape, :octal, '\712', 0, 4]
29
29
 
30
+ include_examples 'scan', 'a\xA', 1 => [:escape, :hex, '\xA', 1, 4]
30
31
  include_examples 'scan', 'a\x24c', 1 => [:escape, :hex, '\x24', 1, 5]
31
32
  include_examples 'scan', 'a\x0640c', 1 => [:escape, :hex, '\x06', 1, 5]
32
33
 
@@ -5,11 +5,20 @@ RSpec.describe('Group scanning') do
5
5
  include_examples 'scan', '(?>abc)', 0 => [:group, :atomic, '(?>', 0, 3]
6
6
  include_examples 'scan', '(abc)', 0 => [:group, :capture, '(', 0, 1]
7
7
 
8
+ # Named groups
9
+ # only names that start with a hyphen or digit (ascii or other) are invalid
8
10
  include_examples 'scan', '(?<name>abc)', 0 => [:group, :named_ab, '(?<name>', 0, 8]
9
11
  include_examples 'scan', "(?'name'abc)", 0 => [:group, :named_sq, "(?'name'", 0, 8]
10
-
11
12
  include_examples 'scan', '(?<name_1>abc)', 0 => [:group, :named_ab, '(?<name_1>', 0,10]
12
13
  include_examples 'scan', "(?'name_1'abc)", 0 => [:group, :named_sq, "(?'name_1'", 0,10]
14
+ include_examples 'scan', '(?<name-1>abc)', 0 => [:group, :named_ab, '(?<name-1>', 0,10]
15
+ include_examples 'scan', "(?'name-1'abc)", 0 => [:group, :named_sq, "(?'name-1'", 0,10]
16
+ include_examples 'scan', "(?<name'1>abc)", 0 => [:group, :named_ab, "(?<name'1>", 0,10]
17
+ include_examples 'scan', "(?'name>1'abc)", 0 => [:group, :named_sq, "(?'name>1'", 0,10]
18
+ include_examples 'scan', '(?<üüuuüü>abc)', 0 => [:group, :named_ab, '(?<üüuuüü>', 0,10]
19
+ include_examples 'scan', "(?'üüuuüü'abc)", 0 => [:group, :named_sq, "(?'üüuuüü'", 0,10]
20
+ include_examples 'scan', "(?<😋1234😋>abc)", 0 => [:group, :named_ab, "(?<😋1234😋>", 0,10]
21
+ include_examples 'scan', "(?'😋1234😋'abc)", 0 => [:group, :named_sq, "(?'😋1234😋'", 0,10]
13
22
 
14
23
  include_examples 'scan', '(?:abc)', 0 => [:group, :passive, '(?:', 0, 3]
15
24
  include_examples 'scan', '(?:)', 0 => [:group, :passive, '(?:', 0, 3]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: regexp_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.0
4
+ version: 2.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ammar Ali
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-11-25 00:00:00.000000000 Z
11
+ date: 2020-12-20 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: A library for tokenizing, lexing, and parsing Ruby regular expressions.
14
14
  email:
@@ -183,7 +183,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
183
183
  - !ruby/object:Gem::Version
184
184
  version: '0'
185
185
  requirements: []
186
- rubygems_version: 3.2.0.rc.1
186
+ rubygems_version: 3.2.0
187
187
  signing_key:
188
188
  specification_version: 4
189
189
  summary: Scanner, lexer, parser for ruby's regular expressions