regexp_parser 2.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,11 @@
3
3
  include re_char_type "char_type.rl";
4
4
  include re_property "property.rl";
5
5
 
6
+ utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
7
+ utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
8
+ utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
9
+ utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
10
+
6
11
  dot = '.';
7
12
  backslash = '\\';
8
13
  alternation = '|';
@@ -90,18 +95,19 @@
90
95
  group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
91
96
 
92
97
  group_ref = [gk];
93
- group_name_char = (alnum | '_');
94
- group_name_id = (group_name_char . (group_name_char+)?)?;
95
- group_number = '-'? . [1-9] . ([0-9]+)?;
98
+ group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
99
+ group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
100
+ group_number = '-'? . [1-9] . [0-9]*;
96
101
  group_level = [+\-] . [0-9]+;
97
102
 
98
- group_name = ('<' . group_name_id . '>') | ("'" . group_name_id . "'");
103
+ group_name = ('<' . group_name_id_ab? . '>') |
104
+ ("'" . group_name_id_sq? . "'");
99
105
  group_lookup = group_name | group_number;
100
106
 
101
107
  group_named = ('?' . group_name );
102
108
 
103
- group_name_ref = group_ref . (('<' . group_name_id . group_level? '>') |
104
- ("'" . group_name_id . group_level? "'"));
109
+ group_name_ref = group_ref . (('<' . group_name_id_ab? . group_level? '>') |
110
+ ("'" . group_name_id_sq? . group_level? "'"));
105
111
 
106
112
  group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
107
113
  ("'" . group_number . group_level? "'"));
@@ -123,10 +129,6 @@
123
129
  ascii_print = ((0x20..0x7e) - meta_char - '#');
124
130
  ascii_nonprint = (0x01..0x1f | 0x7f);
125
131
 
126
- utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
127
- utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
128
- utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
129
-
130
132
  non_literal_escape = char_type_char | anchor_char | escaped_ascii |
131
133
  keep_mark | [xucCM];
132
134
 
@@ -238,11 +240,7 @@
238
240
  emit(:literal, :literal, copy(data, ts, te))
239
241
  };
240
242
 
241
- any |
242
- ascii_nonprint |
243
- utf8_2_byte |
244
- utf8_3_byte |
245
- utf8_4_byte {
243
+ any | ascii_nonprint | utf8_multibyte {
246
244
  text = copy(data, ts, te)
247
245
  emit(:literal, :literal, text)
248
246
  };
@@ -325,7 +323,7 @@
325
323
  fret;
326
324
  };
327
325
 
328
- hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
326
+ hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
329
327
  emit(:escape, :hex, copy(data, ts-1, te))
330
328
  fret;
331
329
  };
@@ -356,10 +354,7 @@
356
354
  fcall unicode_property;
357
355
  };
358
356
 
359
- (any -- non_literal_escape) |
360
- utf8_2_byte |
361
- utf8_3_byte |
362
- utf8_4_byte > (escaped_alpha, 1) {
357
+ (any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
363
358
  emit(:escape, :literal, copy(data, ts-1, te))
364
359
  fret;
365
360
  };
@@ -511,10 +506,10 @@
511
506
  when /^\(\?(?:<>|'')/
512
507
  validation_error(:group, 'named group', 'name is empty')
513
508
 
514
- when /^\(\?<\w*>/
509
+ when /^\(\?<[^>]+>/
515
510
  emit(:group, :named_ab, text)
516
511
 
517
- when /^\(\?'\w*'/
512
+ when /^\(\?'[^']+'/
518
513
  emit(:group, :named_sq, text)
519
514
 
520
515
  end
@@ -548,14 +543,16 @@
548
543
  when /^\\([gk])(<>|'')/ # angle brackets
549
544
  validation_error(:backref, 'ref/call', 'ref ID is empty')
550
545
 
551
- when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
546
+ # TODO: finer quirks of choosing recursive or non-recursive refs/calls.
547
+ # e.g.: `a-1` is a valid group id: 'aa'[/(?<a-1>a)\g<a-1>/] # => 'aa'
548
+ when /^\\([gk])<[^\p{digit}+\->][^>+\-]*>/ # angle-brackets
552
549
  if $1 == 'k'
553
550
  emit(:backref, :name_ref_ab, text)
554
551
  else
555
552
  emit(:backref, :name_call_ab, text)
556
553
  end
557
554
 
558
- when /^\\([gk])'[^\d+-]\w*'/ #single quotes
555
+ when /^\\([gk])'[^\p{digit}+\-'][^'+\-]*'/ # single quotes
559
556
  if $1 == 'k'
560
557
  emit(:backref, :name_ref_sq, text)
561
558
  else
@@ -590,10 +587,10 @@
590
587
  emit(:backref, :number_rel_call_sq, text)
591
588
  end
592
589
 
593
- when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
590
+ when /^\\k<[^\p{digit}+\->][^>]*[+\-]\d+>/ # angle-brackets
594
591
  emit(:backref, :name_recursion_ref_ab, text)
595
592
 
596
- when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
593
+ when /^\\k'[^\p{digit}+\-'][^']*[+\-]\d+'/ # single-quotes
597
594
  emit(:backref, :name_recursion_ref_sq, text)
598
595
 
599
596
  when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
@@ -668,11 +665,7 @@
668
665
  # Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
669
666
  # except meta characters.
670
667
  # ------------------------------------------------------------------------
671
- (ascii_print -- space)+ |
672
- ascii_nonprint+ |
673
- utf8_2_byte+ |
674
- utf8_3_byte+ |
675
- utf8_4_byte+ {
668
+ (ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
676
669
  append_literal(data, ts, te)
677
670
  };
678
671
 
@@ -1,5 +1,5 @@
1
1
  class Regexp
2
2
  class Parser
3
- VERSION = '2.0.0'
3
+ VERSION = '2.0.1'
4
4
  end
5
5
  end
@@ -27,6 +27,7 @@ RSpec.describe('Escape scanning') do
27
27
  include_examples 'scan', 'a\0124', 1 => [:escape, :octal, '\012', 1, 5]
28
28
  include_examples 'scan', '\712+7', 0 => [:escape, :octal, '\712', 0, 4]
29
29
 
30
+ include_examples 'scan', 'a\xA', 1 => [:escape, :hex, '\xA', 1, 4]
30
31
  include_examples 'scan', 'a\x24c', 1 => [:escape, :hex, '\x24', 1, 5]
31
32
  include_examples 'scan', 'a\x0640c', 1 => [:escape, :hex, '\x06', 1, 5]
32
33
 
@@ -5,11 +5,20 @@ RSpec.describe('Group scanning') do
5
5
  include_examples 'scan', '(?>abc)', 0 => [:group, :atomic, '(?>', 0, 3]
6
6
  include_examples 'scan', '(abc)', 0 => [:group, :capture, '(', 0, 1]
7
7
 
8
+ # Named groups
9
+ # only names that start with a hyphen or digit (ascii or other) are invalid
8
10
  include_examples 'scan', '(?<name>abc)', 0 => [:group, :named_ab, '(?<name>', 0, 8]
9
11
  include_examples 'scan', "(?'name'abc)", 0 => [:group, :named_sq, "(?'name'", 0, 8]
10
-
11
12
  include_examples 'scan', '(?<name_1>abc)', 0 => [:group, :named_ab, '(?<name_1>', 0,10]
12
13
  include_examples 'scan', "(?'name_1'abc)", 0 => [:group, :named_sq, "(?'name_1'", 0,10]
14
+ include_examples 'scan', '(?<name-1>abc)', 0 => [:group, :named_ab, '(?<name-1>', 0,10]
15
+ include_examples 'scan', "(?'name-1'abc)", 0 => [:group, :named_sq, "(?'name-1'", 0,10]
16
+ include_examples 'scan', "(?<name'1>abc)", 0 => [:group, :named_ab, "(?<name'1>", 0,10]
17
+ include_examples 'scan', "(?'name>1'abc)", 0 => [:group, :named_sq, "(?'name>1'", 0,10]
18
+ include_examples 'scan', '(?<üüuuüü>abc)', 0 => [:group, :named_ab, '(?<üüuuüü>', 0,10]
19
+ include_examples 'scan', "(?'üüuuüü'abc)", 0 => [:group, :named_sq, "(?'üüuuüü'", 0,10]
20
+ include_examples 'scan', "(?<😋1234😋>abc)", 0 => [:group, :named_ab, "(?<😋1234😋>", 0,10]
21
+ include_examples 'scan', "(?'😋1234😋'abc)", 0 => [:group, :named_sq, "(?'😋1234😋'", 0,10]
13
22
 
14
23
  include_examples 'scan', '(?:abc)', 0 => [:group, :passive, '(?:', 0, 3]
15
24
  include_examples 'scan', '(?:)', 0 => [:group, :passive, '(?:', 0, 3]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: regexp_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.0
4
+ version: 2.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ammar Ali
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-11-25 00:00:00.000000000 Z
11
+ date: 2020-12-20 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: A library for tokenizing, lexing, and parsing Ruby regular expressions.
14
14
  email:
@@ -183,7 +183,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
183
183
  - !ruby/object:Gem::Version
184
184
  version: '0'
185
185
  requirements: []
186
- rubygems_version: 3.2.0.rc.1
186
+ rubygems_version: 3.2.0
187
187
  signing_key:
188
188
  specification_version: 4
189
189
  summary: Scanner, lexer, parser for ruby's regular expressions