regexp_parser 1.8.2 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +62 -0
  3. data/Gemfile +1 -0
  4. data/README.md +1 -4
  5. data/Rakefile +2 -2
  6. data/lib/regexp_parser/expression.rb +4 -17
  7. data/lib/regexp_parser/expression/classes/free_space.rb +1 -1
  8. data/lib/regexp_parser/expression/classes/group.rb +22 -2
  9. data/lib/regexp_parser/expression/classes/root.rb +4 -16
  10. data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
  11. data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
  12. data/lib/regexp_parser/expression/quantifier.rb +9 -0
  13. data/lib/regexp_parser/expression/sequence.rb +0 -10
  14. data/lib/regexp_parser/lexer.rb +2 -2
  15. data/lib/regexp_parser/parser.rb +27 -2
  16. data/lib/regexp_parser/scanner.rb +1194 -1272
  17. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  18. data/lib/regexp_parser/scanner/property.rl +2 -2
  19. data/lib/regexp_parser/scanner/scanner.rl +178 -186
  20. data/lib/regexp_parser/syntax.rb +4 -4
  21. data/lib/regexp_parser/syntax/any.rb +2 -2
  22. data/lib/regexp_parser/syntax/base.rb +1 -1
  23. data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
  24. data/lib/regexp_parser/version.rb +1 -1
  25. data/spec/expression/base_spec.rb +10 -0
  26. data/spec/expression/subexpression_spec.rb +1 -1
  27. data/spec/expression/to_s_spec.rb +39 -31
  28. data/spec/lexer/literals_spec.rb +24 -49
  29. data/spec/parser/errors_spec.rb +1 -1
  30. data/spec/parser/escapes_spec.rb +1 -1
  31. data/spec/parser/quantifiers_spec.rb +16 -0
  32. data/spec/parser/set/ranges_spec.rb +3 -3
  33. data/spec/scanner/escapes_spec.rb +7 -0
  34. data/spec/scanner/groups_spec.rb +10 -1
  35. data/spec/scanner/literals_spec.rb +28 -38
  36. data/spec/scanner/quantifiers_spec.rb +18 -13
  37. data/spec/scanner/sets_spec.rb +23 -5
  38. data/spec/spec_helper.rb +1 -0
  39. metadata +3 -7
  40. data/spec/expression/root_spec.rb +0 -9
  41. data/spec/expression/sequence_spec.rb +0 -9
@@ -1,9 +1,9 @@
1
+ module Regexp::Syntax
2
+ class SyntaxError < StandardError; end
3
+ end
4
+
1
5
  require File.expand_path('../syntax/tokens', __FILE__)
2
6
  require File.expand_path('../syntax/base', __FILE__)
3
7
  require File.expand_path('../syntax/any', __FILE__)
4
8
  require File.expand_path('../syntax/version_lookup', __FILE__)
5
9
  require File.expand_path('../syntax/versions', __FILE__)
6
-
7
- module Regexp::Syntax
8
- class SyntaxError < StandardError; end
9
- end
@@ -8,8 +8,8 @@ module Regexp::Syntax
8
8
  @implements = { :* => [:*] }
9
9
  end
10
10
 
11
- def implements?(type, token) true end
12
- def implements!(type, token) true end
11
+ def implements?(_type, _token) true end
12
+ def implements!(_type, _token) true end
13
13
  end
14
14
 
15
15
  end
@@ -1,7 +1,7 @@
1
1
  require 'set'
2
2
 
3
3
  module Regexp::Syntax
4
- class NotImplementedError < SyntaxError
4
+ class NotImplementedError < Regexp::Syntax::SyntaxError
5
5
  def initialize(syntax, type, token)
6
6
  super "#{syntax.class.name} does not implement: [#{type}:#{token}]"
7
7
  end
@@ -3,13 +3,13 @@ module Regexp::Syntax
3
3
  VERSION_REGEXP = /#{VERSION_FORMAT}/
4
4
  VERSION_CONST_REGEXP = /\AV\d+_\d+(?:_\d+)?\z/
5
5
 
6
- class InvalidVersionNameError < SyntaxError
6
+ class InvalidVersionNameError < Regexp::Syntax::SyntaxError
7
7
  def initialize(name)
8
8
  super "Invalid version name '#{name}'. Expected format is '#{VERSION_FORMAT}'"
9
9
  end
10
10
  end
11
11
 
12
- class UnknownSyntaxNameError < SyntaxError
12
+ class UnknownSyntaxNameError < Regexp::Syntax::SyntaxError
13
13
  def initialize(name)
14
14
  super "Unknown syntax name '#{name}'."
15
15
  end
@@ -1,5 +1,5 @@
1
1
  class Regexp
2
2
  class Parser
3
- VERSION = '1.8.2'
3
+ VERSION = '2.0.3'
4
4
  end
5
5
  end
@@ -91,4 +91,14 @@ RSpec.describe(Regexp::Expression::Base) do
91
91
  expect(RP.parse(/a*/)[0].repetitions).to eq 0..(Float::INFINITY)
92
92
  expect(RP.parse(/a+/)[0].repetitions).to eq 1..(Float::INFINITY)
93
93
  end
94
+
95
+ specify('#base_length') do
96
+ expect(RP.parse(/(aa)/)[0].base_length).to eq 4
97
+ expect(RP.parse(/(aa){42}/)[0].base_length).to eq 4
98
+ end
99
+
100
+ specify('#full_length') do
101
+ expect(RP.parse(/(aa)/)[0].full_length).to eq 4
102
+ expect(RP.parse(/(aa){42}/)[0].full_length).to eq 8
103
+ end
94
104
  end
@@ -32,7 +32,7 @@ RSpec.describe(Regexp::Expression::Subexpression) do
32
32
  }
33
33
 
34
34
  root.each_expression do |exp|
35
- next unless expected_nesting_level = tests.delete(exp.to_s)
35
+ next unless (expected_nesting_level = tests.delete(exp.to_s))
36
36
  expect(expected_nesting_level).to eq exp.nesting_level
37
37
  end
38
38
 
@@ -1,58 +1,50 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  RSpec.describe('Expression#to_s') do
4
- specify('literal alternation') do
5
- pattern = 'abcd|ghij|klmn|pqur'
4
+ def parse_frozen(pattern, ruby_version = nil)
5
+ IceNine.deep_freeze(RP.parse(pattern, *ruby_version))
6
+ end
7
+
8
+ def expect_round_trip(pattern, ruby_version = nil)
9
+ parsed = parse_frozen(pattern, ruby_version)
6
10
 
7
- expect(RP.parse(pattern).to_s).to eq pattern
11
+ expect(parsed.to_s).to eql(pattern)
8
12
  end
9
13
 
10
- specify('quantified alternations') do
11
- pattern = '(?:a?[b]+(c){2}|d+[e]*(f)?)|(?:g+[h]?(i){2,3}|j*[k]{3,5}(l)?)'
14
+ specify('literal alternation') do
15
+ expect_round_trip('abcd|ghij|klmn|pqur')
16
+ end
12
17
 
13
- expect(RP.parse(pattern).to_s).to eq pattern
18
+ specify('quantified alternations') do
19
+ expect_round_trip('(?:a?[b]+(c){2}|d+[e]*(f)?)|(?:g+[h]?(i){2,3}|j*[k]{3,5}(l)?)')
14
20
  end
15
21
 
16
22
  specify('quantified sets') do
17
- pattern = '[abc]+|[^def]{3,6}'
18
-
19
- expect(RP.parse(pattern).to_s).to eq pattern
23
+ expect_round_trip('[abc]+|[^def]{3,6}')
20
24
  end
21
25
 
22
26
  specify('property sets') do
23
- pattern = '[\\a\\b\\p{Lu}\\P{Z}\\c\\d]+'
24
-
25
- expect(RP.parse(pattern, 'ruby/1.9').to_s).to eq pattern
27
+ expect_round_trip('[\\a\\b\\p{Lu}\\P{Z}\\c\\d]+', 'ruby/1.9')
26
28
  end
27
29
 
28
30
  specify('groups') do
29
- pattern = "(a(?>b(?:c(?<n>d(?'N'e)??f)+g)*+h)*i)++"
30
-
31
- expect(RP.parse(pattern, 'ruby/1.9').to_s).to eq pattern
31
+ expect_round_trip("(a(?>b(?:c(?<n>d(?'N'e)??f)+g)*+h)*i)++", 'ruby/1.9')
32
32
  end
33
33
 
34
34
  specify('assertions') do
35
- pattern = '(a+(?=b+(?!c+(?<=d+(?<!e+)?f+)?g+)?h+)?i+)?'
36
-
37
- expect(RP.parse(pattern, 'ruby/1.9').to_s).to eq pattern
35
+ expect_round_trip('(a+(?=b+(?!c+(?<=d+(?<!e+)?f+)?g+)?h+)?i+)?', 'ruby/1.9')
38
36
  end
39
37
 
40
38
  specify('comments') do
41
- pattern = '(?#start)a(?#middle)b(?#end)'
42
-
43
- expect(RP.parse(pattern).to_s).to eq pattern
39
+ expect_round_trip('(?#start)a(?#middle)b(?#end)')
44
40
  end
45
41
 
46
42
  specify('options') do
47
- pattern = '(?mix:start)a(?-mix:middle)b(?i-mx:end)'
48
-
49
- expect(RP.parse(pattern).to_s).to eq pattern
43
+ expect_round_trip('(?mix:start)a(?-mix:middle)b(?i-mx:end)')
50
44
  end
51
45
 
52
46
  specify('url') do
53
- pattern = ('(^$)|(^(http|https):\\/\\/[a-z0-9]+([\\-\\.]{1}[a-z0-9]+)*' + '\\.[a-z]{2,5}(([0-9]{1,5})?\\/.*)?$)')
54
-
55
- expect(RP.parse(pattern).to_s).to eq pattern
47
+ expect_round_trip('(^$)|(^(http|https):\\/\\/[a-z0-9]+([\\-\\.]{1}[a-z0-9]+)*' + '\\.[a-z]{2,5}(([0-9]{1,5})?\\/.*)?$)')
56
48
  end
57
49
 
58
50
  specify('multiline source') do
@@ -64,7 +56,7 @@ RSpec.describe('Expression#to_s') do
64
56
  \z
65
57
  /x
66
58
 
67
- expect(RP.parse(multiline).to_s).to eq multiline.source
59
+ expect(parse_frozen(multiline).to_s).to eql(multiline.source)
68
60
  end
69
61
 
70
62
  specify('multiline #to_s') do
@@ -76,7 +68,7 @@ RSpec.describe('Expression#to_s') do
76
68
  \z
77
69
  /x
78
70
 
79
- expect(RP.parse(multiline.to_s).to_s).to eq multiline.to_s
71
+ expect_round_trip(multiline.to_s)
80
72
  end
81
73
 
82
74
  # Free spacing expressions that use spaces between quantifiers and their
@@ -93,8 +85,24 @@ RSpec.describe('Expression#to_s') do
93
85
  /x
94
86
 
95
87
  str = 'bbbcged'
96
- root = RP.parse(multiline)
88
+ root = parse_frozen(multiline)
89
+
90
+ expect(Regexp.new(root.to_s, Regexp::EXTENDED).match(str)[0]).to eql(multiline.match(str)[0])
91
+ end
92
+
93
+ # special case: implicit groups used for chained quantifiers produce no parens
94
+ specify 'chained quantifiers #to_s' do
95
+ pattern = /a+{1}{2}/
96
+ root = parse_frozen(pattern)
97
+ expect(root.to_s).to eql('a+{1}{2}')
98
+ end
97
99
 
98
- expect(Regexp.new(root.to_s, Regexp::EXTENDED).match(str)[0]).to eq multiline.match(str)[0]
100
+ # regression test for https://github.com/ammar/regexp_parser/issues/74
101
+ specify('non-ascii comment') do
102
+ pattern = '(?x) 😋 # 😋'
103
+ root = RP.parse(pattern)
104
+ expect(root.last).to be_a(Regexp::Expression::Comment)
105
+ expect(root.last.to_s).to eql('# 😋')
106
+ expect(root.to_s).to eql(pattern)
99
107
  end
100
108
  end
@@ -10,67 +10,42 @@ RSpec.describe('Literal lexing') do
10
10
  1 => [:literal, :literal, 'b', 1, 2, 0, 0, 0],
11
11
  2 => [:quantifier, :one_or_more, '+', 2, 3, 0, 0, 0]
12
12
 
13
- # 2 byte wide characters, Arabic
14
- include_examples 'lex', 'ا',
15
- 0 => [:literal, :literal, 'ا', 0, 2, 0, 0, 0]
16
-
17
- include_examples 'lex', 'aاbبcت',
18
- 0 => [:literal, :literal, 'aاbبcت', 0, 9, 0, 0, 0]
19
-
20
- include_examples 'lex', 'aاbبت?',
21
- 0 => [:literal, :literal, 'aاbب', 0, 6, 0, 0, 0],
22
- 1 => [:literal, :literal, 'ت', 6, 8, 0, 0, 0],
23
- 2 => [:quantifier, :zero_or_one, '?', 8, 9, 0, 0, 0]
24
-
25
- include_examples 'lex', 'aا?bبcت+',
26
- 0 => [:literal, :literal, 'a', 0, 1, 0, 0, 0],
27
- 1 => [:literal, :literal, 'ا', 1, 3, 0, 0, 0],
28
- 2 => [:quantifier, :zero_or_one, '?', 3, 4, 0, 0, 0],
29
- 3 => [:literal, :literal, 'bبc', 4, 8, 0, 0, 0],
30
- 4 => [:literal, :literal, 'ت', 8, 10, 0, 0, 0],
31
- 5 => [:quantifier, :one_or_more, '+', 10, 11, 0, 0, 0]
32
-
33
- include_examples 'lex', 'a(اbب+)cت?',
34
- 0 => [:literal, :literal, 'a', 0, 1, 0, 0, 0],
35
- 1 => [:group, :capture, '(', 1, 2, 0, 0, 0],
36
- 2 => [:literal, :literal, 'اb', 2, 5, 1, 0, 0],
37
- 3 => [:literal, :literal, 'ب', 5, 7, 1, 0, 0],
38
- 4 => [:quantifier, :one_or_more, '+', 7, 8, 1, 0, 0],
39
- 5 => [:group, :close, ')', 8, 9, 0, 0, 0],
40
- 6 => [:literal, :literal, 'c', 9, 10, 0, 0, 0],
41
- 7 => [:literal, :literal, 'ت', 10, 12, 0, 0, 0],
42
- 8 => [:quantifier, :zero_or_one, '?', 12, 13, 0, 0, 0]
13
+ # 2 byte wide characters
14
+ include_examples 'lex', 'äöü+',
15
+ 0 => [:literal, :literal, 'äö', 0, 2, 0, 0, 0],
16
+ 1 => [:literal, :literal, 'ü', 2, 3, 0, 0, 0],
17
+ 2 => [:quantifier, :one_or_more, '+', 3, 4, 0, 0, 0]
43
18
 
44
19
  # 3 byte wide characters, Japanese
45
20
  include_examples 'lex', 'ab?れます+cd',
46
21
  0 => [:literal, :literal, 'a', 0, 1, 0, 0, 0],
47
22
  1 => [:literal, :literal, 'b', 1, 2, 0, 0, 0],
48
23
  2 => [:quantifier, :zero_or_one, '?', 2, 3, 0, 0, 0],
49
- 3 => [:literal, :literal, 'れま', 3, 9, 0, 0, 0],
50
- 4 => [:literal, :literal, 'す', 9, 12, 0, 0, 0],
51
- 5 => [:quantifier, :one_or_more, '+', 12, 13, 0, 0, 0],
52
- 6 => [:literal, :literal, 'cd', 13, 15, 0, 0, 0]
24
+ 3 => [:literal, :literal, 'れま', 3, 5, 0, 0, 0],
25
+ 4 => [:literal, :literal, 'す', 5, 6, 0, 0, 0],
26
+ 5 => [:quantifier, :one_or_more, '+', 6, 7, 0, 0, 0],
27
+ 6 => [:literal, :literal, 'cd', 7, 9, 0, 0, 0]
53
28
 
54
29
  # 4 byte wide characters, Osmanya
55
30
  include_examples 'lex', '𐒀𐒁?𐒂ab+𐒃',
56
- 0 => [:literal, :literal, '𐒀', 0, 4, 0, 0, 0],
57
- 1 => [:literal, :literal, '𐒁', 4, 8, 0, 0, 0],
58
- 2 => [:quantifier, :zero_or_one, '?', 8, 9, 0, 0, 0],
59
- 3 => [:literal, :literal, '𐒂a', 9, 14, 0, 0, 0],
60
- 4 => [:literal, :literal, 'b', 14, 15, 0, 0, 0],
61
- 5 => [:quantifier, :one_or_more, '+', 15, 16, 0, 0, 0],
62
- 6 => [:literal, :literal, '𐒃', 16, 20, 0, 0, 0]
31
+ 0 => [:literal, :literal, '𐒀', 0, 1, 0, 0, 0],
32
+ 1 => [:literal, :literal, '𐒁', 1, 2, 0, 0, 0],
33
+ 2 => [:quantifier, :zero_or_one, '?', 2, 3, 0, 0, 0],
34
+ 3 => [:literal, :literal, '𐒂a', 3, 5, 0, 0, 0],
35
+ 4 => [:literal, :literal, 'b', 5, 6, 0, 0, 0],
36
+ 5 => [:quantifier, :one_or_more, '+', 6, 7, 0, 0, 0],
37
+ 6 => [:literal, :literal, '𐒃', 7, 8, 0, 0, 0]
63
38
 
64
39
  include_examples 'lex', 'mu𝄞?si*𝄫c+',
65
40
  0 => [:literal, :literal, 'mu', 0, 2, 0, 0, 0],
66
- 1 => [:literal, :literal, '𝄞', 2, 6, 0, 0, 0],
67
- 2 => [:quantifier, :zero_or_one, '?', 6, 7, 0, 0, 0],
68
- 3 => [:literal, :literal, 's', 7, 8, 0, 0, 0],
69
- 4 => [:literal, :literal, 'i', 8, 9, 0, 0, 0],
70
- 5 => [:quantifier, :zero_or_more, '*', 9, 10, 0, 0, 0],
71
- 6 => [:literal, :literal, '𝄫', 10, 14, 0, 0, 0],
72
- 7 => [:literal, :literal, 'c', 14, 15, 0, 0, 0],
73
- 8 => [:quantifier, :one_or_more, '+', 15, 16, 0, 0, 0]
41
+ 1 => [:literal, :literal, '𝄞', 2, 3, 0, 0, 0],
42
+ 2 => [:quantifier, :zero_or_one, '?', 3, 4, 0, 0, 0],
43
+ 3 => [:literal, :literal, 's', 4, 5, 0, 0, 0],
44
+ 4 => [:literal, :literal, 'i', 5, 6, 0, 0, 0],
45
+ 5 => [:quantifier, :zero_or_more, '*', 6, 7, 0, 0, 0],
46
+ 6 => [:literal, :literal, '𝄫', 7, 8, 0, 0, 0],
47
+ 7 => [:literal, :literal, 'c', 8, 9, 0, 0, 0],
48
+ 8 => [:quantifier, :one_or_more, '+', 9, 10, 0, 0, 0]
74
49
 
75
50
  specify('lex single 2 byte char') do
76
51
  tokens = RL.lex("\u0627+")
@@ -9,7 +9,7 @@ RSpec.describe('Parsing errors') do
9
9
  .to raise_error(Regexp::Parser::UnknownTokenTypeError)
10
10
  end
11
11
 
12
- RSpec.shared_examples 'UnknownTokenError' do |type, token|
12
+ RSpec.shared_examples 'UnknownTokenError' do |type|
13
13
  it "raises for unkown tokens of type #{type}" do
14
14
  expect { parser.send(:parse_token, Regexp::Token.new(type, :foo)) }
15
15
  .to raise_error(Regexp::Parser::UnknownTokenError)
@@ -25,7 +25,7 @@ RSpec.describe('EscapeSequence parsing') do
25
25
  include_examples 'parse', /a\u{41 1F60D}/, 1 => [:escape, :codepoint_list, EscapeSequence::CodepointList]
26
26
  include_examples 'parse', /a\u{10FFFF}/, 1 => [:escape, :codepoint_list, EscapeSequence::CodepointList]
27
27
 
28
- # hex escapes
28
+ # hex escapes
29
29
  include_examples 'parse', /a\xFF/n, 1 => [:escape, :hex, EscapeSequence::Hex]
30
30
 
31
31
  # octal escapes
@@ -11,6 +11,7 @@ RSpec.describe('Quantifier parsing') do
11
11
  expect(exp.quantifier.min).to eq min
12
12
  expect(exp.quantifier.max).to eq max
13
13
  expect(exp.quantifier.mode).to eq mode
14
+ expect(exp.quantifier.text).to eq text
14
15
  end
15
16
  end
16
17
 
@@ -37,6 +38,21 @@ RSpec.describe('Quantifier parsing') do
37
38
  include_examples 'quantifier', /a{4}+b/, '{4}+', :possessive, :interval, 4, 4
38
39
  include_examples 'quantifier', /a{004}+b/, '{004}+', :possessive, :interval, 4, 4
39
40
 
41
+ # special case: exps with chained quantifiers are wrapped in implicit passive groups
42
+ include_examples 'parse', /a+{2}{3}/,
43
+ 0 => [
44
+ :group, :passive, Group::Passive, implicit?: true, level: 0,
45
+ quantifier: Quantifier.new(:interval, '{3}', 3, 3, :greedy)
46
+ ],
47
+ [0, 0] => [
48
+ :group, :passive, Group::Passive, implicit?: true, level: 1,
49
+ quantifier: Quantifier.new(:interval, '{2}', 2, 2, :greedy)
50
+ ],
51
+ [0, 0, 0] => [
52
+ :literal, :literal, Literal, text: 'a', level: 2,
53
+ quantifier: Quantifier.new(:one_or_more, '+', 1, -1, :greedy)
54
+ ]
55
+
40
56
  specify('mode-checking methods') do
41
57
  exp = RP.parse(/a??/).first
42
58
 
@@ -17,7 +17,7 @@ RSpec.describe('CharacterSet::Range parsing') do
17
17
  end
18
18
 
19
19
  specify('parse set range hex') do
20
- root = RP.parse('[\\x00-\\x99]')
20
+ root = RP.parse('[\\x00-\\x22]')
21
21
  set = root[0]
22
22
  range = set[0]
23
23
 
@@ -26,9 +26,9 @@ RSpec.describe('CharacterSet::Range parsing') do
26
26
  expect(range.count).to eq 2
27
27
  expect(range.first.to_s).to eq '\\x00'
28
28
  expect(range.first).to be_instance_of(EscapeSequence::Hex)
29
- expect(range.last.to_s).to eq '\\x99'
29
+ expect(range.last.to_s).to eq '\\x22'
30
30
  expect(range.last).to be_instance_of(EscapeSequence::Hex)
31
- expect(set).to match '\\x50'
31
+ expect(set).to match "\x11"
32
32
  end
33
33
 
34
34
  specify('parse set range unicode') do
@@ -11,7 +11,13 @@ RSpec.describe('Escape scanning') do
11
11
  include_examples 'scan', /c\tt/, 1 => [:escape, :tab, '\t', 1, 3]
12
12
  include_examples 'scan', /c\vt/, 1 => [:escape, :vertical_tab, '\v', 1, 3]
13
13
 
14
+ # ineffectual literal escapes
15
+ # these cause "Unknown escape" warnings in Ruby for ascii chars,
16
+ # and simply drop the backslash for non-ascii chars (/\ü/.inspect == '/ü/').
17
+ # In terms of matching, Ruby treats them both like non-escaped literals.
14
18
  include_examples 'scan', 'c\qt', 1 => [:escape, :literal, '\q', 1, 3]
19
+ include_examples 'scan', 'a\üc', 1 => [:escape, :literal, '\ü', 1, 3]
20
+ include_examples 'scan', 'a\😋c', 1 => [:escape, :literal, '\😋', 1, 3]
15
21
 
16
22
  # these incomplete ref/call sequences are treated as literal escapes by Ruby
17
23
  include_examples 'scan', 'c\gt', 1 => [:escape, :literal, '\g', 1, 3]
@@ -21,6 +27,7 @@ RSpec.describe('Escape scanning') do
21
27
  include_examples 'scan', 'a\0124', 1 => [:escape, :octal, '\012', 1, 5]
22
28
  include_examples 'scan', '\712+7', 0 => [:escape, :octal, '\712', 0, 4]
23
29
 
30
+ include_examples 'scan', 'a\xA', 1 => [:escape, :hex, '\xA', 1, 4]
24
31
  include_examples 'scan', 'a\x24c', 1 => [:escape, :hex, '\x24', 1, 5]
25
32
  include_examples 'scan', 'a\x0640c', 1 => [:escape, :hex, '\x06', 1, 5]
26
33
 
@@ -5,11 +5,20 @@ RSpec.describe('Group scanning') do
5
5
  include_examples 'scan', '(?>abc)', 0 => [:group, :atomic, '(?>', 0, 3]
6
6
  include_examples 'scan', '(abc)', 0 => [:group, :capture, '(', 0, 1]
7
7
 
8
+ # Named groups
9
+ # only names that start with a hyphen or digit (ascii or other) are invalid
8
10
  include_examples 'scan', '(?<name>abc)', 0 => [:group, :named_ab, '(?<name>', 0, 8]
9
11
  include_examples 'scan', "(?'name'abc)", 0 => [:group, :named_sq, "(?'name'", 0, 8]
10
-
11
12
  include_examples 'scan', '(?<name_1>abc)', 0 => [:group, :named_ab, '(?<name_1>', 0,10]
12
13
  include_examples 'scan', "(?'name_1'abc)", 0 => [:group, :named_sq, "(?'name_1'", 0,10]
14
+ include_examples 'scan', '(?<name-1>abc)', 0 => [:group, :named_ab, '(?<name-1>', 0,10]
15
+ include_examples 'scan', "(?'name-1'abc)", 0 => [:group, :named_sq, "(?'name-1'", 0,10]
16
+ include_examples 'scan', "(?<name'1>abc)", 0 => [:group, :named_ab, "(?<name'1>", 0,10]
17
+ include_examples 'scan', "(?'name>1'abc)", 0 => [:group, :named_sq, "(?'name>1'", 0,10]
18
+ include_examples 'scan', '(?<üüuuüü>abc)', 0 => [:group, :named_ab, '(?<üüuuüü>', 0,10]
19
+ include_examples 'scan', "(?'üüuuüü'abc)", 0 => [:group, :named_sq, "(?'üüuuüü'", 0,10]
20
+ include_examples 'scan', "(?<😋1234😋>abc)", 0 => [:group, :named_ab, "(?<😋1234😋>", 0,10]
21
+ include_examples 'scan', "(?'😋1234😋'abc)", 0 => [:group, :named_sq, "(?'😋1234😋'", 0,10]
13
22
 
14
23
  include_examples 'scan', '(?:abc)', 0 => [:group, :passive, '(?:', 0, 3]
15
24
  include_examples 'scan', '(?:)', 0 => [:group, :passive, '(?:', 0, 3]
@@ -2,48 +2,38 @@ require 'spec_helper'
2
2
 
3
3
  RSpec.describe('UTF8 scanning') do
4
4
  # ascii, single byte characters
5
- include_examples 'scan', 'a', 0 => [:literal, :literal, 'a', 0, 1]
5
+ include_examples 'scan', 'a',
6
+ 0 => [:literal, :literal, 'a', 0, 1]
6
7
 
7
- include_examples 'scan', 'ab+', 0 => [:literal, :literal, 'ab', 0, 2]
8
- include_examples 'scan', 'ab+', 1 => [:quantifier, :one_or_more, '+', 2, 3]
8
+ include_examples 'scan', 'ab+',
9
+ 0 => [:literal, :literal, 'ab', 0, 2],
10
+ 1 => [:quantifier, :one_or_more, '+', 2, 3]
9
11
 
10
- # 2 byte wide characters, Arabic
11
- include_examples 'scan', 'aاbبcت', 0 => [:literal, :literal, 'aاbبcت', 0, 9]
12
-
13
- include_examples 'scan', 'aاbبت?', 0 => [:literal, :literal, 'aاbبت', 0, 8]
14
- include_examples 'scan', 'aاbبت?', 1 => [:quantifier, :zero_or_one, '?', 8, 9]
15
-
16
- include_examples 'scan', 'aا?bبcت+', 0 => [:literal, :literal, 'aا', 0, 3]
17
- include_examples 'scan', 'aا?bبcت+', 1 => [:quantifier, :zero_or_one, '?', 3, 4]
18
- include_examples 'scan', 'aا?bبcت+', 2 => [:literal, :literal, 'bبcت', 4, 10]
19
- include_examples 'scan', 'aا?bبcت+', 3 => [:quantifier, :one_or_more, '+', 10, 11]
20
-
21
- include_examples 'scan', 'a(اbب+)cت?', 0 => [:literal, :literal, 'a', 0, 1]
22
- include_examples 'scan', 'a(اbب+)cت?', 1 => [:group, :capture, '(', 1, 2]
23
- include_examples 'scan', 'a(اbب+)cت?', 2 => [:literal, :literal, 'اbب', 2, 7]
24
- include_examples 'scan', 'a(اbب+)cت?', 3 => [:quantifier, :one_or_more, '+', 7, 8]
25
- include_examples 'scan', 'a(اbب+)cت?', 4 => [:group, :close, ')', 8, 9]
26
- include_examples 'scan', 'a(اbب+)cت?', 5 => [:literal, :literal, 'cت', 9, 12]
27
- include_examples 'scan', 'a(اbب+)cت?', 6 => [:quantifier, :zero_or_one, '?', 12, 13]
12
+ # 2 byte wide characters
13
+ include_examples 'scan', 'äöü',
14
+ 0 => [:literal, :literal, 'äöü', 0, 3]
28
15
 
29
16
  # 3 byte wide characters, Japanese
30
- include_examples 'scan', 'ab?れます+cd', 0 => [:literal, :literal, 'ab', 0, 2]
31
- include_examples 'scan', 'ab?れます+cd', 1 => [:quantifier, :zero_or_one, '?', 2, 3]
32
- include_examples 'scan', 'ab?れます+cd', 2 => [:literal, :literal, 'れます', 3, 12]
33
- include_examples 'scan', 'ab?れます+cd', 3 => [:quantifier, :one_or_more, '+', 12, 13]
34
- include_examples 'scan', 'ab?れます+cd', 4 => [:literal, :literal, 'cd', 13, 15]
17
+ include_examples 'scan', 'ab?れます+cd',
18
+ 0 => [:literal, :literal, 'ab', 0, 2],
19
+ 1 => [:quantifier, :zero_or_one, '?', 2, 3],
20
+ 2 => [:literal, :literal, 'れます', 3, 6],
21
+ 3 => [:quantifier, :one_or_more, '+', 6, 7],
22
+ 4 => [:literal, :literal, 'cd', 7, 9]
35
23
 
36
24
  # 4 byte wide characters, Osmanya
37
- include_examples 'scan', '𐒀𐒁?𐒂ab+𐒃', 0 => [:literal, :literal, '𐒀𐒁', 0, 8]
38
- include_examples 'scan', '𐒀𐒁?𐒂ab+𐒃', 1 => [:quantifier, :zero_or_one, '?', 8, 9]
39
- include_examples 'scan', '𐒀𐒁?𐒂ab+𐒃', 2 => [:literal, :literal, '𐒂ab', 9, 15]
40
- include_examples 'scan', '𐒀𐒁?𐒂ab+𐒃', 3 => [:quantifier, :one_or_more, '+', 15, 16]
41
- include_examples 'scan', '𐒀𐒁?𐒂ab+𐒃', 4 => [:literal, :literal, '𐒃', 16, 20]
42
-
43
- include_examples 'scan', 'mu𝄞?si*𝄫c+', 0 => [:literal, :literal, 'mu𝄞', 0, 6]
44
- include_examples 'scan', 'mu𝄞?si*𝄫c+', 1 => [:quantifier, :zero_or_one, '?', 6, 7]
45
- include_examples 'scan', 'mu𝄞?si*𝄫c+', 2 => [:literal, :literal, 'si', 7, 9]
46
- include_examples 'scan', 'mu𝄞?si*𝄫c+', 3 => [:quantifier, :zero_or_more, '*', 9, 10]
47
- include_examples 'scan', 'mu𝄞?si*𝄫c+', 4 => [:literal, :literal, '𝄫c', 10, 15]
48
- include_examples 'scan', 'mu𝄞?si*𝄫c+', 5 => [:quantifier, :one_or_more, '+', 15, 16]
25
+ include_examples 'scan', '𐒀𐒁?𐒂ab+𐒃',
26
+ 0 => [:literal, :literal, '𐒀𐒁', 0, 2],
27
+ 1 => [:quantifier, :zero_or_one, '?', 2, 3],
28
+ 2 => [:literal, :literal, '𐒂ab', 3, 6],
29
+ 3 => [:quantifier, :one_or_more, '+', 6, 7],
30
+ 4 => [:literal, :literal, '𐒃', 7, 8]
31
+
32
+ include_examples 'scan', 'mu𝄞?si*𝄫c+',
33
+ 0 => [:literal, :literal, 'mu𝄞', 0, 3],
34
+ 1 => [:quantifier, :zero_or_one, '?', 3, 4],
35
+ 2 => [:literal, :literal, 'si', 4, 6],
36
+ 3 => [:quantifier, :zero_or_more, '*', 6, 7],
37
+ 4 => [:literal, :literal, '𝄫c', 7, 9],
38
+ 5 => [:quantifier, :one_or_more, '+', 9, 10]
49
39
  end