regexp_parser 1.8.2 → 2.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +62 -0
  3. data/Gemfile +1 -0
  4. data/README.md +1 -4
  5. data/Rakefile +2 -2
  6. data/lib/regexp_parser/expression.rb +4 -17
  7. data/lib/regexp_parser/expression/classes/free_space.rb +1 -1
  8. data/lib/regexp_parser/expression/classes/group.rb +22 -2
  9. data/lib/regexp_parser/expression/classes/root.rb +4 -16
  10. data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
  11. data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
  12. data/lib/regexp_parser/expression/quantifier.rb +9 -0
  13. data/lib/regexp_parser/expression/sequence.rb +0 -10
  14. data/lib/regexp_parser/lexer.rb +2 -2
  15. data/lib/regexp_parser/parser.rb +27 -2
  16. data/lib/regexp_parser/scanner.rb +1194 -1272
  17. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  18. data/lib/regexp_parser/scanner/property.rl +2 -2
  19. data/lib/regexp_parser/scanner/scanner.rl +178 -186
  20. data/lib/regexp_parser/syntax.rb +4 -4
  21. data/lib/regexp_parser/syntax/any.rb +2 -2
  22. data/lib/regexp_parser/syntax/base.rb +1 -1
  23. data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
  24. data/lib/regexp_parser/version.rb +1 -1
  25. data/spec/expression/base_spec.rb +10 -0
  26. data/spec/expression/subexpression_spec.rb +1 -1
  27. data/spec/expression/to_s_spec.rb +39 -31
  28. data/spec/lexer/literals_spec.rb +24 -49
  29. data/spec/parser/errors_spec.rb +1 -1
  30. data/spec/parser/escapes_spec.rb +1 -1
  31. data/spec/parser/quantifiers_spec.rb +16 -0
  32. data/spec/parser/set/ranges_spec.rb +3 -3
  33. data/spec/scanner/escapes_spec.rb +7 -0
  34. data/spec/scanner/groups_spec.rb +10 -1
  35. data/spec/scanner/literals_spec.rb +28 -38
  36. data/spec/scanner/quantifiers_spec.rb +18 -13
  37. data/spec/scanner/sets_spec.rb +23 -5
  38. data/spec/spec_helper.rb +1 -0
  39. metadata +3 -7
  40. data/spec/expression/root_spec.rb +0 -9
  41. data/spec/expression/sequence_spec.rb +0 -9
@@ -1,9 +1,9 @@
1
+ module Regexp::Syntax
2
+ class SyntaxError < StandardError; end
3
+ end
4
+
1
5
  require File.expand_path('../syntax/tokens', __FILE__)
2
6
  require File.expand_path('../syntax/base', __FILE__)
3
7
  require File.expand_path('../syntax/any', __FILE__)
4
8
  require File.expand_path('../syntax/version_lookup', __FILE__)
5
9
  require File.expand_path('../syntax/versions', __FILE__)
6
-
7
- module Regexp::Syntax
8
- class SyntaxError < StandardError; end
9
- end
@@ -8,8 +8,8 @@ module Regexp::Syntax
8
8
  @implements = { :* => [:*] }
9
9
  end
10
10
 
11
- def implements?(type, token) true end
12
- def implements!(type, token) true end
11
+ def implements?(_type, _token) true end
12
+ def implements!(_type, _token) true end
13
13
  end
14
14
 
15
15
  end
@@ -1,7 +1,7 @@
1
1
  require 'set'
2
2
 
3
3
  module Regexp::Syntax
4
- class NotImplementedError < SyntaxError
4
+ class NotImplementedError < Regexp::Syntax::SyntaxError
5
5
  def initialize(syntax, type, token)
6
6
  super "#{syntax.class.name} does not implement: [#{type}:#{token}]"
7
7
  end
@@ -3,13 +3,13 @@ module Regexp::Syntax
3
3
  VERSION_REGEXP = /#{VERSION_FORMAT}/
4
4
  VERSION_CONST_REGEXP = /\AV\d+_\d+(?:_\d+)?\z/
5
5
 
6
- class InvalidVersionNameError < SyntaxError
6
+ class InvalidVersionNameError < Regexp::Syntax::SyntaxError
7
7
  def initialize(name)
8
8
  super "Invalid version name '#{name}'. Expected format is '#{VERSION_FORMAT}'"
9
9
  end
10
10
  end
11
11
 
12
- class UnknownSyntaxNameError < SyntaxError
12
+ class UnknownSyntaxNameError < Regexp::Syntax::SyntaxError
13
13
  def initialize(name)
14
14
  super "Unknown syntax name '#{name}'."
15
15
  end
@@ -1,5 +1,5 @@
1
1
  class Regexp
2
2
  class Parser
3
- VERSION = '1.8.2'
3
+ VERSION = '2.0.3'
4
4
  end
5
5
  end
@@ -91,4 +91,14 @@ RSpec.describe(Regexp::Expression::Base) do
91
91
  expect(RP.parse(/a*/)[0].repetitions).to eq 0..(Float::INFINITY)
92
92
  expect(RP.parse(/a+/)[0].repetitions).to eq 1..(Float::INFINITY)
93
93
  end
94
+
95
+ specify('#base_length') do
96
+ expect(RP.parse(/(aa)/)[0].base_length).to eq 4
97
+ expect(RP.parse(/(aa){42}/)[0].base_length).to eq 4
98
+ end
99
+
100
+ specify('#full_length') do
101
+ expect(RP.parse(/(aa)/)[0].full_length).to eq 4
102
+ expect(RP.parse(/(aa){42}/)[0].full_length).to eq 8
103
+ end
94
104
  end
@@ -32,7 +32,7 @@ RSpec.describe(Regexp::Expression::Subexpression) do
32
32
  }
33
33
 
34
34
  root.each_expression do |exp|
35
- next unless expected_nesting_level = tests.delete(exp.to_s)
35
+ next unless (expected_nesting_level = tests.delete(exp.to_s))
36
36
  expect(expected_nesting_level).to eq exp.nesting_level
37
37
  end
38
38
 
@@ -1,58 +1,50 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  RSpec.describe('Expression#to_s') do
4
- specify('literal alternation') do
5
- pattern = 'abcd|ghij|klmn|pqur'
4
+ def parse_frozen(pattern, ruby_version = nil)
5
+ IceNine.deep_freeze(RP.parse(pattern, *ruby_version))
6
+ end
7
+
8
+ def expect_round_trip(pattern, ruby_version = nil)
9
+ parsed = parse_frozen(pattern, ruby_version)
6
10
 
7
- expect(RP.parse(pattern).to_s).to eq pattern
11
+ expect(parsed.to_s).to eql(pattern)
8
12
  end
9
13
 
10
- specify('quantified alternations') do
11
- pattern = '(?:a?[b]+(c){2}|d+[e]*(f)?)|(?:g+[h]?(i){2,3}|j*[k]{3,5}(l)?)'
14
+ specify('literal alternation') do
15
+ expect_round_trip('abcd|ghij|klmn|pqur')
16
+ end
12
17
 
13
- expect(RP.parse(pattern).to_s).to eq pattern
18
+ specify('quantified alternations') do
19
+ expect_round_trip('(?:a?[b]+(c){2}|d+[e]*(f)?)|(?:g+[h]?(i){2,3}|j*[k]{3,5}(l)?)')
14
20
  end
15
21
 
16
22
  specify('quantified sets') do
17
- pattern = '[abc]+|[^def]{3,6}'
18
-
19
- expect(RP.parse(pattern).to_s).to eq pattern
23
+ expect_round_trip('[abc]+|[^def]{3,6}')
20
24
  end
21
25
 
22
26
  specify('property sets') do
23
- pattern = '[\\a\\b\\p{Lu}\\P{Z}\\c\\d]+'
24
-
25
- expect(RP.parse(pattern, 'ruby/1.9').to_s).to eq pattern
27
+ expect_round_trip('[\\a\\b\\p{Lu}\\P{Z}\\c\\d]+', 'ruby/1.9')
26
28
  end
27
29
 
28
30
  specify('groups') do
29
- pattern = "(a(?>b(?:c(?<n>d(?'N'e)??f)+g)*+h)*i)++"
30
-
31
- expect(RP.parse(pattern, 'ruby/1.9').to_s).to eq pattern
31
+ expect_round_trip("(a(?>b(?:c(?<n>d(?'N'e)??f)+g)*+h)*i)++", 'ruby/1.9')
32
32
  end
33
33
 
34
34
  specify('assertions') do
35
- pattern = '(a+(?=b+(?!c+(?<=d+(?<!e+)?f+)?g+)?h+)?i+)?'
36
-
37
- expect(RP.parse(pattern, 'ruby/1.9').to_s).to eq pattern
35
+ expect_round_trip('(a+(?=b+(?!c+(?<=d+(?<!e+)?f+)?g+)?h+)?i+)?', 'ruby/1.9')
38
36
  end
39
37
 
40
38
  specify('comments') do
41
- pattern = '(?#start)a(?#middle)b(?#end)'
42
-
43
- expect(RP.parse(pattern).to_s).to eq pattern
39
+ expect_round_trip('(?#start)a(?#middle)b(?#end)')
44
40
  end
45
41
 
46
42
  specify('options') do
47
- pattern = '(?mix:start)a(?-mix:middle)b(?i-mx:end)'
48
-
49
- expect(RP.parse(pattern).to_s).to eq pattern
43
+ expect_round_trip('(?mix:start)a(?-mix:middle)b(?i-mx:end)')
50
44
  end
51
45
 
52
46
  specify('url') do
53
- pattern = ('(^$)|(^(http|https):\\/\\/[a-z0-9]+([\\-\\.]{1}[a-z0-9]+)*' + '\\.[a-z]{2,5}(([0-9]{1,5})?\\/.*)?$)')
54
-
55
- expect(RP.parse(pattern).to_s).to eq pattern
47
+ expect_round_trip('(^$)|(^(http|https):\\/\\/[a-z0-9]+([\\-\\.]{1}[a-z0-9]+)*' + '\\.[a-z]{2,5}(([0-9]{1,5})?\\/.*)?$)')
56
48
  end
57
49
 
58
50
  specify('multiline source') do
@@ -64,7 +56,7 @@ RSpec.describe('Expression#to_s') do
64
56
  \z
65
57
  /x
66
58
 
67
- expect(RP.parse(multiline).to_s).to eq multiline.source
59
+ expect(parse_frozen(multiline).to_s).to eql(multiline.source)
68
60
  end
69
61
 
70
62
  specify('multiline #to_s') do
@@ -76,7 +68,7 @@ RSpec.describe('Expression#to_s') do
76
68
  \z
77
69
  /x
78
70
 
79
- expect(RP.parse(multiline.to_s).to_s).to eq multiline.to_s
71
+ expect_round_trip(multiline.to_s)
80
72
  end
81
73
 
82
74
  # Free spacing expressions that use spaces between quantifiers and their
@@ -93,8 +85,24 @@ RSpec.describe('Expression#to_s') do
93
85
  /x
94
86
 
95
87
  str = 'bbbcged'
96
- root = RP.parse(multiline)
88
+ root = parse_frozen(multiline)
89
+
90
+ expect(Regexp.new(root.to_s, Regexp::EXTENDED).match(str)[0]).to eql(multiline.match(str)[0])
91
+ end
92
+
93
+ # special case: implicit groups used for chained quantifiers produce no parens
94
+ specify 'chained quantifiers #to_s' do
95
+ pattern = /a+{1}{2}/
96
+ root = parse_frozen(pattern)
97
+ expect(root.to_s).to eql('a+{1}{2}')
98
+ end
97
99
 
98
- expect(Regexp.new(root.to_s, Regexp::EXTENDED).match(str)[0]).to eq multiline.match(str)[0]
100
+ # regression test for https://github.com/ammar/regexp_parser/issues/74
101
+ specify('non-ascii comment') do
102
+ pattern = '(?x) 😋 # 😋'
103
+ root = RP.parse(pattern)
104
+ expect(root.last).to be_a(Regexp::Expression::Comment)
105
+ expect(root.last.to_s).to eql('# 😋')
106
+ expect(root.to_s).to eql(pattern)
99
107
  end
100
108
  end
@@ -10,67 +10,42 @@ RSpec.describe('Literal lexing') do
10
10
  1 => [:literal, :literal, 'b', 1, 2, 0, 0, 0],
11
11
  2 => [:quantifier, :one_or_more, '+', 2, 3, 0, 0, 0]
12
12
 
13
- # 2 byte wide characters, Arabic
14
- include_examples 'lex', 'ا',
15
- 0 => [:literal, :literal, 'ا', 0, 2, 0, 0, 0]
16
-
17
- include_examples 'lex', 'aاbبcت',
18
- 0 => [:literal, :literal, 'aاbبcت', 0, 9, 0, 0, 0]
19
-
20
- include_examples 'lex', 'aاbبت?',
21
- 0 => [:literal, :literal, 'aاbب', 0, 6, 0, 0, 0],
22
- 1 => [:literal, :literal, 'ت', 6, 8, 0, 0, 0],
23
- 2 => [:quantifier, :zero_or_one, '?', 8, 9, 0, 0, 0]
24
-
25
- include_examples 'lex', 'aا?bبcت+',
26
- 0 => [:literal, :literal, 'a', 0, 1, 0, 0, 0],
27
- 1 => [:literal, :literal, 'ا', 1, 3, 0, 0, 0],
28
- 2 => [:quantifier, :zero_or_one, '?', 3, 4, 0, 0, 0],
29
- 3 => [:literal, :literal, 'bبc', 4, 8, 0, 0, 0],
30
- 4 => [:literal, :literal, 'ت', 8, 10, 0, 0, 0],
31
- 5 => [:quantifier, :one_or_more, '+', 10, 11, 0, 0, 0]
32
-
33
- include_examples 'lex', 'a(اbب+)cت?',
34
- 0 => [:literal, :literal, 'a', 0, 1, 0, 0, 0],
35
- 1 => [:group, :capture, '(', 1, 2, 0, 0, 0],
36
- 2 => [:literal, :literal, 'اb', 2, 5, 1, 0, 0],
37
- 3 => [:literal, :literal, 'ب', 5, 7, 1, 0, 0],
38
- 4 => [:quantifier, :one_or_more, '+', 7, 8, 1, 0, 0],
39
- 5 => [:group, :close, ')', 8, 9, 0, 0, 0],
40
- 6 => [:literal, :literal, 'c', 9, 10, 0, 0, 0],
41
- 7 => [:literal, :literal, 'ت', 10, 12, 0, 0, 0],
42
- 8 => [:quantifier, :zero_or_one, '?', 12, 13, 0, 0, 0]
13
+ # 2 byte wide characters
14
+ include_examples 'lex', 'äöü+',
15
+ 0 => [:literal, :literal, 'äö', 0, 2, 0, 0, 0],
16
+ 1 => [:literal, :literal, 'ü', 2, 3, 0, 0, 0],
17
+ 2 => [:quantifier, :one_or_more, '+', 3, 4, 0, 0, 0]
43
18
 
44
19
  # 3 byte wide characters, Japanese
45
20
  include_examples 'lex', 'ab?れます+cd',
46
21
  0 => [:literal, :literal, 'a', 0, 1, 0, 0, 0],
47
22
  1 => [:literal, :literal, 'b', 1, 2, 0, 0, 0],
48
23
  2 => [:quantifier, :zero_or_one, '?', 2, 3, 0, 0, 0],
49
- 3 => [:literal, :literal, 'れま', 3, 9, 0, 0, 0],
50
- 4 => [:literal, :literal, 'す', 9, 12, 0, 0, 0],
51
- 5 => [:quantifier, :one_or_more, '+', 12, 13, 0, 0, 0],
52
- 6 => [:literal, :literal, 'cd', 13, 15, 0, 0, 0]
24
+ 3 => [:literal, :literal, 'れま', 3, 5, 0, 0, 0],
25
+ 4 => [:literal, :literal, 'す', 5, 6, 0, 0, 0],
26
+ 5 => [:quantifier, :one_or_more, '+', 6, 7, 0, 0, 0],
27
+ 6 => [:literal, :literal, 'cd', 7, 9, 0, 0, 0]
53
28
 
54
29
  # 4 byte wide characters, Osmanya
55
30
  include_examples 'lex', '𐒀𐒁?𐒂ab+𐒃',
56
- 0 => [:literal, :literal, '𐒀', 0, 4, 0, 0, 0],
57
- 1 => [:literal, :literal, '𐒁', 4, 8, 0, 0, 0],
58
- 2 => [:quantifier, :zero_or_one, '?', 8, 9, 0, 0, 0],
59
- 3 => [:literal, :literal, '𐒂a', 9, 14, 0, 0, 0],
60
- 4 => [:literal, :literal, 'b', 14, 15, 0, 0, 0],
61
- 5 => [:quantifier, :one_or_more, '+', 15, 16, 0, 0, 0],
62
- 6 => [:literal, :literal, '𐒃', 16, 20, 0, 0, 0]
31
+ 0 => [:literal, :literal, '𐒀', 0, 1, 0, 0, 0],
32
+ 1 => [:literal, :literal, '𐒁', 1, 2, 0, 0, 0],
33
+ 2 => [:quantifier, :zero_or_one, '?', 2, 3, 0, 0, 0],
34
+ 3 => [:literal, :literal, '𐒂a', 3, 5, 0, 0, 0],
35
+ 4 => [:literal, :literal, 'b', 5, 6, 0, 0, 0],
36
+ 5 => [:quantifier, :one_or_more, '+', 6, 7, 0, 0, 0],
37
+ 6 => [:literal, :literal, '𐒃', 7, 8, 0, 0, 0]
63
38
 
64
39
  include_examples 'lex', 'mu𝄞?si*𝄫c+',
65
40
  0 => [:literal, :literal, 'mu', 0, 2, 0, 0, 0],
66
- 1 => [:literal, :literal, '𝄞', 2, 6, 0, 0, 0],
67
- 2 => [:quantifier, :zero_or_one, '?', 6, 7, 0, 0, 0],
68
- 3 => [:literal, :literal, 's', 7, 8, 0, 0, 0],
69
- 4 => [:literal, :literal, 'i', 8, 9, 0, 0, 0],
70
- 5 => [:quantifier, :zero_or_more, '*', 9, 10, 0, 0, 0],
71
- 6 => [:literal, :literal, '𝄫', 10, 14, 0, 0, 0],
72
- 7 => [:literal, :literal, 'c', 14, 15, 0, 0, 0],
73
- 8 => [:quantifier, :one_or_more, '+', 15, 16, 0, 0, 0]
41
+ 1 => [:literal, :literal, '𝄞', 2, 3, 0, 0, 0],
42
+ 2 => [:quantifier, :zero_or_one, '?', 3, 4, 0, 0, 0],
43
+ 3 => [:literal, :literal, 's', 4, 5, 0, 0, 0],
44
+ 4 => [:literal, :literal, 'i', 5, 6, 0, 0, 0],
45
+ 5 => [:quantifier, :zero_or_more, '*', 6, 7, 0, 0, 0],
46
+ 6 => [:literal, :literal, '𝄫', 7, 8, 0, 0, 0],
47
+ 7 => [:literal, :literal, 'c', 8, 9, 0, 0, 0],
48
+ 8 => [:quantifier, :one_or_more, '+', 9, 10, 0, 0, 0]
74
49
 
75
50
  specify('lex single 2 byte char') do
76
51
  tokens = RL.lex("\u0627+")
@@ -9,7 +9,7 @@ RSpec.describe('Parsing errors') do
9
9
  .to raise_error(Regexp::Parser::UnknownTokenTypeError)
10
10
  end
11
11
 
12
- RSpec.shared_examples 'UnknownTokenError' do |type, token|
12
+ RSpec.shared_examples 'UnknownTokenError' do |type|
13
13
  it "raises for unkown tokens of type #{type}" do
14
14
  expect { parser.send(:parse_token, Regexp::Token.new(type, :foo)) }
15
15
  .to raise_error(Regexp::Parser::UnknownTokenError)
@@ -25,7 +25,7 @@ RSpec.describe('EscapeSequence parsing') do
25
25
  include_examples 'parse', /a\u{41 1F60D}/, 1 => [:escape, :codepoint_list, EscapeSequence::CodepointList]
26
26
  include_examples 'parse', /a\u{10FFFF}/, 1 => [:escape, :codepoint_list, EscapeSequence::CodepointList]
27
27
 
28
- # hex escapes
28
+ # hex escapes
29
29
  include_examples 'parse', /a\xFF/n, 1 => [:escape, :hex, EscapeSequence::Hex]
30
30
 
31
31
  # octal escapes
@@ -11,6 +11,7 @@ RSpec.describe('Quantifier parsing') do
11
11
  expect(exp.quantifier.min).to eq min
12
12
  expect(exp.quantifier.max).to eq max
13
13
  expect(exp.quantifier.mode).to eq mode
14
+ expect(exp.quantifier.text).to eq text
14
15
  end
15
16
  end
16
17
 
@@ -37,6 +38,21 @@ RSpec.describe('Quantifier parsing') do
37
38
  include_examples 'quantifier', /a{4}+b/, '{4}+', :possessive, :interval, 4, 4
38
39
  include_examples 'quantifier', /a{004}+b/, '{004}+', :possessive, :interval, 4, 4
39
40
 
41
+ # special case: exps with chained quantifiers are wrapped in implicit passive groups
42
+ include_examples 'parse', /a+{2}{3}/,
43
+ 0 => [
44
+ :group, :passive, Group::Passive, implicit?: true, level: 0,
45
+ quantifier: Quantifier.new(:interval, '{3}', 3, 3, :greedy)
46
+ ],
47
+ [0, 0] => [
48
+ :group, :passive, Group::Passive, implicit?: true, level: 1,
49
+ quantifier: Quantifier.new(:interval, '{2}', 2, 2, :greedy)
50
+ ],
51
+ [0, 0, 0] => [
52
+ :literal, :literal, Literal, text: 'a', level: 2,
53
+ quantifier: Quantifier.new(:one_or_more, '+', 1, -1, :greedy)
54
+ ]
55
+
40
56
  specify('mode-checking methods') do
41
57
  exp = RP.parse(/a??/).first
42
58
 
@@ -17,7 +17,7 @@ RSpec.describe('CharacterSet::Range parsing') do
17
17
  end
18
18
 
19
19
  specify('parse set range hex') do
20
- root = RP.parse('[\\x00-\\x99]')
20
+ root = RP.parse('[\\x00-\\x22]')
21
21
  set = root[0]
22
22
  range = set[0]
23
23
 
@@ -26,9 +26,9 @@ RSpec.describe('CharacterSet::Range parsing') do
26
26
  expect(range.count).to eq 2
27
27
  expect(range.first.to_s).to eq '\\x00'
28
28
  expect(range.first).to be_instance_of(EscapeSequence::Hex)
29
- expect(range.last.to_s).to eq '\\x99'
29
+ expect(range.last.to_s).to eq '\\x22'
30
30
  expect(range.last).to be_instance_of(EscapeSequence::Hex)
31
- expect(set).to match '\\x50'
31
+ expect(set).to match "\x11"
32
32
  end
33
33
 
34
34
  specify('parse set range unicode') do
@@ -11,7 +11,13 @@ RSpec.describe('Escape scanning') do
11
11
  include_examples 'scan', /c\tt/, 1 => [:escape, :tab, '\t', 1, 3]
12
12
  include_examples 'scan', /c\vt/, 1 => [:escape, :vertical_tab, '\v', 1, 3]
13
13
 
14
+ # ineffectual literal escapes
15
+ # these cause "Unknown escape" warnings in Ruby for ascii chars,
16
+ # and simply drop the backslash for non-ascii chars (/\ü/.inspect == '/ü/').
17
+ # In terms of matching, Ruby treats them both like non-escaped literals.
14
18
  include_examples 'scan', 'c\qt', 1 => [:escape, :literal, '\q', 1, 3]
19
+ include_examples 'scan', 'a\üc', 1 => [:escape, :literal, '\ü', 1, 3]
20
+ include_examples 'scan', 'a\😋c', 1 => [:escape, :literal, '\😋', 1, 3]
15
21
 
16
22
  # these incomplete ref/call sequences are treated as literal escapes by Ruby
17
23
  include_examples 'scan', 'c\gt', 1 => [:escape, :literal, '\g', 1, 3]
@@ -21,6 +27,7 @@ RSpec.describe('Escape scanning') do
21
27
  include_examples 'scan', 'a\0124', 1 => [:escape, :octal, '\012', 1, 5]
22
28
  include_examples 'scan', '\712+7', 0 => [:escape, :octal, '\712', 0, 4]
23
29
 
30
+ include_examples 'scan', 'a\xA', 1 => [:escape, :hex, '\xA', 1, 4]
24
31
  include_examples 'scan', 'a\x24c', 1 => [:escape, :hex, '\x24', 1, 5]
25
32
  include_examples 'scan', 'a\x0640c', 1 => [:escape, :hex, '\x06', 1, 5]
26
33
 
@@ -5,11 +5,20 @@ RSpec.describe('Group scanning') do
5
5
  include_examples 'scan', '(?>abc)', 0 => [:group, :atomic, '(?>', 0, 3]
6
6
  include_examples 'scan', '(abc)', 0 => [:group, :capture, '(', 0, 1]
7
7
 
8
+ # Named groups
9
+ # only names that start with a hyphen or digit (ascii or other) are invalid
8
10
  include_examples 'scan', '(?<name>abc)', 0 => [:group, :named_ab, '(?<name>', 0, 8]
9
11
  include_examples 'scan', "(?'name'abc)", 0 => [:group, :named_sq, "(?'name'", 0, 8]
10
-
11
12
  include_examples 'scan', '(?<name_1>abc)', 0 => [:group, :named_ab, '(?<name_1>', 0,10]
12
13
  include_examples 'scan', "(?'name_1'abc)", 0 => [:group, :named_sq, "(?'name_1'", 0,10]
14
+ include_examples 'scan', '(?<name-1>abc)', 0 => [:group, :named_ab, '(?<name-1>', 0,10]
15
+ include_examples 'scan', "(?'name-1'abc)", 0 => [:group, :named_sq, "(?'name-1'", 0,10]
16
+ include_examples 'scan', "(?<name'1>abc)", 0 => [:group, :named_ab, "(?<name'1>", 0,10]
17
+ include_examples 'scan', "(?'name>1'abc)", 0 => [:group, :named_sq, "(?'name>1'", 0,10]
18
+ include_examples 'scan', '(?<üüuuüü>abc)', 0 => [:group, :named_ab, '(?<üüuuüü>', 0,10]
19
+ include_examples 'scan', "(?'üüuuüü'abc)", 0 => [:group, :named_sq, "(?'üüuuüü'", 0,10]
20
+ include_examples 'scan', "(?<😋1234😋>abc)", 0 => [:group, :named_ab, "(?<😋1234😋>", 0,10]
21
+ include_examples 'scan', "(?'😋1234😋'abc)", 0 => [:group, :named_sq, "(?'😋1234😋'", 0,10]
13
22
 
14
23
  include_examples 'scan', '(?:abc)', 0 => [:group, :passive, '(?:', 0, 3]
15
24
  include_examples 'scan', '(?:)', 0 => [:group, :passive, '(?:', 0, 3]
@@ -2,48 +2,38 @@ require 'spec_helper'
2
2
 
3
3
  RSpec.describe('UTF8 scanning') do
4
4
  # ascii, single byte characters
5
- include_examples 'scan', 'a', 0 => [:literal, :literal, 'a', 0, 1]
5
+ include_examples 'scan', 'a',
6
+ 0 => [:literal, :literal, 'a', 0, 1]
6
7
 
7
- include_examples 'scan', 'ab+', 0 => [:literal, :literal, 'ab', 0, 2]
8
- include_examples 'scan', 'ab+', 1 => [:quantifier, :one_or_more, '+', 2, 3]
8
+ include_examples 'scan', 'ab+',
9
+ 0 => [:literal, :literal, 'ab', 0, 2],
10
+ 1 => [:quantifier, :one_or_more, '+', 2, 3]
9
11
 
10
- # 2 byte wide characters, Arabic
11
- include_examples 'scan', 'aاbبcت', 0 => [:literal, :literal, 'aاbبcت', 0, 9]
12
-
13
- include_examples 'scan', 'aاbبت?', 0 => [:literal, :literal, 'aاbبت', 0, 8]
14
- include_examples 'scan', 'aاbبت?', 1 => [:quantifier, :zero_or_one, '?', 8, 9]
15
-
16
- include_examples 'scan', 'aا?bبcت+', 0 => [:literal, :literal, 'aا', 0, 3]
17
- include_examples 'scan', 'aا?bبcت+', 1 => [:quantifier, :zero_or_one, '?', 3, 4]
18
- include_examples 'scan', 'aا?bبcت+', 2 => [:literal, :literal, 'bبcت', 4, 10]
19
- include_examples 'scan', 'aا?bبcت+', 3 => [:quantifier, :one_or_more, '+', 10, 11]
20
-
21
- include_examples 'scan', 'a(اbب+)cت?', 0 => [:literal, :literal, 'a', 0, 1]
22
- include_examples 'scan', 'a(اbب+)cت?', 1 => [:group, :capture, '(', 1, 2]
23
- include_examples 'scan', 'a(اbب+)cت?', 2 => [:literal, :literal, 'اbب', 2, 7]
24
- include_examples 'scan', 'a(اbب+)cت?', 3 => [:quantifier, :one_or_more, '+', 7, 8]
25
- include_examples 'scan', 'a(اbب+)cت?', 4 => [:group, :close, ')', 8, 9]
26
- include_examples 'scan', 'a(اbب+)cت?', 5 => [:literal, :literal, 'cت', 9, 12]
27
- include_examples 'scan', 'a(اbب+)cت?', 6 => [:quantifier, :zero_or_one, '?', 12, 13]
12
+ # 2 byte wide characters
13
+ include_examples 'scan', 'äöü',
14
+ 0 => [:literal, :literal, 'äöü', 0, 3]
28
15
 
29
16
  # 3 byte wide characters, Japanese
30
- include_examples 'scan', 'ab?れます+cd', 0 => [:literal, :literal, 'ab', 0, 2]
31
- include_examples 'scan', 'ab?れます+cd', 1 => [:quantifier, :zero_or_one, '?', 2, 3]
32
- include_examples 'scan', 'ab?れます+cd', 2 => [:literal, :literal, 'れます', 3, 12]
33
- include_examples 'scan', 'ab?れます+cd', 3 => [:quantifier, :one_or_more, '+', 12, 13]
34
- include_examples 'scan', 'ab?れます+cd', 4 => [:literal, :literal, 'cd', 13, 15]
17
+ include_examples 'scan', 'ab?れます+cd',
18
+ 0 => [:literal, :literal, 'ab', 0, 2],
19
+ 1 => [:quantifier, :zero_or_one, '?', 2, 3],
20
+ 2 => [:literal, :literal, 'れます', 3, 6],
21
+ 3 => [:quantifier, :one_or_more, '+', 6, 7],
22
+ 4 => [:literal, :literal, 'cd', 7, 9]
35
23
 
36
24
  # 4 byte wide characters, Osmanya
37
- include_examples 'scan', '𐒀𐒁?𐒂ab+𐒃', 0 => [:literal, :literal, '𐒀𐒁', 0, 8]
38
- include_examples 'scan', '𐒀𐒁?𐒂ab+𐒃', 1 => [:quantifier, :zero_or_one, '?', 8, 9]
39
- include_examples 'scan', '𐒀𐒁?𐒂ab+𐒃', 2 => [:literal, :literal, '𐒂ab', 9, 15]
40
- include_examples 'scan', '𐒀𐒁?𐒂ab+𐒃', 3 => [:quantifier, :one_or_more, '+', 15, 16]
41
- include_examples 'scan', '𐒀𐒁?𐒂ab+𐒃', 4 => [:literal, :literal, '𐒃', 16, 20]
42
-
43
- include_examples 'scan', 'mu𝄞?si*𝄫c+', 0 => [:literal, :literal, 'mu𝄞', 0, 6]
44
- include_examples 'scan', 'mu𝄞?si*𝄫c+', 1 => [:quantifier, :zero_or_one, '?', 6, 7]
45
- include_examples 'scan', 'mu𝄞?si*𝄫c+', 2 => [:literal, :literal, 'si', 7, 9]
46
- include_examples 'scan', 'mu𝄞?si*𝄫c+', 3 => [:quantifier, :zero_or_more, '*', 9, 10]
47
- include_examples 'scan', 'mu𝄞?si*𝄫c+', 4 => [:literal, :literal, '𝄫c', 10, 15]
48
- include_examples 'scan', 'mu𝄞?si*𝄫c+', 5 => [:quantifier, :one_or_more, '+', 15, 16]
25
+ include_examples 'scan', '𐒀𐒁?𐒂ab+𐒃',
26
+ 0 => [:literal, :literal, '𐒀𐒁', 0, 2],
27
+ 1 => [:quantifier, :zero_or_one, '?', 2, 3],
28
+ 2 => [:literal, :literal, '𐒂ab', 3, 6],
29
+ 3 => [:quantifier, :one_or_more, '+', 6, 7],
30
+ 4 => [:literal, :literal, '𐒃', 7, 8]
31
+
32
+ include_examples 'scan', 'mu𝄞?si*𝄫c+',
33
+ 0 => [:literal, :literal, 'mu𝄞', 0, 3],
34
+ 1 => [:quantifier, :zero_or_one, '?', 3, 4],
35
+ 2 => [:literal, :literal, 'si', 4, 6],
36
+ 3 => [:quantifier, :zero_or_more, '*', 6, 7],
37
+ 4 => [:literal, :literal, '𝄫c', 7, 9],
38
+ 5 => [:quantifier, :one_or_more, '+', 9, 10]
49
39
  end