RubyGems - regexp_parser - Versions diffs - 1.8.1 → 2.0.3 - Mend

regexp_parser 1.8.1 → 2.0.3

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +70 -0
data/Gemfile +1 -0
data/README.md +12 -11
data/Rakefile +2 -2
data/lib/regexp_parser/expression.rb +10 -19
data/lib/regexp_parser/expression/classes/free_space.rb +1 -1
data/lib/regexp_parser/expression/classes/group.rb +22 -2
data/lib/regexp_parser/expression/classes/root.rb +4 -16
data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
data/lib/regexp_parser/expression/quantifier.rb +9 -0
data/lib/regexp_parser/expression/sequence.rb +0 -10
data/lib/regexp_parser/lexer.rb +2 -2
data/lib/regexp_parser/parser.rb +27 -2
data/lib/regexp_parser/scanner.rb +1194 -1272
data/lib/regexp_parser/scanner/char_type.rl +11 -11
data/lib/regexp_parser/scanner/property.rl +2 -2
data/lib/regexp_parser/scanner/scanner.rl +178 -186
data/lib/regexp_parser/syntax.rb +4 -4
data/lib/regexp_parser/syntax/any.rb +2 -2
data/lib/regexp_parser/syntax/base.rb +1 -1
data/lib/regexp_parser/syntax/version_lookup.rb +4 -4
data/lib/regexp_parser/version.rb +1 -1
data/spec/expression/base_spec.rb +10 -0
data/spec/expression/subexpression_spec.rb +1 -1
data/spec/expression/to_s_spec.rb +39 -31
data/spec/lexer/literals_spec.rb +24 -49
data/spec/parser/errors_spec.rb +1 -1
data/spec/parser/escapes_spec.rb +1 -1
data/spec/parser/quantifiers_spec.rb +16 -0
data/spec/parser/set/ranges_spec.rb +3 -3
data/spec/scanner/escapes_spec.rb +7 -0
data/spec/scanner/groups_spec.rb +10 -1
data/spec/scanner/literals_spec.rb +28 -38
data/spec/scanner/quantifiers_spec.rb +18 -13
data/spec/scanner/sets_spec.rb +23 -5
data/spec/spec_helper.rb +1 -0
metadata +56 -60
data/spec/expression/root_spec.rb +0 -9
data/spec/expression/sequence_spec.rb +0 -9

data/lib/regexp_parser/syntax.rb CHANGED

@@ -1,9 +1,9 @@
+module Regexp::Syntax
+  class SyntaxError < StandardError; end
+end
 require File.expand_path('../syntax/tokens', __FILE__)
 require File.expand_path('../syntax/base', __FILE__)
 require File.expand_path('../syntax/any', __FILE__)
 require File.expand_path('../syntax/version_lookup', __FILE__)
 require File.expand_path('../syntax/versions', __FILE__)
-module Regexp::Syntax
-  class SyntaxError < StandardError; end
-end

data/lib/regexp_parser/syntax/any.rb CHANGED

@@ -8,8 +8,8 @@ module Regexp::Syntax
       @implements = { :* => [:*] }
     end
-    def implements?(type, token) true end
-    def implements!(type, token) true end
+    def implements?(_type, _token) true end
+    def implements!(_type, _token) true end
   end
 end

data/lib/regexp_parser/syntax/base.rb CHANGED

@@ -1,7 +1,7 @@
 require 'set'
 module Regexp::Syntax
-  class NotImplementedError < SyntaxError
+  class NotImplementedError < Regexp::Syntax::SyntaxError
     def initialize(syntax, type, token)
       super "#{syntax.class.name} does not implement: [#{type}:#{token}]"
     end

data/lib/regexp_parser/syntax/version_lookup.rb CHANGED

@@ -3,13 +3,13 @@ module Regexp::Syntax
   VERSION_REGEXP = /#{VERSION_FORMAT}/
   VERSION_CONST_REGEXP = /\AV\d+_\d+(?:_\d+)?\z/
-  class InvalidVersionNameError < SyntaxError
+  class InvalidVersionNameError < Regexp::Syntax::SyntaxError
     def initialize(name)
       super "Invalid version name '#{name}'. Expected format is '#{VERSION_FORMAT}'"
     end
   end
-  class UnknownSyntaxNameError < SyntaxError
+  class UnknownSyntaxNameError < Regexp::Syntax::SyntaxError
     def initialize(name)
       super "Unknown syntax name '#{name}'."
     end
@@ -74,9 +74,9 @@ module Regexp::Syntax
   end
   def warn_if_future_version(const_name)
-    return if comparable_version(const_name) < comparable_version('3.0.0')
+    return if comparable_version(const_name) < comparable_version('4.0.0')
-    warn('This library has only been tested up to Ruby 2.x, '\
+    warn('This library has only been tested up to Ruby 3.x, '\
          "but you are running with #{const_get(const_name).inspect}")
   end
 end

data/lib/regexp_parser/version.rb CHANGED

@@ -1,5 +1,5 @@
 class Regexp
   class Parser
-    VERSION = '1.8.1'
+    VERSION = '2.0.3'
   end
 end

data/spec/expression/base_spec.rb CHANGED

@@ -91,4 +91,14 @@ RSpec.describe(Regexp::Expression::Base) do
     expect(RP.parse(/a*/)[0].repetitions).to eq 0..(Float::INFINITY)
     expect(RP.parse(/a+/)[0].repetitions).to eq 1..(Float::INFINITY)
   end
+  specify('#base_length') do
+    expect(RP.parse(/(aa)/)[0].base_length).to eq 4
+    expect(RP.parse(/(aa){42}/)[0].base_length).to eq 4
+  end
+  specify('#full_length') do
+    expect(RP.parse(/(aa)/)[0].full_length).to eq 4
+    expect(RP.parse(/(aa){42}/)[0].full_length).to eq 8
+  end
 end

data/spec/expression/subexpression_spec.rb CHANGED

@@ -32,7 +32,7 @@ RSpec.describe(Regexp::Expression::Subexpression) do
     }
     root.each_expression do |exp|
-      next unless expected_nesting_level = tests.delete(exp.to_s)
+      next unless (expected_nesting_level = tests.delete(exp.to_s))
       expect(expected_nesting_level).to eq exp.nesting_level
     end

data/spec/expression/to_s_spec.rb CHANGED

@@ -1,58 +1,50 @@
 require 'spec_helper'
 RSpec.describe('Expression#to_s') do
-  specify('literal alternation') do
-    pattern = 'abcd|ghij|klmn|pqur'
+  def parse_frozen(pattern, ruby_version = nil)
+    IceNine.deep_freeze(RP.parse(pattern, *ruby_version))
+  end
+  def expect_round_trip(pattern, ruby_version = nil)
+    parsed = parse_frozen(pattern, ruby_version)
-    expect(RP.parse(pattern).to_s).to eq pattern
+    expect(parsed.to_s).to eql(pattern)
   end
-  specify('quantified alternations') do
-    pattern = '(?:a?[b]+(c){2}|d+[e]*(f)?)|(?:g+[h]?(i){2,3}|j*[k]{3,5}(l)?)'
+  specify('literal alternation') do
+    expect_round_trip('abcd|ghij|klmn|pqur')
+  end
-    expect(RP.parse(pattern).to_s).to eq pattern
+  specify('quantified alternations') do
+    expect_round_trip('(?:a?[b]+(c){2}|d+[e]*(f)?)|(?:g+[h]?(i){2,3}|j*[k]{3,5}(l)?)')
   end
   specify('quantified sets') do
-    pattern = '[abc]+|[^def]{3,6}'
-    expect(RP.parse(pattern).to_s).to eq pattern
+    expect_round_trip('[abc]+|[^def]{3,6}')
   end
   specify('property sets') do
-    pattern = '[\\a\\b\\p{Lu}\\P{Z}\\c\\d]+'
-    expect(RP.parse(pattern, 'ruby/1.9').to_s).to eq pattern
+    expect_round_trip('[\\a\\b\\p{Lu}\\P{Z}\\c\\d]+', 'ruby/1.9')
   end
   specify('groups') do
-    pattern = "(a(?>b(?:c(?<n>d(?'N'e)??f)+g)*+h)*i)++"
-    expect(RP.parse(pattern, 'ruby/1.9').to_s).to eq pattern
+    expect_round_trip("(a(?>b(?:c(?<n>d(?'N'e)??f)+g)*+h)*i)++", 'ruby/1.9')
   end
   specify('assertions') do
-    pattern = '(a+(?=b+(?!c+(?<=d+(?<!e+)?f+)?g+)?h+)?i+)?'
-    expect(RP.parse(pattern, 'ruby/1.9').to_s).to eq pattern
+    expect_round_trip('(a+(?=b+(?!c+(?<=d+(?<!e+)?f+)?g+)?h+)?i+)?', 'ruby/1.9')
   end
   specify('comments') do
-    pattern = '(?#start)a(?#middle)b(?#end)'
-    expect(RP.parse(pattern).to_s).to eq pattern
+    expect_round_trip('(?#start)a(?#middle)b(?#end)')
   end
   specify('options') do
-    pattern = '(?mix:start)a(?-mix:middle)b(?i-mx:end)'
-    expect(RP.parse(pattern).to_s).to eq pattern
+    expect_round_trip('(?mix:start)a(?-mix:middle)b(?i-mx:end)')
   end
   specify('url') do
-    pattern = ('(^$)|(^(http|https):\\/\\/[a-z0-9]+([\\-\\.]{1}[a-z0-9]+)*' + '\\.[a-z]{2,5}(([0-9]{1,5})?\\/.*)?$)')
-    expect(RP.parse(pattern).to_s).to eq pattern
+    expect_round_trip('(^$)|(^(http|https):\\/\\/[a-z0-9]+([\\-\\.]{1}[a-z0-9]+)*' + '\\.[a-z]{2,5}(([0-9]{1,5})?\\/.*)?$)')
   end
   specify('multiline source') do
@@ -64,7 +56,7 @@ RSpec.describe('Expression#to_s') do
           \z
         /x
-    expect(RP.parse(multiline).to_s).to eq multiline.source
+    expect(parse_frozen(multiline).to_s).to eql(multiline.source)
   end
   specify('multiline #to_s') do
@@ -76,7 +68,7 @@ RSpec.describe('Expression#to_s') do
           \z
         /x
-    expect(RP.parse(multiline.to_s).to_s).to eq multiline.to_s
+    expect_round_trip(multiline.to_s)
   end
   # Free spacing expressions that use spaces between quantifiers and their
@@ -93,8 +85,24 @@ RSpec.describe('Expression#to_s') do
         /x
     str = 'bbbcged'
-    root = RP.parse(multiline)
+    root = parse_frozen(multiline)
+    expect(Regexp.new(root.to_s, Regexp::EXTENDED).match(str)[0]).to eql(multiline.match(str)[0])
+  end
+  # special case: implicit groups used for chained quantifiers produce no parens
+  specify 'chained quantifiers #to_s' do
+    pattern = /a+{1}{2}/
+    root = parse_frozen(pattern)
+    expect(root.to_s).to eql('a+{1}{2}')
+  end
-    expect(Regexp.new(root.to_s, Regexp::EXTENDED).match(str)[0]).to eq multiline.match(str)[0]
+  # regression test for https://github.com/ammar/regexp_parser/issues/74
+  specify('non-ascii comment') do
+    pattern = '(?x) 😋 # 😋'
+    root = RP.parse(pattern)
+    expect(root.last).to be_a(Regexp::Expression::Comment)
+    expect(root.last.to_s).to eql('# 😋')
+    expect(root.to_s).to eql(pattern)
   end
 end

data/spec/lexer/literals_spec.rb CHANGED

@@ -10,67 +10,42 @@ RSpec.describe('Literal lexing') do
     1 => [:literal,     :literal,       'b',        1, 2, 0, 0, 0],
     2 => [:quantifier,  :one_or_more,   '+',        2, 3, 0, 0, 0]
-  # 2 byte wide characters, Arabic
-  include_examples 'lex', 'ا',
-    0 => [:literal,     :literal,       'ا',        0, 2, 0, 0, 0]
-  include_examples 'lex', 'aاbبcت',
-    0 => [:literal,     :literal,       'aاbبcت',   0, 9, 0, 0, 0]
-  include_examples 'lex', 'aاbبت?',
-    0 => [:literal,     :literal,       'aاbب',     0, 6, 0, 0, 0],
-    1 => [:literal,     :literal,       'ت',        6, 8, 0, 0, 0],
-    2 => [:quantifier,  :zero_or_one,   '?',        8, 9, 0, 0, 0]
-  include_examples 'lex', 'aا?bبcت+',
-    0 => [:literal,     :literal,       'a',        0, 1, 0, 0, 0],
-    1 => [:literal,     :literal,       'ا',        1, 3, 0, 0, 0],
-    2 => [:quantifier,  :zero_or_one,   '?',        3, 4, 0, 0, 0],
-    3 => [:literal,     :literal,       'bبc',      4, 8, 0, 0, 0],
-    4 => [:literal,     :literal,       'ت',        8, 10, 0, 0, 0],
-    5 => [:quantifier,  :one_or_more,   '+',        10, 11, 0, 0, 0]
-  include_examples 'lex', 'a(اbب+)cت?',
-    0 => [:literal,     :literal,       'a',        0, 1, 0, 0, 0],
-    1 => [:group,       :capture,       '(',        1, 2, 0, 0, 0],
-    2 => [:literal,     :literal,       'اb',       2, 5, 1, 0, 0],
-    3 => [:literal,     :literal,       'ب',        5, 7, 1, 0, 0],
-    4 => [:quantifier,  :one_or_more,   '+',        7, 8, 1, 0, 0],
-    5 => [:group,       :close,         ')',        8, 9, 0, 0, 0],
-    6 => [:literal,     :literal,       'c',        9, 10, 0, 0, 0],
-    7 => [:literal,     :literal,       'ت',        10, 12, 0, 0, 0],
-    8 => [:quantifier,  :zero_or_one,   '?',        12, 13, 0, 0, 0]
+  # 2 byte wide characters
+  include_examples 'lex', 'äöü+',
+    0 => [:literal,     :literal,       'äö',       0, 2, 0, 0, 0],
+    1 => [:literal,     :literal,       'ü',        2, 3, 0, 0, 0],
+    2 => [:quantifier,  :one_or_more,   '+',        3, 4, 0, 0, 0]
   # 3 byte wide characters, Japanese
   include_examples 'lex', 'ab?れます+cd',
     0 => [:literal,     :literal,       'a',        0, 1, 0, 0, 0],
     1 => [:literal,     :literal,       'b',        1, 2, 0, 0, 0],
     2 => [:quantifier,  :zero_or_one,   '?',        2, 3, 0, 0, 0],
-    3 => [:literal,     :literal,       'れま',     3, 9, 0, 0, 0],
-    4 => [:literal,     :literal,       'す',       9, 12, 0, 0, 0],
-    5 => [:quantifier,  :one_or_more,   '+',        12, 13, 0, 0, 0],
-    6 => [:literal,     :literal,       'cd',       13, 15, 0, 0, 0]
+    3 => [:literal,     :literal,       'れま',     3, 5, 0, 0, 0],
+    4 => [:literal,     :literal,       'す',       5, 6, 0, 0, 0],
+    5 => [:quantifier,  :one_or_more,   '+',        6, 7, 0, 0, 0],
+    6 => [:literal,     :literal,       'cd',       7, 9, 0, 0, 0]
   # 4 byte wide characters, Osmanya
   include_examples 'lex', '𐒀𐒁?𐒂ab+𐒃',
-    0 => [:literal,     :literal,       '𐒀',        0, 4, 0, 0, 0],
-    1 => [:literal,     :literal,       '𐒁',        4, 8, 0, 0, 0],
-    2 => [:quantifier,  :zero_or_one,   '?',        8, 9, 0, 0, 0],
-    3 => [:literal,     :literal,       '𐒂a',       9, 14, 0, 0, 0],
-    4 => [:literal,     :literal,       'b',        14, 15, 0, 0, 0],
-    5 => [:quantifier,  :one_or_more,   '+',        15, 16, 0, 0, 0],
-    6 => [:literal,     :literal,       '𐒃',        16, 20, 0, 0, 0]
+    0 => [:literal,     :literal,       '𐒀',        0, 1, 0, 0, 0],
+    1 => [:literal,     :literal,       '𐒁',        1, 2, 0, 0, 0],
+    2 => [:quantifier,  :zero_or_one,   '?',        2, 3, 0, 0, 0],
+    3 => [:literal,     :literal,       '𐒂a',       3, 5, 0, 0, 0],
+    4 => [:literal,     :literal,       'b',        5, 6, 0, 0, 0],
+    5 => [:quantifier,  :one_or_more,   '+',        6, 7, 0, 0, 0],
+    6 => [:literal,     :literal,       '𐒃',        7, 8, 0, 0, 0]
   include_examples 'lex', 'mu𝄞?si*𝄫c+',
     0 => [:literal,     :literal,       'mu',       0, 2, 0, 0, 0],
-    1 => [:literal,     :literal,       '𝄞',        2, 6, 0, 0, 0],
-    2 => [:quantifier,  :zero_or_one,   '?',        6, 7, 0, 0, 0],
-    3 => [:literal,     :literal,       's',        7, 8, 0, 0, 0],
-    4 => [:literal,     :literal,       'i',        8, 9, 0, 0, 0],
-    5 => [:quantifier,  :zero_or_more,  '*',        9, 10, 0, 0, 0],
-    6 => [:literal,     :literal,       '𝄫',        10, 14, 0, 0, 0],
-    7 => [:literal,     :literal,       'c',        14, 15, 0, 0, 0],
-    8 => [:quantifier,  :one_or_more,   '+',        15, 16, 0, 0, 0]
+    1 => [:literal,     :literal,       '𝄞',        2, 3, 0, 0, 0],
+    2 => [:quantifier,  :zero_or_one,   '?',        3, 4, 0, 0, 0],
+    3 => [:literal,     :literal,       's',        4, 5, 0, 0, 0],
+    4 => [:literal,     :literal,       'i',        5, 6, 0, 0, 0],
+    5 => [:quantifier,  :zero_or_more,  '*',        6, 7, 0, 0, 0],
+    6 => [:literal,     :literal,       '𝄫',        7, 8, 0, 0, 0],
+    7 => [:literal,     :literal,       'c',        8, 9, 0, 0, 0],
+    8 => [:quantifier,  :one_or_more,   '+',        9, 10, 0, 0, 0]
   specify('lex single 2 byte char') do
     tokens = RL.lex("\u0627+")

data/spec/parser/errors_spec.rb CHANGED

@@ -9,7 +9,7 @@ RSpec.describe('Parsing errors') do
       .to raise_error(Regexp::Parser::UnknownTokenTypeError)
   end
-  RSpec.shared_examples 'UnknownTokenError' do |type, token|
+  RSpec.shared_examples 'UnknownTokenError' do |type|
     it "raises for unkown tokens of type #{type}" do
       expect { parser.send(:parse_token, Regexp::Token.new(type, :foo)) }
         .to raise_error(Regexp::Parser::UnknownTokenError)

data/spec/parser/escapes_spec.rb CHANGED

@@ -25,7 +25,7 @@ RSpec.describe('EscapeSequence parsing') do
   include_examples 'parse', /a\u{41 1F60D}/, 1 => [:escape, :codepoint_list,    EscapeSequence::CodepointList]
   include_examples 'parse', /a\u{10FFFF}/,   1 => [:escape, :codepoint_list,    EscapeSequence::CodepointList]
-    # hex escapes
+  # hex escapes
   include_examples 'parse', /a\xFF/n,        1 => [:escape, :hex,               EscapeSequence::Hex]
   # octal escapes

data/spec/parser/quantifiers_spec.rb CHANGED

@@ -11,6 +11,7 @@ RSpec.describe('Quantifier parsing') do
       expect(exp.quantifier.min).to eq min
       expect(exp.quantifier.max).to eq max
       expect(exp.quantifier.mode).to eq mode
+      expect(exp.quantifier.text).to eq text
     end
   end
@@ -37,6 +38,21 @@ RSpec.describe('Quantifier parsing') do
   include_examples 'quantifier', /a{4}+b/,   '{4}+',   :possessive, :interval,     4, 4
   include_examples 'quantifier', /a{004}+b/, '{004}+', :possessive, :interval,     4, 4
+  # special case: exps with chained quantifiers are wrapped in implicit passive groups
+  include_examples 'parse', /a+{2}{3}/,
+    0 => [
+      :group, :passive, Group::Passive, implicit?: true, level: 0,
+      quantifier: Quantifier.new(:interval, '{3}', 3, 3, :greedy)
+    ],
+    [0, 0] => [
+      :group, :passive, Group::Passive, implicit?: true, level: 1,
+      quantifier: Quantifier.new(:interval, '{2}', 2, 2, :greedy)
+    ],
+    [0, 0, 0] => [
+      :literal, :literal, Literal, text: 'a', level: 2,
+      quantifier: Quantifier.new(:one_or_more, '+', 1, -1, :greedy)
+    ]
   specify('mode-checking methods') do
     exp = RP.parse(/a??/).first

data/spec/parser/set/ranges_spec.rb CHANGED

@@ -17,7 +17,7 @@ RSpec.describe('CharacterSet::Range parsing') do
   end
   specify('parse set range hex') do
-    root = RP.parse('[\\x00-\\x99]')
+    root = RP.parse('[\\x00-\\x22]')
     set = root[0]
     range = set[0]
@@ -26,9 +26,9 @@ RSpec.describe('CharacterSet::Range parsing') do
     expect(range.count).to eq 2
     expect(range.first.to_s).to eq '\\x00'
     expect(range.first).to be_instance_of(EscapeSequence::Hex)
-    expect(range.last.to_s).to eq '\\x99'
+    expect(range.last.to_s).to eq '\\x22'
     expect(range.last).to be_instance_of(EscapeSequence::Hex)
-    expect(set).to match '\\x50'
+    expect(set).to match "\x11"
   end
   specify('parse set range unicode') do

data/spec/scanner/escapes_spec.rb CHANGED

@@ -11,7 +11,13 @@ RSpec.describe('Escape scanning') do
   include_examples 'scan', /c\tt/,            1 => [:escape,  :tab,              '\t',             1,  3]
   include_examples 'scan', /c\vt/,            1 => [:escape,  :vertical_tab,     '\v',             1,  3]
+  # ineffectual literal escapes
+  # these cause "Unknown escape" warnings in Ruby for ascii chars,
+  # and simply drop the backslash for non-ascii chars (/\ü/.inspect == '/ü/').
+  # In terms of matching, Ruby treats them both like non-escaped literals.
   include_examples 'scan', 'c\qt',            1 => [:escape,  :literal,          '\q',             1,  3]
+  include_examples 'scan', 'a\üc',            1 => [:escape, :literal,           '\ü',             1,  3]
+  include_examples 'scan', 'a\😋c',           1 => [:escape, :literal,            '\😋',            1,  3]
   # these incomplete ref/call sequences are treated as literal escapes by Ruby
   include_examples 'scan', 'c\gt',            1 => [:escape,  :literal,          '\g',             1,  3]
@@ -21,6 +27,7 @@ RSpec.describe('Escape scanning') do
   include_examples 'scan', 'a\0124',          1 => [:escape,  :octal,            '\012',           1,  5]
   include_examples 'scan', '\712+7',          0 => [:escape,  :octal,            '\712',           0,  4]
+  include_examples 'scan', 'a\xA',            1 => [:escape,  :hex,              '\xA',            1,  4]
   include_examples 'scan', 'a\x24c',          1 => [:escape,  :hex,              '\x24',           1,  5]
   include_examples 'scan', 'a\x0640c',        1 => [:escape,  :hex,              '\x06',           1,  5]

data/spec/scanner/groups_spec.rb CHANGED

@@ -5,11 +5,20 @@ RSpec.describe('Group scanning') do
   include_examples 'scan', '(?>abc)',         0 => [:group,     :atomic,         '(?>',        0, 3]
   include_examples 'scan', '(abc)',           0 => [:group,     :capture,        '(',          0, 1]
+  # Named groups
+  # only names that start with a hyphen or digit (ascii or other) are invalid
   include_examples 'scan', '(?<name>abc)',    0 => [:group,     :named_ab,       '(?<name>',   0, 8]
   include_examples 'scan', "(?'name'abc)",    0 => [:group,     :named_sq,       "(?'name'",   0, 8]
   include_examples 'scan', '(?<name_1>abc)',  0 => [:group,     :named_ab,       '(?<name_1>', 0,10]
   include_examples 'scan', "(?'name_1'abc)",  0 => [:group,     :named_sq,       "(?'name_1'", 0,10]
+  include_examples 'scan', '(?<name-1>abc)',  0 => [:group,     :named_ab,       '(?<name-1>', 0,10]
+  include_examples 'scan', "(?'name-1'abc)",  0 => [:group,     :named_sq,       "(?'name-1'", 0,10]
+  include_examples 'scan', "(?<name'1>abc)",  0 => [:group,     :named_ab,       "(?<name'1>", 0,10]
+  include_examples 'scan', "(?'name>1'abc)",  0 => [:group,     :named_sq,       "(?'name>1'", 0,10]
+  include_examples 'scan', '(?<üüuuüü>abc)',  0 => [:group,     :named_ab,       '(?<üüuuüü>', 0,10]
+  include_examples 'scan', "(?'üüuuüü'abc)",  0 => [:group,     :named_sq,       "(?'üüuuüü'", 0,10]
+  include_examples 'scan', "(?<😋1234😋>abc)",  0 => [:group,     :named_ab,       "(?<😋1234😋>", 0,10]
+  include_examples 'scan', "(?'😋1234😋'abc)",  0 => [:group,     :named_sq,       "(?'😋1234😋'", 0,10]
   include_examples 'scan', '(?:abc)',         0 => [:group,     :passive,        '(?:',        0, 3]
   include_examples 'scan', '(?:)',            0 => [:group,     :passive,        '(?:',        0, 3]

data/spec/scanner/literals_spec.rb CHANGED

@@ -2,48 +2,38 @@ require 'spec_helper'
 RSpec.describe('UTF8 scanning') do
   # ascii, single byte characters
-  include_examples 'scan', 'a', 0              => [:literal,     :literal,       'a',        0, 1]
+  include_examples 'scan', 'a',
+    0 => [:literal,     :literal,       'a',        0, 1]
-  include_examples 'scan', 'ab+', 0            => [:literal,     :literal,       'ab',       0, 2]
-  include_examples 'scan', 'ab+', 1            => [:quantifier,  :one_or_more,   '+',        2, 3]
+  include_examples 'scan', 'ab+',
+    0 => [:literal,     :literal,       'ab',       0, 2],
+    1 => [:quantifier,  :one_or_more,   '+',        2, 3]
-  # 2 byte wide characters, Arabic
-  include_examples 'scan', 'aاbبcت', 0         => [:literal,     :literal,       'aاbبcت',   0, 9]
-  include_examples 'scan', 'aاbبت?', 0         => [:literal,     :literal,       'aاbبت',    0, 8]
-  include_examples 'scan', 'aاbبت?', 1         => [:quantifier,  :zero_or_one,   '?',        8, 9]
-  include_examples 'scan', 'aا?bبcت+', 0       => [:literal,     :literal,       'aا',       0, 3]
-  include_examples 'scan', 'aا?bبcت+', 1       => [:quantifier,  :zero_or_one,   '?',        3, 4]
-  include_examples 'scan', 'aا?bبcت+', 2       => [:literal,     :literal,       'bبcت',     4, 10]
-  include_examples 'scan', 'aا?bبcت+', 3       => [:quantifier,  :one_or_more,   '+',        10, 11]
-  include_examples 'scan', 'a(اbب+)cت?', 0     => [:literal,     :literal,       'a',        0, 1]
-  include_examples 'scan', 'a(اbب+)cت?', 1     => [:group,       :capture,       '(',        1, 2]
-  include_examples 'scan', 'a(اbب+)cت?', 2     => [:literal,     :literal,       'اbب',      2, 7]
-  include_examples 'scan', 'a(اbب+)cت?', 3     => [:quantifier,  :one_or_more,   '+',        7, 8]
-  include_examples 'scan', 'a(اbب+)cت?', 4     => [:group,       :close,         ')',        8, 9]
-  include_examples 'scan', 'a(اbب+)cت?', 5     => [:literal,     :literal,       'cت',       9, 12]
-  include_examples 'scan', 'a(اbب+)cت?', 6     => [:quantifier,  :zero_or_one,   '?',        12, 13]
+  # 2 byte wide characters
+  include_examples 'scan', 'äöü',
+    0 => [:literal,     :literal,        'äöü',     0, 3]
   # 3 byte wide characters, Japanese
-  include_examples 'scan', 'ab?れます+cd', 0    => [:literal,     :literal,       'ab',       0, 2]
-  include_examples 'scan', 'ab?れます+cd', 1    => [:quantifier,  :zero_or_one,   '?',        2, 3]
-  include_examples 'scan', 'ab?れます+cd', 2    => [:literal,     :literal,       'れます',    3, 12]
-  include_examples 'scan', 'ab?れます+cd', 3    => [:quantifier,  :one_or_more,   '+',        12, 13]
-  include_examples 'scan', 'ab?れます+cd', 4    => [:literal,     :literal,       'cd',       13, 15]
+  include_examples 'scan', 'ab?れます+cd',
+    0 => [:literal,     :literal,       'ab',       0, 2],
+    1 => [:quantifier,  :zero_or_one,   '?',        2, 3],
+    2 => [:literal,     :literal,       'れます',    3, 6],
+    3 => [:quantifier,  :one_or_more,   '+',        6, 7],
+    4 => [:literal,     :literal,       'cd',       7, 9]
   # 4 byte wide characters, Osmanya
-  include_examples 'scan', '𐒀𐒁?𐒂ab+𐒃', 0      => [:literal,     :literal,       '𐒀𐒁',       0, 8]
-  include_examples 'scan', '𐒀𐒁?𐒂ab+𐒃', 1      => [:quantifier,  :zero_or_one,   '?',        8, 9]
-  include_examples 'scan', '𐒀𐒁?𐒂ab+𐒃', 2      => [:literal,     :literal,       '𐒂ab',      9, 15]
-  include_examples 'scan', '𐒀𐒁?𐒂ab+𐒃', 3      => [:quantifier,  :one_or_more,   '+',        15, 16]
-  include_examples 'scan', '𐒀𐒁?𐒂ab+𐒃', 4      => [:literal,     :literal,       '𐒃',        16, 20]
-  include_examples 'scan', 'mu𝄞?si*𝄫c+', 0      => [:literal,     :literal,       'mu𝄞',       0, 6]
-  include_examples 'scan', 'mu𝄞?si*𝄫c+', 1      => [:quantifier,  :zero_or_one,   '?',        6, 7]
-  include_examples 'scan', 'mu𝄞?si*𝄫c+', 2      => [:literal,     :literal,       'si',       7, 9]
-  include_examples 'scan', 'mu𝄞?si*𝄫c+', 3      => [:quantifier,  :zero_or_more,  '*',        9, 10]
-  include_examples 'scan', 'mu𝄞?si*𝄫c+', 4      => [:literal,     :literal,       '𝄫c',       10, 15]
-  include_examples 'scan', 'mu𝄞?si*𝄫c+', 5      => [:quantifier,  :one_or_more,   '+',        15, 16]
+  include_examples 'scan', '𐒀𐒁?𐒂ab+𐒃',
+    0 => [:literal,     :literal,       '𐒀𐒁',       0, 2],
+    1 => [:quantifier,  :zero_or_one,   '?',        2, 3],
+    2 => [:literal,     :literal,       '𐒂ab',      3, 6],
+    3 => [:quantifier,  :one_or_more,   '+',        6, 7],
+    4 => [:literal,     :literal,       '𐒃',        7, 8]
+  include_examples 'scan', 'mu𝄞?si*𝄫c+',
+    0 => [:literal,     :literal,       'mu𝄞',       0, 3],
+    1 => [:quantifier,  :zero_or_one,   '?',        3, 4],
+    2 => [:literal,     :literal,       'si',       4, 6],
+    3 => [:quantifier,  :zero_or_more,  '*',        6, 7],
+    4 => [:literal,     :literal,       '𝄫c',       7, 9],
+    5 => [:quantifier,  :one_or_more,   '+',        9, 10]
 end