regexp_parser 2.0.0 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +66 -0
  3. data/Gemfile +6 -1
  4. data/README.md +1 -4
  5. data/Rakefile +8 -8
  6. data/lib/regexp_parser/error.rb +4 -0
  7. data/lib/regexp_parser/expression.rb +3 -2
  8. data/lib/regexp_parser/expression/classes/backref.rb +5 -0
  9. data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
  10. data/lib/regexp_parser/expression/classes/free_space.rb +2 -2
  11. data/lib/regexp_parser/expression/classes/group.rb +12 -2
  12. data/lib/regexp_parser/expression/classes/property.rb +1 -1
  13. data/lib/regexp_parser/expression/classes/set/range.rb +2 -1
  14. data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
  15. data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
  16. data/lib/regexp_parser/expression/quantifier.rb +1 -1
  17. data/lib/regexp_parser/expression/sequence.rb +3 -9
  18. data/lib/regexp_parser/expression/subexpression.rb +1 -1
  19. data/lib/regexp_parser/parser.rb +282 -334
  20. data/lib/regexp_parser/scanner.rb +1084 -1230
  21. data/lib/regexp_parser/scanner/scanner.rl +80 -110
  22. data/lib/regexp_parser/syntax.rb +8 -6
  23. data/lib/regexp_parser/syntax/any.rb +3 -3
  24. data/lib/regexp_parser/syntax/base.rb +1 -1
  25. data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
  26. data/lib/regexp_parser/version.rb +1 -1
  27. data/spec/expression/clone_spec.rb +36 -4
  28. data/spec/expression/free_space_spec.rb +2 -2
  29. data/spec/expression/methods/match_length_spec.rb +2 -2
  30. data/spec/expression/subexpression_spec.rb +1 -1
  31. data/spec/expression/to_s_spec.rb +28 -36
  32. data/spec/lexer/refcalls_spec.rb +5 -0
  33. data/spec/parser/all_spec.rb +2 -2
  34. data/spec/parser/errors_spec.rb +1 -1
  35. data/spec/parser/quantifiers_spec.rb +1 -0
  36. data/spec/parser/refcalls_spec.rb +5 -0
  37. data/spec/scanner/escapes_spec.rb +2 -1
  38. data/spec/scanner/groups_spec.rb +10 -1
  39. data/spec/scanner/refcalls_spec.rb +19 -0
  40. data/spec/scanner/sets_spec.rb +57 -14
  41. data/spec/spec_helper.rb +1 -0
  42. metadata +4 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dcf56dd42e703e7f1f846762c418e83792d46f2c7d9efffc1fb1612b4325e076
4
- data.tar.gz: a6197d98af2325a93ed60a102f09bc98e2163dec0d2b57fbede82dbf0479ea8b
3
+ metadata.gz: 077b8a0c90d90cf46e44671ec1335a5373eef72c61a0bcf4de43ba5217a188c3
4
+ data.tar.gz: b9aed868af73adcdf40c09720c5d10091b25a53b25a792717ceb5591039a2931
5
5
  SHA512:
6
- metadata.gz: 8b9db6543c87b63c49e24666e06bc012ea9a6e330711c9fbd35961c70d1222988bf2403927fdbe2f8797176d674b83c7c3f2d1215c5f92b0d6f00c6ab7fe37af
7
- data.tar.gz: 6cf07796d7c6ab1520a63b0f3b65d2e513caccf0b81306321376473fc438c7573bcf867e74ea8465122e12a47fe2e7e09652473f83e26f1038e112c2d66b3d2c
6
+ metadata.gz: 9c04d9a6434c6e3f322e97e8e2a1c86b3ddda88bd8821368a37b92f5836e4c3df1dc27a79165303420c3e8d5eea31bda1483824da01a40ce30961b645ba65ddd
7
+ data.tar.gz: 01e5c261e9dca0c4df7c696128dbc0520ca40aa6b9393cc8d6c3bdb8386470aeb773566000b811f98c1407038216c8d2c0b444c7955ea5a881ac759796f8a440
data/CHANGELOG.md CHANGED
@@ -1,5 +1,71 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ## [2.1.1] - 2021-02-23 - [Janosch Müller](mailto:janosch84@gmail.com)
4
+
5
+ ### Fixed
6
+
7
+ - fixed `NameError` when requiring only `'regexp_parser/scanner'` in v2.1.0
8
+ * thanks to [Jared White and Sam Ruby](https://github.com/ruby2js/ruby2js) for the report
9
+
10
+ ## [2.1.0] - 2021-02-22 - [Janosch Müller](mailto:janosch84@gmail.com)
11
+
12
+ ### Added
13
+
14
+ - common ancestor for all scanning/parsing/lexing errors
15
+ * `Regexp::Parser::Error` can now be rescued as a catch-all
16
+ * the following errors (and their many descendants) now inherit from it:
17
+ - `Regexp::Expression::Conditional::TooManyBranches`
18
+ - `Regexp::Parser::ParserError`
19
+ - `Regexp::Scanner::ScannerError`
20
+ - `Regexp::Scanner::ValidationError`
21
+ - `Regexp::Syntax::SyntaxError`
22
+ * it replaces `ArgumentError` in some rare cases (`Regexp::Parser.parse('?')`)
23
+ * thanks to [sandstrom](https://github.com/sandstrom) for the cue
24
+
25
+ ### Fixed
26
+
27
+ - fixed scanning of whole-pattern recursion calls `\g<0>` and `\g'0'`
28
+ * a regression in v2.0.1 had caused them to be scanned as literals
29
+ - fixed scanning of some backreference and subexpression call edge cases
30
+ * e.g. `\k<+1>`, `\g<x-1>`
31
+ - fixed tokenization of some escapes in character sets
32
+ * `.`, `|`, `{`, `}`, `(`, `)`, `^`, `$`, `?`, `+`, `*`
33
+ * all of these correctly emitted `#type` `:literal` and `#token` `:literal` if *not* escaped
34
+ * if escaped, they emitted e.g. `#type` `:escape` and `#token` `:group_open` for `[\(]`
35
+ * the escaped versions now correctly emit `#type` `:escape` and `#token` `:literal`
36
+ - fixed handling of control/metacontrol escapes in character sets
37
+ * e.g. `[\cX]`, `[\M-\C-X]`
38
+ * they were misread as a bunch of individual literals, escapes, and ranges
39
+ - fixed some cases where calling `#dup`/`#clone` on expressions led to shared state
40
+
41
+ ## [2.0.3] - 2020-12-28 - [Janosch Müller](mailto:janosch84@gmail.com)
42
+
43
+ ### Fixed
44
+
45
+ - fixed error when scanning some unlikely and redundant but valid charset patterns
46
+ * e.g. `/[[.a-b.]]/`, `/[[=e=]]/`
47
+ - fixed ancestry of some error classes related to syntax version lookup
48
+ * `NotImplementedError`, `InvalidVersionNameError`, `UnknownSyntaxNameError`
49
+ * they now correctly inherit from `Regexp::Syntax::SyntaxError` instead of Ruby's `::SyntaxError`
50
+
51
+ ## [2.0.2] - 2020-12-25 - [Janosch Müller](mailto:janosch84@gmail.com)
52
+
53
+ ### Fixed
54
+
55
+ - fixed `FrozenError` when calling `#to_s` on a frozen `Group::Passive`
56
+ * thanks to [Daniel Gollahon](https://github.com/dgollahon)
57
+
58
+ ## [2.0.1] - 2020-12-20 - [Janosch Müller](mailto:janosch84@gmail.com)
59
+
60
+ ### Fixed
61
+
62
+ - fixed error when scanning some group names
63
+ * this affected names containing hyphens, digits or multibyte chars, e.g. `/(?<a1>a)/`
64
+ * thanks to [Daniel Gollahon](https://github.com/dgollahon) for the report
65
+ - fixed error when scanning hex escapes with just one hex digit
66
+ * e.g. `/\x0A/` was scanned correctly, but the equivalent `/\xA/` was not
67
+ * thanks to [Daniel Gollahon](https://github.com/dgollahon) for the report
68
+
3
69
  ## [2.0.0] - 2020-11-25 - [Janosch Müller](mailto:janosch84@gmail.com)
4
70
 
5
71
  ### Changed
data/Gemfile CHANGED
@@ -3,7 +3,12 @@ source 'https://rubygems.org'
3
3
  gemspec
4
4
 
5
5
  group :development, :test do
6
+ gem 'ice_nine', '~> 0.11.2'
6
7
  gem 'rake', '~> 13.0'
7
8
  gem 'regexp_property_values', '~> 1.0'
8
- gem 'rspec', '~> 3.8'
9
+ gem 'rspec', '~> 3.10'
10
+ if RUBY_VERSION.to_f >= 2.7
11
+ gem 'gouteur'
12
+ gem 'rubocop', '~> 1.7'
13
+ end
9
14
  end
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Regexp::Parser
2
2
 
3
- [![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser) [![Build Status](https://secure.travis-ci.org/ammar/regexp_parser.svg?branch=master)](http://travis-ci.org/ammar/regexp_parser) [![Code Climate](https://codeclimate.com/github/ammar/regexp_parser.svg)](https://codeclimate.com/github/ammar/regexp_parser/badges)
3
+ [![Gem Version](https://badge.fury.io/rb/regexp_parser.svg)](http://badge.fury.io/rb/regexp_parser) [![Build Status](https://github.com/ammar/regexp_parser/workflows/tests/badge.svg)](https://github.com/ammar/regexp_parser/actions) [![Build Status](https://github.com/ammar/regexp_parser/workflows/gouteur/badge.svg)](https://github.com/ammar/regexp_parser/actions) [![Code Climate](https://codeclimate.com/github/ammar/regexp_parser.svg)](https://codeclimate.com/github/ammar/regexp_parser/badges)
4
4
 
5
5
  A Ruby gem for tokenizing, parsing, and transforming regular expressions.
6
6
 
@@ -22,9 +22,6 @@ _For examples of regexp_parser in use, see [Example Projects](#example-projects)
22
22
  * Ragel >= 6.0, but only if you want to build the gem or work on the scanner.
23
23
 
24
24
 
25
- _Note: See the .travis.yml file for covered versions._
26
-
27
-
28
25
  ---
29
26
  ## Install
30
27
 
data/Rakefile CHANGED
@@ -7,8 +7,8 @@ require 'bundler'
7
7
  require 'rubygems/package_task'
8
8
 
9
9
 
10
- RAGEL_SOURCE_DIR = File.expand_path '../lib/regexp_parser/scanner', __FILE__
11
- RAGEL_OUTPUT_DIR = File.expand_path '../lib/regexp_parser', __FILE__
10
+ RAGEL_SOURCE_DIR = File.join(__dir__, 'lib/regexp_parser/scanner')
11
+ RAGEL_OUTPUT_DIR = File.join(__dir__, 'lib/regexp_parser')
12
12
  RAGEL_SOURCE_FILES = %w{scanner} # scanner.rl includes property.rl
13
13
 
14
14
 
@@ -25,11 +25,11 @@ end
25
25
 
26
26
  namespace :ragel do
27
27
  desc "Process the ragel source files and output ruby code"
28
- task :rb do |t|
29
- RAGEL_SOURCE_FILES.each do |file|
30
- output_file = "#{RAGEL_OUTPUT_DIR}/#{file}.rb"
28
+ task :rb do
29
+ RAGEL_SOURCE_FILES.each do |source_file|
30
+ output_file = "#{RAGEL_OUTPUT_DIR}/#{source_file}.rb"
31
31
  # using faster flat table driven FSM, about 25% larger code, but about 30% faster
32
- sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{file}.rl -o #{output_file}"
32
+ sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{source_file}.rl -o #{output_file}"
33
33
 
34
34
  contents = File.read(output_file)
35
35
 
@@ -42,7 +42,7 @@ namespace :ragel do
42
42
  end
43
43
 
44
44
  desc "Delete the ragel generated source file(s)"
45
- task :clean do |t|
45
+ task :clean do
46
46
  RAGEL_SOURCE_FILES.each do |file|
47
47
  sh "rm -f #{RAGEL_OUTPUT_DIR}/#{file}.rb"
48
48
  end
@@ -61,7 +61,7 @@ namespace :props do
61
61
  task :update do
62
62
  require 'regexp_property_values'
63
63
  RegexpPropertyValues.update
64
- dir = File.expand_path('../lib/regexp_parser/scanner/properties', __FILE__)
64
+ dir = File.join(__dir__, 'lib/regexp_parser/scanner/properties')
65
65
 
66
66
  require 'psych'
67
67
  write_hash_to_file = ->(hash, path) do
@@ -0,0 +1,4 @@
1
+ class Regexp::Parser
2
+ # base class for all gem-specific errors (mostly raised via subclasses, but also raised directly in a few places)
3
+ class Error < StandardError; end
4
+ end
@@ -1,5 +1,6 @@
1
- module Regexp::Expression
1
+ require 'regexp_parser/error'
2
2
 
3
+ module Regexp::Expression
3
4
  class Base
4
5
  attr_accessor :type, :token
5
6
  attr_accessor :text, :ts
@@ -21,7 +22,7 @@ module Regexp::Expression
21
22
  self.options = options
22
23
  end
23
24
 
24
- def initialize_clone(orig)
25
+ def initialize_copy(orig)
25
26
  self.text = (orig.text ? orig.text.dup : nil)
26
27
  self.options = (orig.options ? orig.options.dup : nil)
27
28
  self.quantifier = (orig.quantifier ? orig.quantifier.clone : nil)
@@ -2,6 +2,11 @@ module Regexp::Expression
2
2
  module Backreference
3
3
  class Base < Regexp::Expression::Base
4
4
  attr_accessor :referenced_expression
5
+
6
+ def initialize_copy(orig)
7
+ self.referenced_expression = orig.referenced_expression.dup
8
+ super
9
+ end
5
10
  end
6
11
 
7
12
  class Number < Backreference::Base
@@ -1,6 +1,6 @@
1
1
  module Regexp::Expression
2
2
  module Conditional
3
- class TooManyBranches < StandardError
3
+ class TooManyBranches < Regexp::Parser::Error
4
4
  def initialize
5
5
  super('The conditional expression has more than 2 branches')
6
6
  end
@@ -15,6 +15,11 @@ module Regexp::Expression
15
15
  ref = text.tr("'<>()", "")
16
16
  ref =~ /\D/ ? ref : Integer(ref)
17
17
  end
18
+
19
+ def initialize_copy(orig)
20
+ self.referenced_expression = orig.referenced_expression.dup
21
+ super
22
+ end
18
23
  end
19
24
 
20
25
  class Branch < Regexp::Expression::Sequence; end
@@ -53,6 +58,11 @@ module Regexp::Expression
53
58
  def to_s(format = :full)
54
59
  "#{text}#{condition}#{branches.join('|')})#{quantifier_affix(format)}"
55
60
  end
61
+
62
+ def initialize_copy(orig)
63
+ self.referenced_expression = orig.referenced_expression.dup
64
+ super
65
+ end
56
66
  end
57
67
  end
58
68
  end
@@ -1,8 +1,8 @@
1
1
  module Regexp::Expression
2
2
 
3
3
  class FreeSpace < Regexp::Expression::Base
4
- def quantify(token, text, min = nil, max = nil, mode = :greedy)
5
- raise "Can not quantify a free space object"
4
+ def quantify(_token, _text, _min = nil, _max = nil, _mode = :greedy)
5
+ raise Regexp::Parser::Error, 'Can not quantify a free space object'
6
6
  end
7
7
  end
8
8
 
@@ -13,6 +13,11 @@ module Regexp::Expression
13
13
  class Passive < Group::Base
14
14
  attr_writer :implicit
15
15
 
16
+ def initialize(*)
17
+ @implicit = false
18
+ super
19
+ end
20
+
16
21
  def to_s(format = :full)
17
22
  if implicit?
18
23
  "#{expressions.join}#{quantifier_affix(format)}"
@@ -22,7 +27,7 @@ module Regexp::Expression
22
27
  end
23
28
 
24
29
  def implicit?
25
- @implicit ||= false
30
+ @implicit
26
31
  end
27
32
  end
28
33
 
@@ -30,6 +35,11 @@ module Regexp::Expression
30
35
  class Atomic < Group::Base; end
31
36
  class Options < Group::Base
32
37
  attr_accessor :option_changes
38
+
39
+ def initialize_copy(orig)
40
+ self.option_changes = orig.option_changes.dup
41
+ super
42
+ end
33
43
  end
34
44
 
35
45
  class Capture < Group::Base
@@ -48,7 +58,7 @@ module Regexp::Expression
48
58
  super
49
59
  end
50
60
 
51
- def initialize_clone(orig)
61
+ def initialize_copy(orig)
52
62
  @name = orig.name.dup
53
63
  super
54
64
  end
@@ -7,7 +7,7 @@ module Regexp::Expression
7
7
  end
8
8
 
9
9
  def name
10
- text =~ /\A\\[pP]\{([^}]+)\}\z/; $1
10
+ text[/\A\\[pP]\{([^}]+)\}\z/, 1]
11
11
  end
12
12
 
13
13
  def shortcut
@@ -7,7 +7,8 @@ module Regexp::Expression
7
7
  alias :ts :starts_at
8
8
 
9
9
  def <<(exp)
10
- complete? && raise("Can't add more than 2 expressions to a Range")
10
+ complete? and raise Regexp::Parser::Error,
11
+ "Can't add more than 2 expressions to a Range"
11
12
  super
12
13
  end
13
14
 
@@ -10,7 +10,7 @@ class Regexp::MatchLength
10
10
  self.exp_class = exp.class
11
11
  self.min_rep = exp.repetitions.min
12
12
  self.max_rep = exp.repetitions.max
13
- if base = opts[:base]
13
+ if (base = opts[:base])
14
14
  self.base_min = base
15
15
  self.base_max = base
16
16
  self.reify = ->{ '.' * base }
@@ -32,7 +32,7 @@ class Regexp::MatchLength
32
32
  end
33
33
  end
34
34
 
35
- def endless_each(&block)
35
+ def endless_each
36
36
  return enum_for(__method__) unless block_given?
37
37
  (min..max).each { |num| yield(num) if include?(num) }
38
38
  end
@@ -36,7 +36,7 @@ module Regexp::Expression
36
36
 
37
37
  # Iterates over the expressions of this expression as an array, passing
38
38
  # the expression and its index within its parent to the given block.
39
- def each_expression(include_self = false, &block)
39
+ def each_expression(include_self = false)
40
40
  return enum_for(__method__, include_self) unless block_given?
41
41
 
42
42
  traverse(include_self) do |event, exp, index|
@@ -47,7 +47,7 @@ module Regexp::Expression
47
47
  # Returns a new array with the results of calling the given block once
48
48
  # for every expression. If a block is not given, returns an array with
49
49
  # each expression and its level index as an array.
50
- def flat_map(include_self = false, &block)
50
+ def flat_map(include_self = false)
51
51
  result = []
52
52
 
53
53
  each_expression(include_self) do |exp, index|
@@ -12,7 +12,7 @@ module Regexp::Expression
12
12
  @max = max
13
13
  end
14
14
 
15
- def initialize_clone(orig)
15
+ def initialize_copy(orig)
16
16
  @text = orig.text.dup
17
17
  super
18
18
  end
@@ -41,17 +41,11 @@ module Regexp::Expression
41
41
  alias :ts :starts_at
42
42
 
43
43
  def quantify(token, text, min = nil, max = nil, mode = :greedy)
44
- offset = -1
45
- target = expressions[offset]
46
- while target.is_a?(FreeSpace)
47
- target = expressions[offset -= 1]
48
- end
49
-
50
- target || raise(ArgumentError, "No valid target found for '#{text}' "\
51
- 'quantifier')
44
+ target = expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
45
+ target or raise Regexp::Parser::Error,
46
+ "No valid target found for '#{text}' quantifier"
52
47
 
53
48
  target.quantify(token, text, min, max, mode)
54
49
  end
55
50
  end
56
-
57
51
  end
@@ -12,7 +12,7 @@ module Regexp::Expression
12
12
  end
13
13
 
14
14
  # Override base method to clone the expressions as well.
15
- def initialize_clone(orig)
15
+ def initialize_copy(orig)
16
16
  self.expressions = orig.expressions.map(&:clone)
17
17
  super
18
18
  end
@@ -1,10 +1,10 @@
1
+ require 'regexp_parser/error'
1
2
  require 'regexp_parser/expression'
2
3
 
3
4
  class Regexp::Parser
4
5
  include Regexp::Expression
5
- include Regexp::Syntax
6
6
 
7
- class ParserError < StandardError; end
7
+ class ParserError < Regexp::Parser::Error; end
8
8
 
9
9
  class UnknownTokenTypeError < ParserError
10
10
  def initialize(type, token)
@@ -70,95 +70,155 @@ class Regexp::Parser
70
70
  enabled_options
71
71
  end
72
72
 
73
- def nest(exp)
74
- nesting.push(exp)
75
- node << exp
76
- update_transplanted_subtree(exp, node)
77
- self.node = exp
78
- end
73
+ def parse_token(token)
74
+ case token.type
75
+ when :anchor; anchor(token)
76
+ when :assertion, :group; group(token)
77
+ when :backref; backref(token)
78
+ when :conditional; conditional(token)
79
+ when :escape; escape(token)
80
+ when :free_space; free_space(token)
81
+ when :keep; keep(token)
82
+ when :literal; literal(token)
83
+ when :meta; meta(token)
84
+ when :posixclass, :nonposixclass; posixclass(token)
85
+ when :property, :nonproperty; property(token)
86
+ when :quantifier; quantifier(token)
87
+ when :set; set(token)
88
+ when :type; type(token)
89
+ else
90
+ raise UnknownTokenTypeError.new(token.type, token)
91
+ end
79
92
 
80
- # subtrees are transplanted to build Alternations, Intersections, Ranges
81
- def update_transplanted_subtree(exp, new_parent)
82
- exp.nesting_level = new_parent.nesting_level + 1
83
- exp.respond_to?(:each) &&
84
- exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
93
+ close_completed_character_set_range
85
94
  end
86
95
 
87
- def decrease_nesting
88
- while nesting.last.is_a?(SequenceOperation)
89
- nesting.pop
90
- self.node = nesting.last
96
+ def anchor(token)
97
+ case token.token
98
+ when :bol; node << Anchor::BeginningOfLine.new(token, active_opts)
99
+ when :bos; node << Anchor::BOS.new(token, active_opts)
100
+ when :eol; node << Anchor::EndOfLine.new(token, active_opts)
101
+ when :eos; node << Anchor::EOS.new(token, active_opts)
102
+ when :eos_ob_eol; node << Anchor::EOSobEOL.new(token, active_opts)
103
+ when :match_start; node << Anchor::MatchStart.new(token, active_opts)
104
+ when :nonword_boundary; node << Anchor::NonWordBoundary.new(token, active_opts)
105
+ when :word_boundary; node << Anchor::WordBoundary.new(token, active_opts)
106
+ else
107
+ raise UnknownTokenError.new('Anchor', token)
91
108
  end
92
- nesting.pop
93
- yield(node) if block_given?
94
- self.node = nesting.last
95
- self.node = node.last if node.last.is_a?(SequenceOperation)
96
109
  end
97
110
 
98
- def nest_conditional(exp)
99
- conditional_nesting.push(exp)
100
- nest(exp)
111
+ def group(token)
112
+ case token.token
113
+ when :options, :options_switch
114
+ options_group(token)
115
+ when :close
116
+ close_group
117
+ when :comment
118
+ node << Group::Comment.new(token, active_opts)
119
+ else
120
+ open_group(token)
121
+ end
101
122
  end
102
123
 
103
- def parse_token(token)
104
- close_completed_character_set_range
124
+ MOD_FLAGS = %w[i m x].map(&:to_sym)
125
+ ENC_FLAGS = %w[a d u].map(&:to_sym)
105
126
 
106
- case token.type
107
- when :meta; meta(token)
108
- when :quantifier; quantifier(token)
109
- when :anchor; anchor(token)
110
- when :escape; escape(token)
111
- when :group; group(token)
112
- when :assertion; group(token)
113
- when :set; set(token)
114
- when :type; type(token)
115
- when :backref; backref(token)
116
- when :conditional; conditional(token)
117
- when :keep; keep(token)
118
-
119
- when :posixclass, :nonposixclass
120
- posixclass(token)
121
- when :property, :nonproperty
122
- property(token)
123
-
124
- when :literal
125
- node << Literal.new(token, active_opts)
126
- when :free_space
127
- free_space(token)
127
+ def options_group(token)
128
+ positive, negative = token.text.split('-', 2)
129
+ negative ||= ''
130
+ self.switching_options = token.token.equal?(:options_switch)
128
131
 
129
- else
130
- raise UnknownTokenTypeError.new(token.type, token)
132
+ opt_changes = {}
133
+ new_active_opts = active_opts.dup
134
+
135
+ MOD_FLAGS.each do |flag|
136
+ if positive.include?(flag.to_s)
137
+ opt_changes[flag] = new_active_opts[flag] = true
138
+ end
139
+ if negative.include?(flag.to_s)
140
+ opt_changes[flag] = false
141
+ new_active_opts.delete(flag)
142
+ end
143
+ end
144
+
145
+ if (enc_flag = positive.reverse[/[adu]/])
146
+ enc_flag = enc_flag.to_sym
147
+ (ENC_FLAGS - [enc_flag]).each do |other|
148
+ opt_changes[other] = false if new_active_opts[other]
149
+ new_active_opts.delete(other)
150
+ end
151
+ opt_changes[enc_flag] = new_active_opts[enc_flag] = true
131
152
  end
153
+
154
+ options_stack << new_active_opts
155
+
156
+ options_group = Group::Options.new(token, active_opts)
157
+ options_group.option_changes = opt_changes
158
+
159
+ nest(options_group)
132
160
  end
133
161
 
134
- def set(token)
135
- case token.token
136
- when :open
137
- open_set(token)
138
- when :close
139
- close_set
140
- when :negate
141
- negate_set
142
- when :range
143
- range(token)
144
- when :intersection
145
- intersection(token)
146
- when :collation, :equivalent
147
- node << Literal.new(token, active_opts)
148
- else
149
- raise UnknownTokenError.new('CharacterSet', token)
162
+ def open_group(token)
163
+ group_class =
164
+ case token.token
165
+ when :absence; Group::Absence
166
+ when :atomic; Group::Atomic
167
+ when :capture; Group::Capture
168
+ when :named; Group::Named
169
+ when :passive; Group::Passive
170
+
171
+ when :lookahead; Assertion::Lookahead
172
+ when :lookbehind; Assertion::Lookbehind
173
+ when :nlookahead; Assertion::NegativeLookahead
174
+ when :nlookbehind; Assertion::NegativeLookbehind
175
+
176
+ else
177
+ raise UnknownTokenError.new('Group type open', token)
178
+ end
179
+
180
+ group = group_class.new(token, active_opts)
181
+
182
+ if group.capturing?
183
+ group.number = total_captured_group_count + 1
184
+ group.number_at_level = captured_group_count_at_level + 1
185
+ count_captured_group
150
186
  end
187
+
188
+ # Push the active options to the stack again. This way we can simply pop the
189
+ # stack for any group we close, no matter if it had its own options or not.
190
+ options_stack << active_opts
191
+
192
+ nest(group)
151
193
  end
152
194
 
153
- def meta(token)
154
- case token.token
155
- when :dot
156
- node << CharacterType::Any.new(token, active_opts)
157
- when :alternation
158
- sequence_operation(Alternation, token)
159
- else
160
- raise UnknownTokenError.new('Meta', token)
195
+ def total_captured_group_count
196
+ captured_group_counts.values.reduce(0, :+)
197
+ end
198
+
199
+ def captured_group_count_at_level
200
+ captured_group_counts[node.level]
201
+ end
202
+
203
+ def count_captured_group
204
+ captured_group_counts[node.level] += 1
205
+ end
206
+
207
+ def close_group
208
+ options_stack.pop unless switching_options
209
+ self.switching_options = false
210
+ decrease_nesting
211
+ end
212
+
213
+ def decrease_nesting
214
+ while nesting.last.is_a?(SequenceOperation)
215
+ nesting.pop
216
+ self.node = nesting.last
161
217
  end
218
+ nesting.pop
219
+ yield(node) if block_given?
220
+ self.node = nesting.last
221
+ self.node = node.last if node.last.is_a?(SequenceOperation)
162
222
  end
163
223
 
164
224
  def backref(token)
@@ -188,31 +248,9 @@ class Regexp::Parser
188
248
  end
189
249
  end
190
250
 
191
- def type(token)
192
- case token.token
193
- when :digit
194
- node << CharacterType::Digit.new(token, active_opts)
195
- when :nondigit
196
- node << CharacterType::NonDigit.new(token, active_opts)
197
- when :hex
198
- node << CharacterType::Hex.new(token, active_opts)
199
- when :nonhex
200
- node << CharacterType::NonHex.new(token, active_opts)
201
- when :space
202
- node << CharacterType::Space.new(token, active_opts)
203
- when :nonspace
204
- node << CharacterType::NonSpace.new(token, active_opts)
205
- when :word
206
- node << CharacterType::Word.new(token, active_opts)
207
- when :nonword
208
- node << CharacterType::NonWord.new(token, active_opts)
209
- when :linebreak
210
- node << CharacterType::Linebreak.new(token, active_opts)
211
- when :xgrapheme
212
- node << CharacterType::ExtendedGrapheme.new(token, active_opts)
213
- else
214
- raise UnknownTokenError.new('CharacterType', token)
215
- end
251
+ def assign_effective_number(exp)
252
+ exp.effective_number =
253
+ exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
216
254
  end
217
255
 
218
256
  def conditional(token)
@@ -240,11 +278,118 @@ class Regexp::Parser
240
278
  end
241
279
  end
242
280
 
281
+ def nest_conditional(exp)
282
+ conditional_nesting.push(exp)
283
+ nest(exp)
284
+ end
285
+
286
+ def nest(exp)
287
+ nesting.push(exp)
288
+ node << exp
289
+ update_transplanted_subtree(exp, node)
290
+ self.node = exp
291
+ end
292
+
293
+ # subtrees are transplanted to build Alternations, Intersections, Ranges
294
+ def update_transplanted_subtree(exp, new_parent)
295
+ exp.nesting_level = new_parent.nesting_level + 1
296
+ exp.respond_to?(:each) &&
297
+ exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
298
+ end
299
+
300
+ def escape(token)
301
+ case token.token
302
+
303
+ when :backspace; node << EscapeSequence::Backspace.new(token, active_opts)
304
+
305
+ when :escape; node << EscapeSequence::AsciiEscape.new(token, active_opts)
306
+ when :bell; node << EscapeSequence::Bell.new(token, active_opts)
307
+ when :form_feed; node << EscapeSequence::FormFeed.new(token, active_opts)
308
+ when :newline; node << EscapeSequence::Newline.new(token, active_opts)
309
+ when :carriage; node << EscapeSequence::Return.new(token, active_opts)
310
+ when :tab; node << EscapeSequence::Tab.new(token, active_opts)
311
+ when :vertical_tab; node << EscapeSequence::VerticalTab.new(token, active_opts)
312
+
313
+ when :codepoint; node << EscapeSequence::Codepoint.new(token, active_opts)
314
+ when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
315
+ when :hex; node << EscapeSequence::Hex.new(token, active_opts)
316
+ when :octal; node << EscapeSequence::Octal.new(token, active_opts)
317
+
318
+ when :control
319
+ if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
320
+ node << EscapeSequence::MetaControl.new(token, active_opts)
321
+ else
322
+ node << EscapeSequence::Control.new(token, active_opts)
323
+ end
324
+
325
+ when :meta_sequence
326
+ if token.text =~ /\A\\M-\\[Cc]/
327
+ node << EscapeSequence::MetaControl.new(token, active_opts)
328
+ else
329
+ node << EscapeSequence::Meta.new(token, active_opts)
330
+ end
331
+
332
+ else
333
+ # treating everything else as a literal
334
+ # TODO: maybe split this up a bit more in v3.0.0?
335
+ # E.g. escaped quantifiers or set meta chars are not the same
336
+ # as stuff that would be a literal even without the backslash.
337
+ # Right now, they all end up here.
338
+ node << EscapeSequence::Literal.new(token, active_opts)
339
+ end
340
+ end
341
+
342
+ def free_space(token)
343
+ case token.token
344
+ when :comment
345
+ node << Comment.new(token, active_opts)
346
+ when :whitespace
347
+ if node.last.is_a?(WhiteSpace)
348
+ node.last.merge(WhiteSpace.new(token, active_opts))
349
+ else
350
+ node << WhiteSpace.new(token, active_opts)
351
+ end
352
+ else
353
+ raise UnknownTokenError.new('FreeSpace', token)
354
+ end
355
+ end
356
+
357
+ def keep(token)
358
+ node << Keep::Mark.new(token, active_opts)
359
+ end
360
+
361
+ def literal(token)
362
+ node << Literal.new(token, active_opts)
363
+ end
364
+
365
+ def meta(token)
366
+ case token.token
367
+ when :dot
368
+ node << CharacterType::Any.new(token, active_opts)
369
+ when :alternation
370
+ sequence_operation(Alternation, token)
371
+ else
372
+ raise UnknownTokenError.new('Meta', token)
373
+ end
374
+ end
375
+
376
+ def sequence_operation(klass, token)
377
+ unless node.is_a?(klass)
378
+ operator = klass.new(token, active_opts)
379
+ sequence = operator.add_sequence(active_opts)
380
+ sequence.expressions = node.expressions
381
+ node.expressions = []
382
+ nest(operator)
383
+ end
384
+ node.add_sequence(active_opts)
385
+ end
386
+
243
387
  def posixclass(token)
244
388
  node << PosixClass.new(token, active_opts)
245
389
  end
246
390
 
247
391
  include Regexp::Expression::UnicodeProperty
392
+ UPTokens = Regexp::Syntax::Token::UnicodeProperty
248
393
 
249
394
  def property(token)
250
395
  case token.token
@@ -316,127 +461,20 @@ class Regexp::Parser
316
461
  when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
317
462
  when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
318
463
 
319
- when *Token::UnicodeProperty::Age
320
- node << Age.new(token, active_opts)
321
-
322
- when *Token::UnicodeProperty::Derived
323
- node << Derived.new(token, active_opts)
324
-
325
- when *Token::UnicodeProperty::Emoji
326
- node << Emoji.new(token, active_opts)
327
-
328
- when *Token::UnicodeProperty::Script
329
- node << Script.new(token, active_opts)
330
-
331
- when *Token::UnicodeProperty::UnicodeBlock
332
- node << Block.new(token, active_opts)
464
+ when *UPTokens::Age; node << Age.new(token, active_opts)
465
+ when *UPTokens::Derived; node << Derived.new(token, active_opts)
466
+ when *UPTokens::Emoji; node << Emoji.new(token, active_opts)
467
+ when *UPTokens::Script; node << Script.new(token, active_opts)
468
+ when *UPTokens::UnicodeBlock; node << Block.new(token, active_opts)
333
469
 
334
470
  else
335
471
  raise UnknownTokenError.new('UnicodeProperty', token)
336
472
  end
337
473
  end
338
474
 
339
- def anchor(token)
340
- case token.token
341
- when :bol
342
- node << Anchor::BeginningOfLine.new(token, active_opts)
343
- when :eol
344
- node << Anchor::EndOfLine.new(token, active_opts)
345
- when :bos
346
- node << Anchor::BOS.new(token, active_opts)
347
- when :eos
348
- node << Anchor::EOS.new(token, active_opts)
349
- when :eos_ob_eol
350
- node << Anchor::EOSobEOL.new(token, active_opts)
351
- when :word_boundary
352
- node << Anchor::WordBoundary.new(token, active_opts)
353
- when :nonword_boundary
354
- node << Anchor::NonWordBoundary.new(token, active_opts)
355
- when :match_start
356
- node << Anchor::MatchStart.new(token, active_opts)
357
- else
358
- raise UnknownTokenError.new('Anchor', token)
359
- end
360
- end
361
-
362
- def escape(token)
363
- case token.token
364
-
365
- when :backspace
366
- node << EscapeSequence::Backspace.new(token, active_opts)
367
-
368
- when :escape
369
- node << EscapeSequence::AsciiEscape.new(token, active_opts)
370
- when :bell
371
- node << EscapeSequence::Bell.new(token, active_opts)
372
- when :form_feed
373
- node << EscapeSequence::FormFeed.new(token, active_opts)
374
- when :newline
375
- node << EscapeSequence::Newline.new(token, active_opts)
376
- when :carriage
377
- node << EscapeSequence::Return.new(token, active_opts)
378
- when :tab
379
- node << EscapeSequence::Tab.new(token, active_opts)
380
- when :vertical_tab
381
- node << EscapeSequence::VerticalTab.new(token, active_opts)
382
-
383
- when :hex
384
- node << EscapeSequence::Hex.new(token, active_opts)
385
- when :octal
386
- node << EscapeSequence::Octal.new(token, active_opts)
387
- when :codepoint
388
- node << EscapeSequence::Codepoint.new(token, active_opts)
389
- when :codepoint_list
390
- node << EscapeSequence::CodepointList.new(token, active_opts)
391
-
392
- when :control
393
- if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
394
- node << EscapeSequence::MetaControl.new(token, active_opts)
395
- else
396
- node << EscapeSequence::Control.new(token, active_opts)
397
- end
398
-
399
- when :meta_sequence
400
- if token.text =~ /\A\\M-\\[Cc]/
401
- node << EscapeSequence::MetaControl.new(token, active_opts)
402
- else
403
- node << EscapeSequence::Meta.new(token, active_opts)
404
- end
405
-
406
- else
407
- # treating everything else as a literal
408
- node << EscapeSequence::Literal.new(token, active_opts)
409
- end
410
- end
411
-
412
- def keep(token)
413
- node << Keep::Mark.new(token, active_opts)
414
- end
415
-
416
- def free_space(token)
417
- case token.token
418
- when :comment
419
- node << Comment.new(token, active_opts)
420
- when :whitespace
421
- if node.last.is_a?(WhiteSpace)
422
- node.last.merge(WhiteSpace.new(token, active_opts))
423
- else
424
- node << WhiteSpace.new(token, active_opts)
425
- end
426
- else
427
- raise UnknownTokenError.new('FreeSpace', token)
428
- end
429
- end
430
-
431
475
  def quantifier(token)
432
- offset = -1
433
- target_node = node.expressions[offset]
434
- while target_node.is_a?(FreeSpace)
435
- target_node = node.expressions[offset -= 1]
436
- end
437
-
438
- target_node || raise(ArgumentError, 'No valid target found for '\
439
- "'#{token.text}' ")
476
+ target_node = node.expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
477
+ target_node or raise ParserError, "No valid target found for '#{token.text}'"
440
478
 
441
479
  # in case of chained quantifiers, wrap target in an implicit passive group
442
480
  # description of the problem: https://github.com/ammar/regexp_parser/issues/3
@@ -456,7 +494,7 @@ class Regexp::Parser
456
494
  new_group.implicit = true
457
495
  new_group << target_node
458
496
  increase_level(target_node)
459
- node.expressions[offset] = new_group
497
+ node.expressions[node.expressions.index(target_node)] = new_group
460
498
  target_node = new_group
461
499
  end
462
500
 
@@ -517,100 +555,16 @@ class Regexp::Parser
517
555
  target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
518
556
  end
519
557
 
520
- def group(token)
521
- case token.token
522
- when :options, :options_switch
523
- options_group(token)
524
- when :close
525
- close_group
526
- when :comment
527
- node << Group::Comment.new(token, active_opts)
528
- else
529
- open_group(token)
530
- end
531
- end
532
-
533
- MOD_FLAGS = %w[i m x].map(&:to_sym)
534
- ENC_FLAGS = %w[a d u].map(&:to_sym)
535
-
536
- def options_group(token)
537
- positive, negative = token.text.split('-', 2)
538
- negative ||= ''
539
- self.switching_options = token.token.equal?(:options_switch)
540
-
541
- opt_changes = {}
542
- new_active_opts = active_opts.dup
543
-
544
- MOD_FLAGS.each do |flag|
545
- if positive.include?(flag.to_s)
546
- opt_changes[flag] = new_active_opts[flag] = true
547
- end
548
- if negative.include?(flag.to_s)
549
- opt_changes[flag] = false
550
- new_active_opts.delete(flag)
551
- end
552
- end
553
-
554
- if (enc_flag = positive.reverse[/[adu]/])
555
- enc_flag = enc_flag.to_sym
556
- (ENC_FLAGS - [enc_flag]).each do |other|
557
- opt_changes[other] = false if new_active_opts[other]
558
- new_active_opts.delete(other)
559
- end
560
- opt_changes[enc_flag] = new_active_opts[enc_flag] = true
561
- end
562
-
563
- options_stack << new_active_opts
564
-
565
- options_group = Group::Options.new(token, active_opts)
566
- options_group.option_changes = opt_changes
567
-
568
- nest(options_group)
569
- end
570
-
571
- def open_group(token)
558
+ def set(token)
572
559
  case token.token
573
- when :passive
574
- exp = Group::Passive.new(token, active_opts)
575
- when :atomic
576
- exp = Group::Atomic.new(token, active_opts)
577
- when :named
578
- exp = Group::Named.new(token, active_opts)
579
- when :capture
580
- exp = Group::Capture.new(token, active_opts)
581
- when :absence
582
- exp = Group::Absence.new(token, active_opts)
583
-
584
- when :lookahead
585
- exp = Assertion::Lookahead.new(token, active_opts)
586
- when :nlookahead
587
- exp = Assertion::NegativeLookahead.new(token, active_opts)
588
- when :lookbehind
589
- exp = Assertion::Lookbehind.new(token, active_opts)
590
- when :nlookbehind
591
- exp = Assertion::NegativeLookbehind.new(token, active_opts)
592
-
560
+ when :open; open_set(token)
561
+ when :close; close_set
562
+ when :negate; negate_set
563
+ when :range; range(token)
564
+ when :intersection; intersection(token)
593
565
  else
594
- raise UnknownTokenError.new('Group type open', token)
595
- end
596
-
597
- if exp.capturing?
598
- exp.number = total_captured_group_count + 1
599
- exp.number_at_level = captured_group_count_at_level + 1
600
- count_captured_group
566
+ raise UnknownTokenError.new('CharacterSet', token)
601
567
  end
602
-
603
- # Push the active options to the stack again. This way we can simply pop the
604
- # stack for any group we close, no matter if it had its own options or not.
605
- options_stack << active_opts
606
-
607
- nest(exp)
608
- end
609
-
610
- def close_group
611
- options_stack.pop unless switching_options
612
- self.switching_options = false
613
- decrease_nesting
614
568
  end
615
569
 
616
570
  def open_set(token)
@@ -633,51 +587,45 @@ class Regexp::Parser
633
587
  nest(exp)
634
588
  end
635
589
 
636
- def close_completed_character_set_range
637
- decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
638
- end
639
-
640
590
  def intersection(token)
641
591
  sequence_operation(CharacterSet::Intersection, token)
642
592
  end
643
593
 
644
- def sequence_operation(klass, token)
645
- unless node.is_a?(klass)
646
- operator = klass.new(token, active_opts)
647
- sequence = operator.add_sequence(active_opts)
648
- sequence.expressions = node.expressions
649
- node.expressions = []
650
- nest(operator)
594
+ def type(token)
595
+ case token.token
596
+ when :digit; node << CharacterType::Digit.new(token, active_opts)
597
+ when :hex; node << CharacterType::Hex.new(token, active_opts)
598
+ when :linebreak; node << CharacterType::Linebreak.new(token, active_opts)
599
+ when :nondigit; node << CharacterType::NonDigit.new(token, active_opts)
600
+ when :nonhex; node << CharacterType::NonHex.new(token, active_opts)
601
+ when :nonspace; node << CharacterType::NonSpace.new(token, active_opts)
602
+ when :nonword; node << CharacterType::NonWord.new(token, active_opts)
603
+ when :space; node << CharacterType::Space.new(token, active_opts)
604
+ when :word; node << CharacterType::Word.new(token, active_opts)
605
+ when :xgrapheme; node << CharacterType::ExtendedGrapheme.new(token, active_opts)
606
+ else
607
+ raise UnknownTokenError.new('CharacterType', token)
651
608
  end
652
- node.add_sequence(active_opts)
653
- end
654
-
655
- def active_opts
656
- options_stack.last
657
- end
658
-
659
- def total_captured_group_count
660
- captured_group_counts.values.reduce(0, :+)
661
- end
662
-
663
- def captured_group_count_at_level
664
- captured_group_counts[node.level]
665
609
  end
666
610
 
667
- def count_captured_group
668
- captured_group_counts[node.level] += 1
611
+ def close_completed_character_set_range
612
+ decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
669
613
  end
670
614
 
671
- def assign_effective_number(exp)
672
- exp.effective_number =
673
- exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
615
+ def active_opts
616
+ options_stack.last
674
617
  end
675
618
 
619
+ # Assigns referenced expressions to referring expressions, e.g. if there is
620
+ # an instance of Backreference::Number, its #referenced_expression is set to
621
+ # the instance of Group::Capture that it refers to via its number.
676
622
  def assign_referenced_expressions
677
623
  targets = {}
624
+ # find all referenceable expressions
678
625
  root.each_expression do |exp|
679
626
  exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
680
627
  end
628
+ # assign them to any referring expressions
681
629
  root.each_expression do |exp|
682
630
  exp.respond_to?(:reference) &&
683
631
  exp.referenced_expression = targets[exp.reference]