regexp_parser 2.0.3 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +41 -3
- data/Gemfile +5 -1
- data/README.md +1 -1
- data/Rakefile +6 -6
- data/lib/regexp_parser/error.rb +4 -0
- data/lib/regexp_parser/expression.rb +3 -2
- data/lib/regexp_parser/expression/classes/backref.rb +5 -0
- data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
- data/lib/regexp_parser/expression/classes/free_space.rb +1 -1
- data/lib/regexp_parser/expression/classes/group.rb +6 -1
- data/lib/regexp_parser/expression/classes/property.rb +1 -1
- data/lib/regexp_parser/expression/classes/set/range.rb +2 -1
- data/lib/regexp_parser/expression/quantifier.rb +1 -1
- data/lib/regexp_parser/expression/sequence.rb +3 -9
- data/lib/regexp_parser/expression/subexpression.rb +1 -1
- data/lib/regexp_parser/parser.rb +282 -332
- data/lib/regexp_parser/scanner.rb +1019 -1006
- data/lib/regexp_parser/scanner/scanner.rl +56 -79
- data/lib/regexp_parser/syntax.rb +8 -6
- data/lib/regexp_parser/syntax/any.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- data/spec/expression/clone_spec.rb +36 -4
- data/spec/expression/free_space_spec.rb +2 -2
- data/spec/expression/methods/match_length_spec.rb +2 -2
- data/spec/lexer/refcalls_spec.rb +5 -0
- data/spec/parser/all_spec.rb +2 -2
- data/spec/parser/refcalls_spec.rb +5 -0
- data/spec/scanner/escapes_spec.rb +1 -1
- data/spec/scanner/refcalls_spec.rb +19 -0
- data/spec/scanner/sets_spec.rb +42 -11
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 077b8a0c90d90cf46e44671ec1335a5373eef72c61a0bcf4de43ba5217a188c3
|
4
|
+
data.tar.gz: b9aed868af73adcdf40c09720c5d10091b25a53b25a792717ceb5591039a2931
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9c04d9a6434c6e3f322e97e8e2a1c86b3ddda88bd8821368a37b92f5836e4c3df1dc27a79165303420c3e8d5eea31bda1483824da01a40ce30961b645ba65ddd
|
7
|
+
data.tar.gz: 01e5c261e9dca0c4df7c696128dbc0520ca40aa6b9393cc8d6c3bdb8386470aeb773566000b811f98c1407038216c8d2c0b444c7955ea5a881ac759796f8a440
|
data/CHANGELOG.md
CHANGED
@@ -1,14 +1,52 @@
|
|
1
1
|
## [Unreleased]
|
2
2
|
|
3
|
+
## [2.1.1] - 2021-02-23 - [Janosch Müller](mailto:janosch84@gmail.com)
|
4
|
+
|
5
|
+
### Fixed
|
6
|
+
|
7
|
+
- fixed `NameError` when requiring only `'regexp_parser/scanner'` in v2.1.0
|
8
|
+
* thanks to [Jared White and Sam Ruby](https://github.com/ruby2js/ruby2js) for the report
|
9
|
+
|
10
|
+
## [2.1.0] - 2021-02-22 - [Janosch Müller](mailto:janosch84@gmail.com)
|
11
|
+
|
12
|
+
### Added
|
13
|
+
|
14
|
+
- common ancestor for all scanning/parsing/lexing errors
|
15
|
+
* `Regexp::Parser::Error` can now be rescued as a catch-all
|
16
|
+
* the following errors (and their many descendants) now inherit from it:
|
17
|
+
- `Regexp::Expression::Conditional::TooManyBranches`
|
18
|
+
- `Regexp::Parser::ParserError`
|
19
|
+
- `Regexp::Scanner::ScannerError`
|
20
|
+
- `Regexp::Scanner::ValidationError`
|
21
|
+
- `Regexp::Syntax::SyntaxError`
|
22
|
+
* it replaces `ArgumentError` in some rare cases (`Regexp::Parser.parse('?')`)
|
23
|
+
* thanks to [sandstrom](https://github.com/sandstrom) for the cue
|
24
|
+
|
25
|
+
### Fixed
|
26
|
+
|
27
|
+
- fixed scanning of whole-pattern recursion calls `\g<0>` and `\g'0'`
|
28
|
+
* a regression in v2.0.1 had caused them to be scanned as literals
|
29
|
+
- fixed scanning of some backreference and subexpression call edge cases
|
30
|
+
* e.g. `\k<+1>`, `\g<x-1>`
|
31
|
+
- fixed tokenization of some escapes in character sets
|
32
|
+
* `.`, `|`, `{`, `}`, `(`, `)`, `^`, `$`, `?`, `+`, `*`
|
33
|
+
* all of these correctly emitted `#type` `:literal` and `#token` `:literal` if *not* escaped
|
34
|
+
* if escaped, they emitted e.g. `#type` `:escape` and `#token` `:group_open` for `[\(]`
|
35
|
+
* the escaped versions now correctly emit `#type` `:escape` and `#token` `:literal`
|
36
|
+
- fixed handling of control/metacontrol escapes in character sets
|
37
|
+
* e.g. `[\cX]`, `[\M-\C-X]`
|
38
|
+
* they were misread as a bunch of individual literals, escapes, and ranges
|
39
|
+
- fixed some cases where calling `#dup`/`#clone` on expressions led to shared state
|
40
|
+
|
3
41
|
## [2.0.3] - 2020-12-28 - [Janosch Müller](mailto:janosch84@gmail.com)
|
4
42
|
|
5
43
|
### Fixed
|
6
44
|
|
7
45
|
- fixed error when scanning some unlikely and redundant but valid charset patterns
|
8
|
-
|
46
|
+
* e.g. `/[[.a-b.]]/`, `/[[=e=]]/`
|
9
47
|
- fixed ancestry of some error classes related to syntax version lookup
|
10
|
-
|
11
|
-
|
48
|
+
* `NotImplementedError`, `InvalidVersionNameError`, `UnknownSyntaxNameError`
|
49
|
+
* they now correctly inherit from `Regexp::Syntax::SyntaxError` instead of Ruby's `::SyntaxError`
|
12
50
|
|
13
51
|
## [2.0.2] - 2020-12-25 - [Janosch Müller](mailto:janosch84@gmail.com)
|
14
52
|
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# Regexp::Parser
|
2
2
|
|
3
|
-
[](http://badge.fury.io/rb/regexp_parser) [](https://github.com/ammar/regexp_parser/actions) [](https://codeclimate.com/github/ammar/regexp_parser/badges)
|
3
|
+
[](http://badge.fury.io/rb/regexp_parser) [](https://github.com/ammar/regexp_parser/actions) [](https://github.com/ammar/regexp_parser/actions) [](https://codeclimate.com/github/ammar/regexp_parser/badges)
|
4
4
|
|
5
5
|
A Ruby gem for tokenizing, parsing, and transforming regular expressions.
|
6
6
|
|
data/Rakefile
CHANGED
@@ -7,8 +7,8 @@ require 'bundler'
|
|
7
7
|
require 'rubygems/package_task'
|
8
8
|
|
9
9
|
|
10
|
-
RAGEL_SOURCE_DIR = File.
|
11
|
-
RAGEL_OUTPUT_DIR = File.
|
10
|
+
RAGEL_SOURCE_DIR = File.join(__dir__, 'lib/regexp_parser/scanner')
|
11
|
+
RAGEL_OUTPUT_DIR = File.join(__dir__, 'lib/regexp_parser')
|
12
12
|
RAGEL_SOURCE_FILES = %w{scanner} # scanner.rl includes property.rl
|
13
13
|
|
14
14
|
|
@@ -26,10 +26,10 @@ end
|
|
26
26
|
namespace :ragel do
|
27
27
|
desc "Process the ragel source files and output ruby code"
|
28
28
|
task :rb do
|
29
|
-
RAGEL_SOURCE_FILES.each do |
|
30
|
-
output_file = "#{RAGEL_OUTPUT_DIR}/#{
|
29
|
+
RAGEL_SOURCE_FILES.each do |source_file|
|
30
|
+
output_file = "#{RAGEL_OUTPUT_DIR}/#{source_file}.rb"
|
31
31
|
# using faster flat table driven FSM, about 25% larger code, but about 30% faster
|
32
|
-
sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{
|
32
|
+
sh "ragel -F1 -R #{RAGEL_SOURCE_DIR}/#{source_file}.rl -o #{output_file}"
|
33
33
|
|
34
34
|
contents = File.read(output_file)
|
35
35
|
|
@@ -61,7 +61,7 @@ namespace :props do
|
|
61
61
|
task :update do
|
62
62
|
require 'regexp_property_values'
|
63
63
|
RegexpPropertyValues.update
|
64
|
-
dir = File.
|
64
|
+
dir = File.join(__dir__, 'lib/regexp_parser/scanner/properties')
|
65
65
|
|
66
66
|
require 'psych'
|
67
67
|
write_hash_to_file = ->(hash, path) do
|
@@ -1,5 +1,6 @@
|
|
1
|
-
|
1
|
+
require 'regexp_parser/error'
|
2
2
|
|
3
|
+
module Regexp::Expression
|
3
4
|
class Base
|
4
5
|
attr_accessor :type, :token
|
5
6
|
attr_accessor :text, :ts
|
@@ -21,7 +22,7 @@ module Regexp::Expression
|
|
21
22
|
self.options = options
|
22
23
|
end
|
23
24
|
|
24
|
-
def
|
25
|
+
def initialize_copy(orig)
|
25
26
|
self.text = (orig.text ? orig.text.dup : nil)
|
26
27
|
self.options = (orig.options ? orig.options.dup : nil)
|
27
28
|
self.quantifier = (orig.quantifier ? orig.quantifier.clone : nil)
|
@@ -2,6 +2,11 @@ module Regexp::Expression
|
|
2
2
|
module Backreference
|
3
3
|
class Base < Regexp::Expression::Base
|
4
4
|
attr_accessor :referenced_expression
|
5
|
+
|
6
|
+
def initialize_copy(orig)
|
7
|
+
self.referenced_expression = orig.referenced_expression.dup
|
8
|
+
super
|
9
|
+
end
|
5
10
|
end
|
6
11
|
|
7
12
|
class Number < Backreference::Base
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Regexp::Expression
|
2
2
|
module Conditional
|
3
|
-
class TooManyBranches <
|
3
|
+
class TooManyBranches < Regexp::Parser::Error
|
4
4
|
def initialize
|
5
5
|
super('The conditional expression has more than 2 branches')
|
6
6
|
end
|
@@ -15,6 +15,11 @@ module Regexp::Expression
|
|
15
15
|
ref = text.tr("'<>()", "")
|
16
16
|
ref =~ /\D/ ? ref : Integer(ref)
|
17
17
|
end
|
18
|
+
|
19
|
+
def initialize_copy(orig)
|
20
|
+
self.referenced_expression = orig.referenced_expression.dup
|
21
|
+
super
|
22
|
+
end
|
18
23
|
end
|
19
24
|
|
20
25
|
class Branch < Regexp::Expression::Sequence; end
|
@@ -53,6 +58,11 @@ module Regexp::Expression
|
|
53
58
|
def to_s(format = :full)
|
54
59
|
"#{text}#{condition}#{branches.join('|')})#{quantifier_affix(format)}"
|
55
60
|
end
|
61
|
+
|
62
|
+
def initialize_copy(orig)
|
63
|
+
self.referenced_expression = orig.referenced_expression.dup
|
64
|
+
super
|
65
|
+
end
|
56
66
|
end
|
57
67
|
end
|
58
68
|
end
|
@@ -2,7 +2,7 @@ module Regexp::Expression
|
|
2
2
|
|
3
3
|
class FreeSpace < Regexp::Expression::Base
|
4
4
|
def quantify(_token, _text, _min = nil, _max = nil, _mode = :greedy)
|
5
|
-
raise
|
5
|
+
raise Regexp::Parser::Error, 'Can not quantify a free space object'
|
6
6
|
end
|
7
7
|
end
|
8
8
|
|
@@ -35,6 +35,11 @@ module Regexp::Expression
|
|
35
35
|
class Atomic < Group::Base; end
|
36
36
|
class Options < Group::Base
|
37
37
|
attr_accessor :option_changes
|
38
|
+
|
39
|
+
def initialize_copy(orig)
|
40
|
+
self.option_changes = orig.option_changes.dup
|
41
|
+
super
|
42
|
+
end
|
38
43
|
end
|
39
44
|
|
40
45
|
class Capture < Group::Base
|
@@ -53,7 +58,7 @@ module Regexp::Expression
|
|
53
58
|
super
|
54
59
|
end
|
55
60
|
|
56
|
-
def
|
61
|
+
def initialize_copy(orig)
|
57
62
|
@name = orig.name.dup
|
58
63
|
super
|
59
64
|
end
|
@@ -41,17 +41,11 @@ module Regexp::Expression
|
|
41
41
|
alias :ts :starts_at
|
42
42
|
|
43
43
|
def quantify(token, text, min = nil, max = nil, mode = :greedy)
|
44
|
-
|
45
|
-
target
|
46
|
-
|
47
|
-
target = expressions[offset -= 1]
|
48
|
-
end
|
49
|
-
|
50
|
-
target || raise(ArgumentError, "No valid target found for '#{text}' "\
|
51
|
-
'quantifier')
|
44
|
+
target = expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
|
45
|
+
target or raise Regexp::Parser::Error,
|
46
|
+
"No valid target found for '#{text}' quantifier"
|
52
47
|
|
53
48
|
target.quantify(token, text, min, max, mode)
|
54
49
|
end
|
55
50
|
end
|
56
|
-
|
57
51
|
end
|
data/lib/regexp_parser/parser.rb
CHANGED
@@ -1,10 +1,10 @@
|
|
1
|
+
require 'regexp_parser/error'
|
1
2
|
require 'regexp_parser/expression'
|
2
3
|
|
3
4
|
class Regexp::Parser
|
4
5
|
include Regexp::Expression
|
5
|
-
include Regexp::Syntax
|
6
6
|
|
7
|
-
class ParserError <
|
7
|
+
class ParserError < Regexp::Parser::Error; end
|
8
8
|
|
9
9
|
class UnknownTokenTypeError < ParserError
|
10
10
|
def initialize(type, token)
|
@@ -70,93 +70,155 @@ class Regexp::Parser
|
|
70
70
|
enabled_options
|
71
71
|
end
|
72
72
|
|
73
|
-
def
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
73
|
+
def parse_token(token)
|
74
|
+
case token.type
|
75
|
+
when :anchor; anchor(token)
|
76
|
+
when :assertion, :group; group(token)
|
77
|
+
when :backref; backref(token)
|
78
|
+
when :conditional; conditional(token)
|
79
|
+
when :escape; escape(token)
|
80
|
+
when :free_space; free_space(token)
|
81
|
+
when :keep; keep(token)
|
82
|
+
when :literal; literal(token)
|
83
|
+
when :meta; meta(token)
|
84
|
+
when :posixclass, :nonposixclass; posixclass(token)
|
85
|
+
when :property, :nonproperty; property(token)
|
86
|
+
when :quantifier; quantifier(token)
|
87
|
+
when :set; set(token)
|
88
|
+
when :type; type(token)
|
89
|
+
else
|
90
|
+
raise UnknownTokenTypeError.new(token.type, token)
|
91
|
+
end
|
79
92
|
|
80
|
-
|
81
|
-
def update_transplanted_subtree(exp, new_parent)
|
82
|
-
exp.nesting_level = new_parent.nesting_level + 1
|
83
|
-
exp.respond_to?(:each) &&
|
84
|
-
exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
|
93
|
+
close_completed_character_set_range
|
85
94
|
end
|
86
95
|
|
87
|
-
def
|
88
|
-
|
89
|
-
|
90
|
-
|
96
|
+
def anchor(token)
|
97
|
+
case token.token
|
98
|
+
when :bol; node << Anchor::BeginningOfLine.new(token, active_opts)
|
99
|
+
when :bos; node << Anchor::BOS.new(token, active_opts)
|
100
|
+
when :eol; node << Anchor::EndOfLine.new(token, active_opts)
|
101
|
+
when :eos; node << Anchor::EOS.new(token, active_opts)
|
102
|
+
when :eos_ob_eol; node << Anchor::EOSobEOL.new(token, active_opts)
|
103
|
+
when :match_start; node << Anchor::MatchStart.new(token, active_opts)
|
104
|
+
when :nonword_boundary; node << Anchor::NonWordBoundary.new(token, active_opts)
|
105
|
+
when :word_boundary; node << Anchor::WordBoundary.new(token, active_opts)
|
106
|
+
else
|
107
|
+
raise UnknownTokenError.new('Anchor', token)
|
91
108
|
end
|
92
|
-
nesting.pop
|
93
|
-
yield(node) if block_given?
|
94
|
-
self.node = nesting.last
|
95
|
-
self.node = node.last if node.last.is_a?(SequenceOperation)
|
96
109
|
end
|
97
110
|
|
98
|
-
def
|
99
|
-
|
100
|
-
|
111
|
+
def group(token)
|
112
|
+
case token.token
|
113
|
+
when :options, :options_switch
|
114
|
+
options_group(token)
|
115
|
+
when :close
|
116
|
+
close_group
|
117
|
+
when :comment
|
118
|
+
node << Group::Comment.new(token, active_opts)
|
119
|
+
else
|
120
|
+
open_group(token)
|
121
|
+
end
|
101
122
|
end
|
102
123
|
|
103
|
-
|
104
|
-
|
124
|
+
MOD_FLAGS = %w[i m x].map(&:to_sym)
|
125
|
+
ENC_FLAGS = %w[a d u].map(&:to_sym)
|
105
126
|
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
when :escape; escape(token)
|
111
|
-
when :group; group(token)
|
112
|
-
when :assertion; group(token)
|
113
|
-
when :set; set(token)
|
114
|
-
when :type; type(token)
|
115
|
-
when :backref; backref(token)
|
116
|
-
when :conditional; conditional(token)
|
117
|
-
when :keep; keep(token)
|
118
|
-
|
119
|
-
when :posixclass, :nonposixclass
|
120
|
-
posixclass(token)
|
121
|
-
when :property, :nonproperty
|
122
|
-
property(token)
|
123
|
-
|
124
|
-
when :literal
|
125
|
-
node << Literal.new(token, active_opts)
|
126
|
-
when :free_space
|
127
|
-
free_space(token)
|
127
|
+
def options_group(token)
|
128
|
+
positive, negative = token.text.split('-', 2)
|
129
|
+
negative ||= ''
|
130
|
+
self.switching_options = token.token.equal?(:options_switch)
|
128
131
|
|
129
|
-
|
130
|
-
|
132
|
+
opt_changes = {}
|
133
|
+
new_active_opts = active_opts.dup
|
134
|
+
|
135
|
+
MOD_FLAGS.each do |flag|
|
136
|
+
if positive.include?(flag.to_s)
|
137
|
+
opt_changes[flag] = new_active_opts[flag] = true
|
138
|
+
end
|
139
|
+
if negative.include?(flag.to_s)
|
140
|
+
opt_changes[flag] = false
|
141
|
+
new_active_opts.delete(flag)
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
145
|
+
if (enc_flag = positive.reverse[/[adu]/])
|
146
|
+
enc_flag = enc_flag.to_sym
|
147
|
+
(ENC_FLAGS - [enc_flag]).each do |other|
|
148
|
+
opt_changes[other] = false if new_active_opts[other]
|
149
|
+
new_active_opts.delete(other)
|
150
|
+
end
|
151
|
+
opt_changes[enc_flag] = new_active_opts[enc_flag] = true
|
131
152
|
end
|
153
|
+
|
154
|
+
options_stack << new_active_opts
|
155
|
+
|
156
|
+
options_group = Group::Options.new(token, active_opts)
|
157
|
+
options_group.option_changes = opt_changes
|
158
|
+
|
159
|
+
nest(options_group)
|
132
160
|
end
|
133
161
|
|
134
|
-
def
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
162
|
+
def open_group(token)
|
163
|
+
group_class =
|
164
|
+
case token.token
|
165
|
+
when :absence; Group::Absence
|
166
|
+
when :atomic; Group::Atomic
|
167
|
+
when :capture; Group::Capture
|
168
|
+
when :named; Group::Named
|
169
|
+
when :passive; Group::Passive
|
170
|
+
|
171
|
+
when :lookahead; Assertion::Lookahead
|
172
|
+
when :lookbehind; Assertion::Lookbehind
|
173
|
+
when :nlookahead; Assertion::NegativeLookahead
|
174
|
+
when :nlookbehind; Assertion::NegativeLookbehind
|
175
|
+
|
176
|
+
else
|
177
|
+
raise UnknownTokenError.new('Group type open', token)
|
178
|
+
end
|
179
|
+
|
180
|
+
group = group_class.new(token, active_opts)
|
181
|
+
|
182
|
+
if group.capturing?
|
183
|
+
group.number = total_captured_group_count + 1
|
184
|
+
group.number_at_level = captured_group_count_at_level + 1
|
185
|
+
count_captured_group
|
148
186
|
end
|
187
|
+
|
188
|
+
# Push the active options to the stack again. This way we can simply pop the
|
189
|
+
# stack for any group we close, no matter if it had its own options or not.
|
190
|
+
options_stack << active_opts
|
191
|
+
|
192
|
+
nest(group)
|
149
193
|
end
|
150
194
|
|
151
|
-
def
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
195
|
+
def total_captured_group_count
|
196
|
+
captured_group_counts.values.reduce(0, :+)
|
197
|
+
end
|
198
|
+
|
199
|
+
def captured_group_count_at_level
|
200
|
+
captured_group_counts[node.level]
|
201
|
+
end
|
202
|
+
|
203
|
+
def count_captured_group
|
204
|
+
captured_group_counts[node.level] += 1
|
205
|
+
end
|
206
|
+
|
207
|
+
def close_group
|
208
|
+
options_stack.pop unless switching_options
|
209
|
+
self.switching_options = false
|
210
|
+
decrease_nesting
|
211
|
+
end
|
212
|
+
|
213
|
+
def decrease_nesting
|
214
|
+
while nesting.last.is_a?(SequenceOperation)
|
215
|
+
nesting.pop
|
216
|
+
self.node = nesting.last
|
159
217
|
end
|
218
|
+
nesting.pop
|
219
|
+
yield(node) if block_given?
|
220
|
+
self.node = nesting.last
|
221
|
+
self.node = node.last if node.last.is_a?(SequenceOperation)
|
160
222
|
end
|
161
223
|
|
162
224
|
def backref(token)
|
@@ -186,31 +248,9 @@ class Regexp::Parser
|
|
186
248
|
end
|
187
249
|
end
|
188
250
|
|
189
|
-
def
|
190
|
-
|
191
|
-
|
192
|
-
node << CharacterType::Digit.new(token, active_opts)
|
193
|
-
when :nondigit
|
194
|
-
node << CharacterType::NonDigit.new(token, active_opts)
|
195
|
-
when :hex
|
196
|
-
node << CharacterType::Hex.new(token, active_opts)
|
197
|
-
when :nonhex
|
198
|
-
node << CharacterType::NonHex.new(token, active_opts)
|
199
|
-
when :space
|
200
|
-
node << CharacterType::Space.new(token, active_opts)
|
201
|
-
when :nonspace
|
202
|
-
node << CharacterType::NonSpace.new(token, active_opts)
|
203
|
-
when :word
|
204
|
-
node << CharacterType::Word.new(token, active_opts)
|
205
|
-
when :nonword
|
206
|
-
node << CharacterType::NonWord.new(token, active_opts)
|
207
|
-
when :linebreak
|
208
|
-
node << CharacterType::Linebreak.new(token, active_opts)
|
209
|
-
when :xgrapheme
|
210
|
-
node << CharacterType::ExtendedGrapheme.new(token, active_opts)
|
211
|
-
else
|
212
|
-
raise UnknownTokenError.new('CharacterType', token)
|
213
|
-
end
|
251
|
+
def assign_effective_number(exp)
|
252
|
+
exp.effective_number =
|
253
|
+
exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
|
214
254
|
end
|
215
255
|
|
216
256
|
def conditional(token)
|
@@ -238,11 +278,118 @@ class Regexp::Parser
|
|
238
278
|
end
|
239
279
|
end
|
240
280
|
|
281
|
+
def nest_conditional(exp)
|
282
|
+
conditional_nesting.push(exp)
|
283
|
+
nest(exp)
|
284
|
+
end
|
285
|
+
|
286
|
+
def nest(exp)
|
287
|
+
nesting.push(exp)
|
288
|
+
node << exp
|
289
|
+
update_transplanted_subtree(exp, node)
|
290
|
+
self.node = exp
|
291
|
+
end
|
292
|
+
|
293
|
+
# subtrees are transplanted to build Alternations, Intersections, Ranges
|
294
|
+
def update_transplanted_subtree(exp, new_parent)
|
295
|
+
exp.nesting_level = new_parent.nesting_level + 1
|
296
|
+
exp.respond_to?(:each) &&
|
297
|
+
exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
|
298
|
+
end
|
299
|
+
|
300
|
+
def escape(token)
|
301
|
+
case token.token
|
302
|
+
|
303
|
+
when :backspace; node << EscapeSequence::Backspace.new(token, active_opts)
|
304
|
+
|
305
|
+
when :escape; node << EscapeSequence::AsciiEscape.new(token, active_opts)
|
306
|
+
when :bell; node << EscapeSequence::Bell.new(token, active_opts)
|
307
|
+
when :form_feed; node << EscapeSequence::FormFeed.new(token, active_opts)
|
308
|
+
when :newline; node << EscapeSequence::Newline.new(token, active_opts)
|
309
|
+
when :carriage; node << EscapeSequence::Return.new(token, active_opts)
|
310
|
+
when :tab; node << EscapeSequence::Tab.new(token, active_opts)
|
311
|
+
when :vertical_tab; node << EscapeSequence::VerticalTab.new(token, active_opts)
|
312
|
+
|
313
|
+
when :codepoint; node << EscapeSequence::Codepoint.new(token, active_opts)
|
314
|
+
when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
|
315
|
+
when :hex; node << EscapeSequence::Hex.new(token, active_opts)
|
316
|
+
when :octal; node << EscapeSequence::Octal.new(token, active_opts)
|
317
|
+
|
318
|
+
when :control
|
319
|
+
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
320
|
+
node << EscapeSequence::MetaControl.new(token, active_opts)
|
321
|
+
else
|
322
|
+
node << EscapeSequence::Control.new(token, active_opts)
|
323
|
+
end
|
324
|
+
|
325
|
+
when :meta_sequence
|
326
|
+
if token.text =~ /\A\\M-\\[Cc]/
|
327
|
+
node << EscapeSequence::MetaControl.new(token, active_opts)
|
328
|
+
else
|
329
|
+
node << EscapeSequence::Meta.new(token, active_opts)
|
330
|
+
end
|
331
|
+
|
332
|
+
else
|
333
|
+
# treating everything else as a literal
|
334
|
+
# TODO: maybe split this up a bit more in v3.0.0?
|
335
|
+
# E.g. escaped quantifiers or set meta chars are not the same
|
336
|
+
# as stuff that would be a literal even without the backslash.
|
337
|
+
# Right now, they all end up here.
|
338
|
+
node << EscapeSequence::Literal.new(token, active_opts)
|
339
|
+
end
|
340
|
+
end
|
341
|
+
|
342
|
+
def free_space(token)
|
343
|
+
case token.token
|
344
|
+
when :comment
|
345
|
+
node << Comment.new(token, active_opts)
|
346
|
+
when :whitespace
|
347
|
+
if node.last.is_a?(WhiteSpace)
|
348
|
+
node.last.merge(WhiteSpace.new(token, active_opts))
|
349
|
+
else
|
350
|
+
node << WhiteSpace.new(token, active_opts)
|
351
|
+
end
|
352
|
+
else
|
353
|
+
raise UnknownTokenError.new('FreeSpace', token)
|
354
|
+
end
|
355
|
+
end
|
356
|
+
|
357
|
+
def keep(token)
|
358
|
+
node << Keep::Mark.new(token, active_opts)
|
359
|
+
end
|
360
|
+
|
361
|
+
def literal(token)
|
362
|
+
node << Literal.new(token, active_opts)
|
363
|
+
end
|
364
|
+
|
365
|
+
def meta(token)
|
366
|
+
case token.token
|
367
|
+
when :dot
|
368
|
+
node << CharacterType::Any.new(token, active_opts)
|
369
|
+
when :alternation
|
370
|
+
sequence_operation(Alternation, token)
|
371
|
+
else
|
372
|
+
raise UnknownTokenError.new('Meta', token)
|
373
|
+
end
|
374
|
+
end
|
375
|
+
|
376
|
+
def sequence_operation(klass, token)
|
377
|
+
unless node.is_a?(klass)
|
378
|
+
operator = klass.new(token, active_opts)
|
379
|
+
sequence = operator.add_sequence(active_opts)
|
380
|
+
sequence.expressions = node.expressions
|
381
|
+
node.expressions = []
|
382
|
+
nest(operator)
|
383
|
+
end
|
384
|
+
node.add_sequence(active_opts)
|
385
|
+
end
|
386
|
+
|
241
387
|
def posixclass(token)
|
242
388
|
node << PosixClass.new(token, active_opts)
|
243
389
|
end
|
244
390
|
|
245
391
|
include Regexp::Expression::UnicodeProperty
|
392
|
+
UPTokens = Regexp::Syntax::Token::UnicodeProperty
|
246
393
|
|
247
394
|
def property(token)
|
248
395
|
case token.token
|
@@ -314,127 +461,20 @@ class Regexp::Parser
|
|
314
461
|
when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
|
315
462
|
when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
|
316
463
|
|
317
|
-
when *
|
318
|
-
node <<
|
319
|
-
|
320
|
-
when *
|
321
|
-
|
322
|
-
|
323
|
-
when *Token::UnicodeProperty::Emoji
|
324
|
-
node << Emoji.new(token, active_opts)
|
325
|
-
|
326
|
-
when *Token::UnicodeProperty::Script
|
327
|
-
node << Script.new(token, active_opts)
|
328
|
-
|
329
|
-
when *Token::UnicodeProperty::UnicodeBlock
|
330
|
-
node << Block.new(token, active_opts)
|
464
|
+
when *UPTokens::Age; node << Age.new(token, active_opts)
|
465
|
+
when *UPTokens::Derived; node << Derived.new(token, active_opts)
|
466
|
+
when *UPTokens::Emoji; node << Emoji.new(token, active_opts)
|
467
|
+
when *UPTokens::Script; node << Script.new(token, active_opts)
|
468
|
+
when *UPTokens::UnicodeBlock; node << Block.new(token, active_opts)
|
331
469
|
|
332
470
|
else
|
333
471
|
raise UnknownTokenError.new('UnicodeProperty', token)
|
334
472
|
end
|
335
473
|
end
|
336
474
|
|
337
|
-
def anchor(token)
|
338
|
-
case token.token
|
339
|
-
when :bol
|
340
|
-
node << Anchor::BeginningOfLine.new(token, active_opts)
|
341
|
-
when :eol
|
342
|
-
node << Anchor::EndOfLine.new(token, active_opts)
|
343
|
-
when :bos
|
344
|
-
node << Anchor::BOS.new(token, active_opts)
|
345
|
-
when :eos
|
346
|
-
node << Anchor::EOS.new(token, active_opts)
|
347
|
-
when :eos_ob_eol
|
348
|
-
node << Anchor::EOSobEOL.new(token, active_opts)
|
349
|
-
when :word_boundary
|
350
|
-
node << Anchor::WordBoundary.new(token, active_opts)
|
351
|
-
when :nonword_boundary
|
352
|
-
node << Anchor::NonWordBoundary.new(token, active_opts)
|
353
|
-
when :match_start
|
354
|
-
node << Anchor::MatchStart.new(token, active_opts)
|
355
|
-
else
|
356
|
-
raise UnknownTokenError.new('Anchor', token)
|
357
|
-
end
|
358
|
-
end
|
359
|
-
|
360
|
-
def escape(token)
|
361
|
-
case token.token
|
362
|
-
|
363
|
-
when :backspace
|
364
|
-
node << EscapeSequence::Backspace.new(token, active_opts)
|
365
|
-
|
366
|
-
when :escape
|
367
|
-
node << EscapeSequence::AsciiEscape.new(token, active_opts)
|
368
|
-
when :bell
|
369
|
-
node << EscapeSequence::Bell.new(token, active_opts)
|
370
|
-
when :form_feed
|
371
|
-
node << EscapeSequence::FormFeed.new(token, active_opts)
|
372
|
-
when :newline
|
373
|
-
node << EscapeSequence::Newline.new(token, active_opts)
|
374
|
-
when :carriage
|
375
|
-
node << EscapeSequence::Return.new(token, active_opts)
|
376
|
-
when :tab
|
377
|
-
node << EscapeSequence::Tab.new(token, active_opts)
|
378
|
-
when :vertical_tab
|
379
|
-
node << EscapeSequence::VerticalTab.new(token, active_opts)
|
380
|
-
|
381
|
-
when :hex
|
382
|
-
node << EscapeSequence::Hex.new(token, active_opts)
|
383
|
-
when :octal
|
384
|
-
node << EscapeSequence::Octal.new(token, active_opts)
|
385
|
-
when :codepoint
|
386
|
-
node << EscapeSequence::Codepoint.new(token, active_opts)
|
387
|
-
when :codepoint_list
|
388
|
-
node << EscapeSequence::CodepointList.new(token, active_opts)
|
389
|
-
|
390
|
-
when :control
|
391
|
-
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
|
392
|
-
node << EscapeSequence::MetaControl.new(token, active_opts)
|
393
|
-
else
|
394
|
-
node << EscapeSequence::Control.new(token, active_opts)
|
395
|
-
end
|
396
|
-
|
397
|
-
when :meta_sequence
|
398
|
-
if token.text =~ /\A\\M-\\[Cc]/
|
399
|
-
node << EscapeSequence::MetaControl.new(token, active_opts)
|
400
|
-
else
|
401
|
-
node << EscapeSequence::Meta.new(token, active_opts)
|
402
|
-
end
|
403
|
-
|
404
|
-
else
|
405
|
-
# treating everything else as a literal
|
406
|
-
node << EscapeSequence::Literal.new(token, active_opts)
|
407
|
-
end
|
408
|
-
end
|
409
|
-
|
410
|
-
def keep(token)
|
411
|
-
node << Keep::Mark.new(token, active_opts)
|
412
|
-
end
|
413
|
-
|
414
|
-
def free_space(token)
|
415
|
-
case token.token
|
416
|
-
when :comment
|
417
|
-
node << Comment.new(token, active_opts)
|
418
|
-
when :whitespace
|
419
|
-
if node.last.is_a?(WhiteSpace)
|
420
|
-
node.last.merge(WhiteSpace.new(token, active_opts))
|
421
|
-
else
|
422
|
-
node << WhiteSpace.new(token, active_opts)
|
423
|
-
end
|
424
|
-
else
|
425
|
-
raise UnknownTokenError.new('FreeSpace', token)
|
426
|
-
end
|
427
|
-
end
|
428
|
-
|
429
475
|
def quantifier(token)
|
430
|
-
|
431
|
-
target_node
|
432
|
-
while target_node.is_a?(FreeSpace)
|
433
|
-
target_node = node.expressions[offset -= 1]
|
434
|
-
end
|
435
|
-
|
436
|
-
target_node || raise(ArgumentError, 'No valid target found for '\
|
437
|
-
"'#{token.text}' ")
|
476
|
+
target_node = node.expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
|
477
|
+
target_node or raise ParserError, "No valid target found for '#{token.text}'"
|
438
478
|
|
439
479
|
# in case of chained quantifiers, wrap target in an implicit passive group
|
440
480
|
# description of the problem: https://github.com/ammar/regexp_parser/issues/3
|
@@ -454,7 +494,7 @@ class Regexp::Parser
|
|
454
494
|
new_group.implicit = true
|
455
495
|
new_group << target_node
|
456
496
|
increase_level(target_node)
|
457
|
-
node.expressions[
|
497
|
+
node.expressions[node.expressions.index(target_node)] = new_group
|
458
498
|
target_node = new_group
|
459
499
|
end
|
460
500
|
|
@@ -515,100 +555,16 @@ class Regexp::Parser
|
|
515
555
|
target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
|
516
556
|
end
|
517
557
|
|
518
|
-
def
|
519
|
-
case token.token
|
520
|
-
when :options, :options_switch
|
521
|
-
options_group(token)
|
522
|
-
when :close
|
523
|
-
close_group
|
524
|
-
when :comment
|
525
|
-
node << Group::Comment.new(token, active_opts)
|
526
|
-
else
|
527
|
-
open_group(token)
|
528
|
-
end
|
529
|
-
end
|
530
|
-
|
531
|
-
MOD_FLAGS = %w[i m x].map(&:to_sym)
|
532
|
-
ENC_FLAGS = %w[a d u].map(&:to_sym)
|
533
|
-
|
534
|
-
def options_group(token)
|
535
|
-
positive, negative = token.text.split('-', 2)
|
536
|
-
negative ||= ''
|
537
|
-
self.switching_options = token.token.equal?(:options_switch)
|
538
|
-
|
539
|
-
opt_changes = {}
|
540
|
-
new_active_opts = active_opts.dup
|
541
|
-
|
542
|
-
MOD_FLAGS.each do |flag|
|
543
|
-
if positive.include?(flag.to_s)
|
544
|
-
opt_changes[flag] = new_active_opts[flag] = true
|
545
|
-
end
|
546
|
-
if negative.include?(flag.to_s)
|
547
|
-
opt_changes[flag] = false
|
548
|
-
new_active_opts.delete(flag)
|
549
|
-
end
|
550
|
-
end
|
551
|
-
|
552
|
-
if (enc_flag = positive.reverse[/[adu]/])
|
553
|
-
enc_flag = enc_flag.to_sym
|
554
|
-
(ENC_FLAGS - [enc_flag]).each do |other|
|
555
|
-
opt_changes[other] = false if new_active_opts[other]
|
556
|
-
new_active_opts.delete(other)
|
557
|
-
end
|
558
|
-
opt_changes[enc_flag] = new_active_opts[enc_flag] = true
|
559
|
-
end
|
560
|
-
|
561
|
-
options_stack << new_active_opts
|
562
|
-
|
563
|
-
options_group = Group::Options.new(token, active_opts)
|
564
|
-
options_group.option_changes = opt_changes
|
565
|
-
|
566
|
-
nest(options_group)
|
567
|
-
end
|
568
|
-
|
569
|
-
def open_group(token)
|
558
|
+
def set(token)
|
570
559
|
case token.token
|
571
|
-
when :
|
572
|
-
|
573
|
-
when :
|
574
|
-
|
575
|
-
when :
|
576
|
-
exp = Group::Named.new(token, active_opts)
|
577
|
-
when :capture
|
578
|
-
exp = Group::Capture.new(token, active_opts)
|
579
|
-
when :absence
|
580
|
-
exp = Group::Absence.new(token, active_opts)
|
581
|
-
|
582
|
-
when :lookahead
|
583
|
-
exp = Assertion::Lookahead.new(token, active_opts)
|
584
|
-
when :nlookahead
|
585
|
-
exp = Assertion::NegativeLookahead.new(token, active_opts)
|
586
|
-
when :lookbehind
|
587
|
-
exp = Assertion::Lookbehind.new(token, active_opts)
|
588
|
-
when :nlookbehind
|
589
|
-
exp = Assertion::NegativeLookbehind.new(token, active_opts)
|
590
|
-
|
560
|
+
when :open; open_set(token)
|
561
|
+
when :close; close_set
|
562
|
+
when :negate; negate_set
|
563
|
+
when :range; range(token)
|
564
|
+
when :intersection; intersection(token)
|
591
565
|
else
|
592
|
-
raise UnknownTokenError.new('
|
593
|
-
end
|
594
|
-
|
595
|
-
if exp.capturing?
|
596
|
-
exp.number = total_captured_group_count + 1
|
597
|
-
exp.number_at_level = captured_group_count_at_level + 1
|
598
|
-
count_captured_group
|
566
|
+
raise UnknownTokenError.new('CharacterSet', token)
|
599
567
|
end
|
600
|
-
|
601
|
-
# Push the active options to the stack again. This way we can simply pop the
|
602
|
-
# stack for any group we close, no matter if it had its own options or not.
|
603
|
-
options_stack << active_opts
|
604
|
-
|
605
|
-
nest(exp)
|
606
|
-
end
|
607
|
-
|
608
|
-
def close_group
|
609
|
-
options_stack.pop unless switching_options
|
610
|
-
self.switching_options = false
|
611
|
-
decrease_nesting
|
612
568
|
end
|
613
569
|
|
614
570
|
def open_set(token)
|
@@ -631,51 +587,45 @@ class Regexp::Parser
|
|
631
587
|
nest(exp)
|
632
588
|
end
|
633
589
|
|
634
|
-
def close_completed_character_set_range
|
635
|
-
decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
|
636
|
-
end
|
637
|
-
|
638
590
|
def intersection(token)
|
639
591
|
sequence_operation(CharacterSet::Intersection, token)
|
640
592
|
end
|
641
593
|
|
642
|
-
def
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
594
|
+
def type(token)
|
595
|
+
case token.token
|
596
|
+
when :digit; node << CharacterType::Digit.new(token, active_opts)
|
597
|
+
when :hex; node << CharacterType::Hex.new(token, active_opts)
|
598
|
+
when :linebreak; node << CharacterType::Linebreak.new(token, active_opts)
|
599
|
+
when :nondigit; node << CharacterType::NonDigit.new(token, active_opts)
|
600
|
+
when :nonhex; node << CharacterType::NonHex.new(token, active_opts)
|
601
|
+
when :nonspace; node << CharacterType::NonSpace.new(token, active_opts)
|
602
|
+
when :nonword; node << CharacterType::NonWord.new(token, active_opts)
|
603
|
+
when :space; node << CharacterType::Space.new(token, active_opts)
|
604
|
+
when :word; node << CharacterType::Word.new(token, active_opts)
|
605
|
+
when :xgrapheme; node << CharacterType::ExtendedGrapheme.new(token, active_opts)
|
606
|
+
else
|
607
|
+
raise UnknownTokenError.new('CharacterType', token)
|
649
608
|
end
|
650
|
-
node.add_sequence(active_opts)
|
651
|
-
end
|
652
|
-
|
653
|
-
def active_opts
|
654
|
-
options_stack.last
|
655
|
-
end
|
656
|
-
|
657
|
-
def total_captured_group_count
|
658
|
-
captured_group_counts.values.reduce(0, :+)
|
659
|
-
end
|
660
|
-
|
661
|
-
def captured_group_count_at_level
|
662
|
-
captured_group_counts[node.level]
|
663
609
|
end
|
664
610
|
|
665
|
-
def
|
666
|
-
|
611
|
+
def close_completed_character_set_range
|
612
|
+
decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
|
667
613
|
end
|
668
614
|
|
669
|
-
def
|
670
|
-
|
671
|
-
exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
|
615
|
+
def active_opts
|
616
|
+
options_stack.last
|
672
617
|
end
|
673
618
|
|
619
|
+
# Assigns referenced expressions to referring expressions, e.g. if there is
|
620
|
+
# an instance of Backreference::Number, its #referenced_expression is set to
|
621
|
+
# the instance of Group::Capture that it refers to via its number.
|
674
622
|
def assign_referenced_expressions
|
675
623
|
targets = {}
|
624
|
+
# find all referenceable expressions
|
676
625
|
root.each_expression do |exp|
|
677
626
|
exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
|
678
627
|
end
|
628
|
+
# assign them to any referring expressions
|
679
629
|
root.each_expression do |exp|
|
680
630
|
exp.respond_to?(:reference) &&
|
681
631
|
exp.referenced_expression = targets[exp.reference]
|