regexp_parser 2.3.0 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +63 -6
- data/Gemfile +1 -0
- data/README.md +12 -6
- data/lib/regexp_parser/error.rb +1 -1
- data/lib/regexp_parser/expression/base.rb +9 -57
- data/lib/regexp_parser/expression/classes/backreference.rb +1 -0
- data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -2
- data/lib/regexp_parser/expression/classes/character_set.rb +2 -2
- data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -0
- data/lib/regexp_parser/expression/classes/conditional.rb +2 -2
- data/lib/regexp_parser/expression/classes/free_space.rb +1 -1
- data/lib/regexp_parser/expression/classes/group.rb +6 -6
- data/lib/regexp_parser/expression/classes/keep.rb +2 -0
- data/lib/regexp_parser/expression/classes/root.rb +3 -5
- data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +1 -0
- data/lib/regexp_parser/expression/methods/construct.rb +43 -0
- data/lib/regexp_parser/expression/methods/match_length.rb +1 -1
- data/lib/regexp_parser/expression/methods/tests.rb +10 -1
- data/lib/regexp_parser/expression/quantifier.rb +41 -23
- data/lib/regexp_parser/expression/sequence.rb +9 -23
- data/lib/regexp_parser/expression/sequence_operation.rb +2 -2
- data/lib/regexp_parser/expression/shared.rb +85 -0
- data/lib/regexp_parser/expression/subexpression.rb +11 -7
- data/lib/regexp_parser/expression.rb +4 -2
- data/lib/regexp_parser/parser.rb +21 -72
- data/lib/regexp_parser/scanner/property.rl +1 -1
- data/lib/regexp_parser/scanner/scanner.rl +42 -31
- data/lib/regexp_parser/scanner.rb +725 -793
- data/lib/regexp_parser/syntax/token/escape.rb +1 -1
- data/lib/regexp_parser/syntax/token/unicode_property.rb +0 -5
- data/lib/regexp_parser/version.rb +1 -1
- metadata +10 -8
data/lib/regexp_parser/expression/shared.rb
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
module Regexp::Expression
|
2
|
+
module Shared
|
3
|
+
module ClassMethods; end # filled in ./methods/*.rb
|
4
|
+
|
5
|
+
def self.included(mod)
|
6
|
+
mod.class_eval do
|
7
|
+
extend Shared::ClassMethods
|
8
|
+
|
9
|
+
attr_accessor :type, :token, :text, :ts, :te,
|
10
|
+
:level, :set_level, :conditional_level,
|
11
|
+
:options, :quantifier
|
12
|
+
|
13
|
+
attr_reader :nesting_level
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
def init_from_token_and_options(token, options = {})
|
18
|
+
self.type = token.type
|
19
|
+
self.token = token.token
|
20
|
+
self.text = token.text
|
21
|
+
self.ts = token.ts
|
22
|
+
self.te = token.te
|
23
|
+
self.level = token.level
|
24
|
+
self.set_level = token.set_level
|
25
|
+
self.conditional_level = token.conditional_level
|
26
|
+
self.nesting_level = 0
|
27
|
+
self.options = options || {}
|
28
|
+
end
|
29
|
+
private :init_from_token_and_options
|
30
|
+
|
31
|
+
def initialize_copy(orig)
|
32
|
+
self.text = orig.text.dup if orig.text
|
33
|
+
self.options = orig.options.dup if orig.options
|
34
|
+
self.quantifier = orig.quantifier.clone if orig.quantifier
|
35
|
+
super
|
36
|
+
end
|
37
|
+
|
38
|
+
def starts_at
|
39
|
+
ts
|
40
|
+
end
|
41
|
+
|
42
|
+
def base_length
|
43
|
+
to_s(:base).length
|
44
|
+
end
|
45
|
+
|
46
|
+
def full_length
|
47
|
+
to_s.length
|
48
|
+
end
|
49
|
+
|
50
|
+
def to_s(format = :full)
|
51
|
+
"#{parts.join}#{quantifier_affix(format)}"
|
52
|
+
end
|
53
|
+
alias :to_str :to_s
|
54
|
+
|
55
|
+
def parts
|
56
|
+
[text.dup]
|
57
|
+
end
|
58
|
+
|
59
|
+
def quantifier_affix(expression_format)
|
60
|
+
quantifier.to_s if quantified? && expression_format != :base
|
61
|
+
end
|
62
|
+
|
63
|
+
def quantified?
|
64
|
+
!quantifier.nil?
|
65
|
+
end
|
66
|
+
|
67
|
+
def offset
|
68
|
+
[starts_at, full_length]
|
69
|
+
end
|
70
|
+
|
71
|
+
def coded_offset
|
72
|
+
'@%d+%d' % offset
|
73
|
+
end
|
74
|
+
|
75
|
+
def terminal?
|
76
|
+
!respond_to?(:expressions)
|
77
|
+
end
|
78
|
+
|
79
|
+
def nesting_level=(lvl)
|
80
|
+
@nesting_level = lvl
|
81
|
+
quantifier && quantifier.nesting_level = lvl
|
82
|
+
terminal? || each { |subexp| subexp.nesting_level = lvl + 1 }
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
data/lib/regexp_parser/expression/subexpression.rb
CHANGED
@@ -5,9 +5,8 @@ module Regexp::Expression
|
|
5
5
|
attr_accessor :expressions
|
6
6
|
|
7
7
|
def initialize(token, options = {})
|
8
|
-
super
|
9
|
-
|
10
8
|
self.expressions = []
|
9
|
+
super
|
11
10
|
end
|
12
11
|
|
13
12
|
# Override base method to clone the expressions as well.
|
@@ -43,16 +42,21 @@ module Regexp::Expression
|
|
43
42
|
ts + to_s.length
|
44
43
|
end
|
45
44
|
|
46
|
-
def
|
47
|
-
|
48
|
-
"#{expressions.join}#{quantifier_affix(format)}"
|
45
|
+
def parts
|
46
|
+
expressions
|
49
47
|
end
|
50
48
|
|
51
49
|
def to_h
|
52
|
-
attributes.merge(
|
50
|
+
attributes.merge(
|
53
51
|
text: to_s(:base),
|
54
52
|
expressions: expressions.map(&:to_h)
|
55
|
-
|
53
|
+
)
|
54
|
+
end
|
55
|
+
|
56
|
+
private
|
57
|
+
|
58
|
+
def intersperse(expressions, separator)
|
59
|
+
expressions.flat_map { |exp| [exp, separator] }.slice(0...-1)
|
56
60
|
end
|
57
61
|
end
|
58
62
|
end
|
data/lib/regexp_parser/expression.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'regexp_parser/error'
|
2
2
|
|
3
|
+
require 'regexp_parser/expression/shared'
|
3
4
|
require 'regexp_parser/expression/base'
|
4
5
|
require 'regexp_parser/expression/quantifier'
|
5
6
|
require 'regexp_parser/expression/subexpression'
|
@@ -12,6 +13,7 @@ require 'regexp_parser/expression/classes/backreference'
|
|
12
13
|
require 'regexp_parser/expression/classes/character_set'
|
13
14
|
require 'regexp_parser/expression/classes/character_set/intersection'
|
14
15
|
require 'regexp_parser/expression/classes/character_set/range'
|
16
|
+
require 'regexp_parser/expression/classes/character_type'
|
15
17
|
require 'regexp_parser/expression/classes/conditional'
|
16
18
|
require 'regexp_parser/expression/classes/escape_sequence'
|
17
19
|
require 'regexp_parser/expression/classes/free_space'
|
@@ -19,10 +21,10 @@ require 'regexp_parser/expression/classes/group'
|
|
19
21
|
require 'regexp_parser/expression/classes/keep'
|
20
22
|
require 'regexp_parser/expression/classes/literal'
|
21
23
|
require 'regexp_parser/expression/classes/posix_class'
|
22
|
-
require 'regexp_parser/expression/classes/property'
|
23
24
|
require 'regexp_parser/expression/classes/root'
|
24
|
-
require 'regexp_parser/expression/classes/
|
25
|
+
require 'regexp_parser/expression/classes/unicode_property'
|
25
26
|
|
27
|
+
require 'regexp_parser/expression/methods/construct'
|
26
28
|
require 'regexp_parser/expression/methods/match'
|
27
29
|
require 'regexp_parser/expression/methods/match_length'
|
28
30
|
require 'regexp_parser/expression/methods/options'
|
data/lib/regexp_parser/parser.rb
CHANGED
@@ -23,7 +23,7 @@ class Regexp::Parser
|
|
23
23
|
end
|
24
24
|
|
25
25
|
def parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
|
26
|
-
root = Root.
|
26
|
+
root = Root.construct(options: extract_options(input, options))
|
27
27
|
|
28
28
|
self.root = root
|
29
29
|
self.node = root
|
@@ -39,6 +39,9 @@ class Regexp::Parser
|
|
39
39
|
parse_token(token)
|
40
40
|
end
|
41
41
|
|
42
|
+
# Trigger recursive setting of #nesting_level, which reflects how deep
|
43
|
+
# a node is in the tree. Do this at the end to account for tree rewrites.
|
44
|
+
root.nesting_level = 0
|
42
45
|
assign_referenced_expressions
|
43
46
|
|
44
47
|
if block_given?
|
@@ -197,11 +200,11 @@ class Regexp::Parser
|
|
197
200
|
end
|
198
201
|
|
199
202
|
def captured_group_count_at_level
|
200
|
-
captured_group_counts[node
|
203
|
+
captured_group_counts[node]
|
201
204
|
end
|
202
205
|
|
203
206
|
def count_captured_group
|
204
|
-
captured_group_counts[node
|
207
|
+
captured_group_counts[node] += 1
|
205
208
|
end
|
206
209
|
|
207
210
|
def close_group
|
@@ -286,17 +289,9 @@ class Regexp::Parser
|
|
286
289
|
def nest(exp)
|
287
290
|
nesting.push(exp)
|
288
291
|
node << exp
|
289
|
-
update_transplanted_subtree(exp, node)
|
290
292
|
self.node = exp
|
291
293
|
end
|
292
294
|
|
293
|
-
# subtrees are transplanted to build Alternations, Intersections, Ranges
|
294
|
-
def update_transplanted_subtree(exp, new_parent)
|
295
|
-
exp.nesting_level = new_parent.nesting_level + 1
|
296
|
-
exp.respond_to?(:each) &&
|
297
|
-
exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
|
298
|
-
end
|
299
|
-
|
300
295
|
def escape(token)
|
301
296
|
case token.token
|
302
297
|
|
@@ -480,79 +475,33 @@ class Regexp::Parser
|
|
480
475
|
# description of the problem: https://github.com/ammar/regexp_parser/issues/3
|
481
476
|
# rationale for this solution: https://github.com/ammar/regexp_parser/pull/69
|
482
477
|
if target_node.quantified?
|
483
|
-
|
484
|
-
:
|
485
|
-
:
|
486
|
-
|
487
|
-
target_node.
|
488
|
-
|
489
|
-
|
490
|
-
target_node.set_level,
|
491
|
-
target_node.conditional_level
|
478
|
+
new_group = Group::Passive.construct(
|
479
|
+
token: :passive,
|
480
|
+
ts: target_node.ts,
|
481
|
+
level: target_node.level,
|
482
|
+
set_level: target_node.set_level,
|
483
|
+
conditional_level: target_node.conditional_level,
|
484
|
+
options: active_opts,
|
492
485
|
)
|
493
|
-
new_group = Group::Passive.new(new_token, active_opts)
|
494
486
|
new_group.implicit = true
|
495
487
|
new_group << target_node
|
496
|
-
|
488
|
+
increase_group_level(target_node)
|
497
489
|
node.expressions[node.expressions.index(target_node)] = new_group
|
498
490
|
target_node = new_group
|
499
491
|
end
|
500
492
|
|
501
|
-
|
502
|
-
|
503
|
-
target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
|
504
|
-
when :zero_or_one_reluctant
|
505
|
-
target_node.quantify(:zero_or_one, token.text, 0, 1, :reluctant)
|
506
|
-
when :zero_or_one_possessive
|
507
|
-
target_node.quantify(:zero_or_one, token.text, 0, 1, :possessive)
|
508
|
-
|
509
|
-
when :zero_or_more
|
510
|
-
target_node.quantify(:zero_or_more, token.text, 0, -1, :greedy)
|
511
|
-
when :zero_or_more_reluctant
|
512
|
-
target_node.quantify(:zero_or_more, token.text, 0, -1, :reluctant)
|
513
|
-
when :zero_or_more_possessive
|
514
|
-
target_node.quantify(:zero_or_more, token.text, 0, -1, :possessive)
|
515
|
-
|
516
|
-
when :one_or_more
|
517
|
-
target_node.quantify(:one_or_more, token.text, 1, -1, :greedy)
|
518
|
-
when :one_or_more_reluctant
|
519
|
-
target_node.quantify(:one_or_more, token.text, 1, -1, :reluctant)
|
520
|
-
when :one_or_more_possessive
|
521
|
-
target_node.quantify(:one_or_more, token.text, 1, -1, :possessive)
|
522
|
-
|
523
|
-
when :interval
|
524
|
-
interval(target_node, token)
|
525
|
-
|
526
|
-
else
|
493
|
+
unless token.token =~ /\A(?:zero_or_one|zero_or_more|one_or_more|interval)
|
494
|
+
(?:_greedy|_reluctant|_possessive)?\z/x
|
527
495
|
raise UnknownTokenError.new('Quantifier', token)
|
528
496
|
end
|
497
|
+
|
498
|
+
target_node.quantify(token, active_opts)
|
529
499
|
end
|
530
500
|
|
531
|
-
def
|
501
|
+
def increase_group_level(exp)
|
532
502
|
exp.level += 1
|
533
|
-
exp.
|
534
|
-
|
535
|
-
|
536
|
-
def interval(target_node, token)
|
537
|
-
text = token.text
|
538
|
-
mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
|
539
|
-
case mchr
|
540
|
-
when '?'
|
541
|
-
range_text = text[0...-1]
|
542
|
-
mode = :reluctant
|
543
|
-
when '+'
|
544
|
-
range_text = text[0...-1]
|
545
|
-
mode = :possessive
|
546
|
-
else
|
547
|
-
range_text = text
|
548
|
-
mode = :greedy
|
549
|
-
end
|
550
|
-
|
551
|
-
range = range_text.gsub(/\{|\}/, '').split(',', 2)
|
552
|
-
min = range[0].empty? ? 0 : range[0]
|
553
|
-
max = range[1] ? (range[1].empty? ? -1 : range[1]) : min
|
554
|
-
|
555
|
-
target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
|
503
|
+
exp.quantifier.level += 1 if exp.quantifier
|
504
|
+
exp.terminal? || exp.each { |subexp| increase_group_level(subexp) }
|
556
505
|
end
|
557
506
|
|
558
507
|
def set(token)
|
data/lib/regexp_parser/scanner/property.rl
CHANGED
@@ -20,7 +20,7 @@
|
|
20
20
|
name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
|
21
21
|
|
22
22
|
token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
|
23
|
-
|
23
|
+
validation_error(:property, name) unless token
|
24
24
|
|
25
25
|
self.emit(type, token.to_sym, text)
|
26
26
|
|
data/lib/regexp_parser/scanner/scanner.rl
CHANGED
@@ -28,13 +28,7 @@
|
|
28
28
|
|
29
29
|
comment = ('#' . [^\n]* . '\n'?);
|
30
30
|
|
31
|
-
|
32
|
-
'cntrl' | 'digit' | 'graph' |
|
33
|
-
'lower' | 'print' | 'punct' |
|
34
|
-
'space' | 'upper' | 'xdigit' |
|
35
|
-
'word' | 'ascii';
|
36
|
-
|
37
|
-
class_posix = ('[:' . '^'? . class_name_posix . ':]');
|
31
|
+
class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
|
38
32
|
|
39
33
|
|
40
34
|
# these are not supported in ruby at the moment
|
@@ -74,8 +68,7 @@
|
|
74
68
|
quantity_maximum = ',' . (digit+);
|
75
69
|
quantity_range = (digit+) . ',' . (digit+);
|
76
70
|
quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
|
77
|
-
quantity_maximum | quantity_range ) . range_close
|
78
|
-
quantifier_mode?;
|
71
|
+
quantity_maximum | quantity_range ) . range_close;
|
79
72
|
|
80
73
|
quantifiers = quantifier_greedy | quantifier_reluctant |
|
81
74
|
quantifier_possessive | quantifier_interval;
|
@@ -223,24 +216,28 @@
|
|
223
216
|
fcall character_set;
|
224
217
|
};
|
225
218
|
|
226
|
-
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error)
|
219
|
+
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
227
220
|
text = copy(data, ts, te)
|
228
221
|
|
229
222
|
type = :posixclass
|
230
223
|
class_name = text[2..-3]
|
231
|
-
if class_name[0]
|
224
|
+
if class_name[0] == '^'
|
232
225
|
class_name = class_name[1..-1]
|
233
226
|
type = :nonposixclass
|
234
227
|
end
|
235
228
|
|
229
|
+
unless self.class.posix_classes.include?(class_name)
|
230
|
+
validation_error(:posix_class, text)
|
231
|
+
end
|
232
|
+
|
236
233
|
emit(type, class_name.to_sym, text)
|
237
234
|
};
|
238
235
|
|
239
236
|
# These are not supported in ruby at the moment. Enable them if they are.
|
240
|
-
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error)
|
237
|
+
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
241
238
|
# emit(:set, :collation, copy(data, ts, te))
|
242
239
|
# };
|
243
|
-
# character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error)
|
240
|
+
# character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
244
241
|
# emit(:set, :equivalent, copy(data, ts, te))
|
245
242
|
# };
|
246
243
|
|
@@ -323,7 +320,7 @@
|
|
323
320
|
|
324
321
|
codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
|
325
322
|
text = copy(data, ts-1, te)
|
326
|
-
if text[2]
|
323
|
+
if text[2] == '{'
|
327
324
|
emit(:escape, :codepoint_list, text)
|
328
325
|
else
|
329
326
|
emit(:escape, :codepoint, text)
|
@@ -419,12 +416,12 @@
|
|
419
416
|
|
420
417
|
backslash . anchor_char > (backslashed, 3) {
|
421
418
|
case text = copy(data, ts, te)
|
422
|
-
when '
|
423
|
-
when '
|
424
|
-
when '
|
425
|
-
when '
|
426
|
-
when '
|
427
|
-
when '
|
419
|
+
when '\A'; emit(:anchor, :bos, text)
|
420
|
+
when '\z'; emit(:anchor, :eos, text)
|
421
|
+
when '\Z'; emit(:anchor, :eos_ob_eol, text)
|
422
|
+
when '\b'; emit(:anchor, :word_boundary, text)
|
423
|
+
when '\B'; emit(:anchor, :nonword_boundary, text)
|
424
|
+
when '\G'; emit(:anchor, :match_start, text)
|
428
425
|
end
|
429
426
|
};
|
430
427
|
|
@@ -477,7 +474,7 @@
|
|
477
474
|
group_open . group_options >group_opened {
|
478
475
|
text = copy(data, ts, te)
|
479
476
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
480
|
-
|
477
|
+
validation_error(:group_option, $1 || "-#{$2}", text)
|
481
478
|
end
|
482
479
|
emit_options(text)
|
483
480
|
};
|
@@ -605,7 +602,7 @@
|
|
605
602
|
end
|
606
603
|
};
|
607
604
|
|
608
|
-
quantifier_interval
|
605
|
+
quantifier_interval {
|
609
606
|
emit(:quantifier, :interval, copy(data, ts, te))
|
610
607
|
};
|
611
608
|
|
@@ -686,6 +683,7 @@ class Regexp::Scanner
|
|
686
683
|
end
|
687
684
|
|
688
685
|
# Invalid groupOption. Used for inline options.
|
686
|
+
# TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
|
689
687
|
class InvalidGroupOption < ValidationError
|
690
688
|
def initialize(option, text)
|
691
689
|
super "Invalid group option #{option} in #{text}"
|
@@ -706,6 +704,13 @@ class Regexp::Scanner
|
|
706
704
|
end
|
707
705
|
end
|
708
706
|
|
707
|
+
# The POSIX class name was not recognized by the scanner.
|
708
|
+
class UnknownPosixClassError < ValidationError
|
709
|
+
def initialize(text)
|
710
|
+
super "Unknown POSIX class #{text}"
|
711
|
+
end
|
712
|
+
end
|
713
|
+
|
709
714
|
# Scans the given regular expression text, or Regexp object and collects the
|
710
715
|
# emitted token into an array that gets returned at the end. If a block is
|
711
716
|
# given, it gets called for each emitted token.
|
@@ -771,6 +776,11 @@ class Regexp::Scanner
|
|
771
776
|
File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
|
772
777
|
end
|
773
778
|
|
779
|
+
def self.posix_classes
|
780
|
+
%w[alnum alpha ascii blank cntrl digit graph
|
781
|
+
lower print punct space upper word xdigit]
|
782
|
+
end
|
783
|
+
|
774
784
|
# Emits an array with the details of the scanned pattern
|
775
785
|
def emit(type, token, text)
|
776
786
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
@@ -873,15 +883,16 @@ class Regexp::Scanner
|
|
873
883
|
|
874
884
|
# Centralizes and unifies the handling of validation related
|
875
885
|
# errors.
|
876
|
-
def validation_error(type, what, reason)
|
877
|
-
|
878
|
-
|
879
|
-
|
880
|
-
|
881
|
-
|
882
|
-
|
883
|
-
|
884
|
-
|
886
|
+
def validation_error(type, what, reason = nil)
|
887
|
+
error =
|
888
|
+
case type
|
889
|
+
when :backref then InvalidBackrefError.new(what, reason)
|
890
|
+
when :group then InvalidGroupError.new(what, reason)
|
891
|
+
when :group_option then InvalidGroupOption.new(what, reason)
|
892
|
+
when :posix_class then UnknownPosixClassError.new(what)
|
893
|
+
when :property then UnknownUnicodePropertyError.new(what)
|
894
|
+
when :sequence then InvalidSequenceError.new(what, reason)
|
895
|
+
end
|
885
896
|
|
886
897
|
raise error # unless @@config.validation_ignore
|
887
898
|
end
|