regexp_parser 2.3.1 → 2.4.0
This diff shows the changes between publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +50 -7
- data/README.md +9 -5
- data/lib/regexp_parser/error.rb +1 -1
- data/lib/regexp_parser/expression/base.rb +9 -57
- data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -2
- data/lib/regexp_parser/expression/classes/character_set.rb +2 -2
- data/lib/regexp_parser/expression/classes/conditional.rb +2 -2
- data/lib/regexp_parser/expression/classes/free_space.rb +1 -1
- data/lib/regexp_parser/expression/classes/group.rb +6 -6
- data/lib/regexp_parser/expression/methods/tests.rb +10 -1
- data/lib/regexp_parser/expression/quantifier.rb +40 -23
- data/lib/regexp_parser/expression/sequence.rb +2 -2
- data/lib/regexp_parser/expression/sequence_operation.rb +2 -2
- data/lib/regexp_parser/expression/shared.rb +81 -0
- data/lib/regexp_parser/expression/subexpression.rb +11 -7
- data/lib/regexp_parser/expression.rb +1 -0
- data/lib/regexp_parser/parser.rb +12 -60
- data/lib/regexp_parser/scanner/property.rl +1 -1
- data/lib/regexp_parser/scanner/scanner.rl +42 -31
- data/lib/regexp_parser/scanner.rb +725 -793
- data/lib/regexp_parser/version.rb +1 -1
- metadata +3 -2
data/lib/regexp_parser/parser.rb
CHANGED
@@ -39,6 +39,9 @@ class Regexp::Parser
|
|
39
39
|
parse_token(token)
|
40
40
|
end
|
41
41
|
|
42
|
+
# Trigger recursive setting of #nesting_level, which reflects how deep
|
43
|
+
# a node is in the tree. Do this at the end to account for tree rewrites.
|
44
|
+
root.nesting_level = 0
|
42
45
|
assign_referenced_expressions
|
43
46
|
|
44
47
|
if block_given?
|
@@ -286,17 +289,9 @@ class Regexp::Parser
|
|
286
289
|
def nest(exp)
|
287
290
|
nesting.push(exp)
|
288
291
|
node << exp
|
289
|
-
update_transplanted_subtree(exp, node)
|
290
292
|
self.node = exp
|
291
293
|
end
|
292
294
|
|
293
|
-
# subtrees are transplanted to build Alternations, Intersections, Ranges
|
294
|
-
def update_transplanted_subtree(exp, new_parent)
|
295
|
-
exp.nesting_level = new_parent.nesting_level + 1
|
296
|
-
exp.respond_to?(:each) &&
|
297
|
-
exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
|
298
|
-
end
|
299
|
-
|
300
295
|
def escape(token)
|
301
296
|
case token.token
|
302
297
|
|
@@ -483,7 +478,7 @@ class Regexp::Parser
|
|
483
478
|
new_token = Regexp::Token.new(
|
484
479
|
:group,
|
485
480
|
:passive,
|
486
|
-
'', # text
|
481
|
+
'', # text (none because this group is implicit)
|
487
482
|
target_node.ts,
|
488
483
|
nil, # te (unused)
|
489
484
|
target_node.level,
|
@@ -493,66 +488,23 @@ class Regexp::Parser
|
|
493
488
|
new_group = Group::Passive.new(new_token, active_opts)
|
494
489
|
new_group.implicit = true
|
495
490
|
new_group << target_node
|
496
|
-
|
491
|
+
increase_group_level(target_node)
|
497
492
|
node.expressions[node.expressions.index(target_node)] = new_group
|
498
493
|
target_node = new_group
|
499
494
|
end
|
500
495
|
|
501
|
-
|
502
|
-
|
503
|
-
target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
|
504
|
-
when :zero_or_one_reluctant
|
505
|
-
target_node.quantify(:zero_or_one, token.text, 0, 1, :reluctant)
|
506
|
-
when :zero_or_one_possessive
|
507
|
-
target_node.quantify(:zero_or_one, token.text, 0, 1, :possessive)
|
508
|
-
|
509
|
-
when :zero_or_more
|
510
|
-
target_node.quantify(:zero_or_more, token.text, 0, -1, :greedy)
|
511
|
-
when :zero_or_more_reluctant
|
512
|
-
target_node.quantify(:zero_or_more, token.text, 0, -1, :reluctant)
|
513
|
-
when :zero_or_more_possessive
|
514
|
-
target_node.quantify(:zero_or_more, token.text, 0, -1, :possessive)
|
515
|
-
|
516
|
-
when :one_or_more
|
517
|
-
target_node.quantify(:one_or_more, token.text, 1, -1, :greedy)
|
518
|
-
when :one_or_more_reluctant
|
519
|
-
target_node.quantify(:one_or_more, token.text, 1, -1, :reluctant)
|
520
|
-
when :one_or_more_possessive
|
521
|
-
target_node.quantify(:one_or_more, token.text, 1, -1, :possessive)
|
522
|
-
|
523
|
-
when :interval
|
524
|
-
interval(target_node, token)
|
525
|
-
|
526
|
-
else
|
496
|
+
unless token.token =~ /\A(?:zero_or_one|zero_or_more|one_or_more|interval)
|
497
|
+
(?:_greedy|_reluctant|_possessive)?\z/x
|
527
498
|
raise UnknownTokenError.new('Quantifier', token)
|
528
499
|
end
|
500
|
+
|
501
|
+
target_node.quantify(token, active_opts)
|
529
502
|
end
|
530
503
|
|
531
|
-
def
|
504
|
+
def increase_group_level(exp)
|
532
505
|
exp.level += 1
|
533
|
-
exp.
|
534
|
-
|
535
|
-
|
536
|
-
def interval(target_node, token)
|
537
|
-
text = token.text
|
538
|
-
mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
|
539
|
-
case mchr
|
540
|
-
when '?'
|
541
|
-
range_text = text[0...-1]
|
542
|
-
mode = :reluctant
|
543
|
-
when '+'
|
544
|
-
range_text = text[0...-1]
|
545
|
-
mode = :possessive
|
546
|
-
else
|
547
|
-
range_text = text
|
548
|
-
mode = :greedy
|
549
|
-
end
|
550
|
-
|
551
|
-
range = range_text.gsub(/\{|\}/, '').split(',', 2)
|
552
|
-
min = range[0].empty? ? 0 : range[0]
|
553
|
-
max = range[1] ? (range[1].empty? ? -1 : range[1]) : min
|
554
|
-
|
555
|
-
target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
|
506
|
+
exp.quantifier.level += 1 if exp.quantifier
|
507
|
+
exp.terminal? || exp.each { |subexp| increase_group_level(subexp) }
|
556
508
|
end
|
557
509
|
|
558
510
|
def set(token)
|
data/lib/regexp_parser/scanner/property.rl
CHANGED
@@ -20,7 +20,7 @@
|
|
20
20
|
name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
|
21
21
|
|
22
22
|
token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
|
23
|
-
|
23
|
+
validation_error(:property, name) unless token
|
24
24
|
|
25
25
|
self.emit(type, token.to_sym, text)
|
26
26
|
|
data/lib/regexp_parser/scanner/scanner.rl
CHANGED
@@ -28,13 +28,7 @@
|
|
28
28
|
|
29
29
|
comment = ('#' . [^\n]* . '\n'?);
|
30
30
|
|
31
|
-
|
32
|
-
'cntrl' | 'digit' | 'graph' |
|
33
|
-
'lower' | 'print' | 'punct' |
|
34
|
-
'space' | 'upper' | 'xdigit' |
|
35
|
-
'word' | 'ascii';
|
36
|
-
|
37
|
-
class_posix = ('[:' . '^'? . class_name_posix . ':]');
|
31
|
+
class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
|
38
32
|
|
39
33
|
|
40
34
|
# these are not supported in ruby at the moment
|
@@ -74,8 +68,7 @@
|
|
74
68
|
quantity_maximum = ',' . (digit+);
|
75
69
|
quantity_range = (digit+) . ',' . (digit+);
|
76
70
|
quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
|
77
|
-
quantity_maximum | quantity_range ) . range_close
|
78
|
-
quantifier_mode?;
|
71
|
+
quantity_maximum | quantity_range ) . range_close;
|
79
72
|
|
80
73
|
quantifiers = quantifier_greedy | quantifier_reluctant |
|
81
74
|
quantifier_possessive | quantifier_interval;
|
@@ -223,24 +216,28 @@
|
|
223
216
|
fcall character_set;
|
224
217
|
};
|
225
218
|
|
226
|
-
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error)
|
219
|
+
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
227
220
|
text = copy(data, ts, te)
|
228
221
|
|
229
222
|
type = :posixclass
|
230
223
|
class_name = text[2..-3]
|
231
|
-
if class_name[0]
|
224
|
+
if class_name[0] == '^'
|
232
225
|
class_name = class_name[1..-1]
|
233
226
|
type = :nonposixclass
|
234
227
|
end
|
235
228
|
|
229
|
+
unless self.class.posix_classes.include?(class_name)
|
230
|
+
validation_error(:posix_class, text)
|
231
|
+
end
|
232
|
+
|
236
233
|
emit(type, class_name.to_sym, text)
|
237
234
|
};
|
238
235
|
|
239
236
|
# These are not supported in ruby at the moment. Enable them if they are.
|
240
|
-
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error)
|
237
|
+
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
241
238
|
# emit(:set, :collation, copy(data, ts, te))
|
242
239
|
# };
|
243
|
-
# character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error)
|
240
|
+
# character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
244
241
|
# emit(:set, :equivalent, copy(data, ts, te))
|
245
242
|
# };
|
246
243
|
|
@@ -323,7 +320,7 @@
|
|
323
320
|
|
324
321
|
codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
|
325
322
|
text = copy(data, ts-1, te)
|
326
|
-
if text[2]
|
323
|
+
if text[2] == '{'
|
327
324
|
emit(:escape, :codepoint_list, text)
|
328
325
|
else
|
329
326
|
emit(:escape, :codepoint, text)
|
@@ -419,12 +416,12 @@
|
|
419
416
|
|
420
417
|
backslash . anchor_char > (backslashed, 3) {
|
421
418
|
case text = copy(data, ts, te)
|
422
|
-
when '
|
423
|
-
when '
|
424
|
-
when '
|
425
|
-
when '
|
426
|
-
when '
|
427
|
-
when '
|
419
|
+
when '\A'; emit(:anchor, :bos, text)
|
420
|
+
when '\z'; emit(:anchor, :eos, text)
|
421
|
+
when '\Z'; emit(:anchor, :eos_ob_eol, text)
|
422
|
+
when '\b'; emit(:anchor, :word_boundary, text)
|
423
|
+
when '\B'; emit(:anchor, :nonword_boundary, text)
|
424
|
+
when '\G'; emit(:anchor, :match_start, text)
|
428
425
|
end
|
429
426
|
};
|
430
427
|
|
@@ -477,7 +474,7 @@
|
|
477
474
|
group_open . group_options >group_opened {
|
478
475
|
text = copy(data, ts, te)
|
479
476
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
480
|
-
|
477
|
+
validation_error(:group_option, $1 || "-#{$2}", text)
|
481
478
|
end
|
482
479
|
emit_options(text)
|
483
480
|
};
|
@@ -605,7 +602,7 @@
|
|
605
602
|
end
|
606
603
|
};
|
607
604
|
|
608
|
-
quantifier_interval
|
605
|
+
quantifier_interval {
|
609
606
|
emit(:quantifier, :interval, copy(data, ts, te))
|
610
607
|
};
|
611
608
|
|
@@ -686,6 +683,7 @@ class Regexp::Scanner
|
|
686
683
|
end
|
687
684
|
|
688
685
|
# Invalid groupOption. Used for inline options.
|
686
|
+
# TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
|
689
687
|
class InvalidGroupOption < ValidationError
|
690
688
|
def initialize(option, text)
|
691
689
|
super "Invalid group option #{option} in #{text}"
|
@@ -706,6 +704,13 @@ class Regexp::Scanner
|
|
706
704
|
end
|
707
705
|
end
|
708
706
|
|
707
|
+
# The POSIX class name was not recognized by the scanner.
|
708
|
+
class UnknownPosixClassError < ValidationError
|
709
|
+
def initialize(text)
|
710
|
+
super "Unknown POSIX class #{text}"
|
711
|
+
end
|
712
|
+
end
|
713
|
+
|
709
714
|
# Scans the given regular expression text, or Regexp object and collects the
|
710
715
|
# emitted token into an array that gets returned at the end. If a block is
|
711
716
|
# given, it gets called for each emitted token.
|
@@ -771,6 +776,11 @@ class Regexp::Scanner
|
|
771
776
|
File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
|
772
777
|
end
|
773
778
|
|
779
|
+
def self.posix_classes
|
780
|
+
%w[alnum alpha ascii blank cntrl digit graph
|
781
|
+
lower print punct space upper word xdigit]
|
782
|
+
end
|
783
|
+
|
774
784
|
# Emits an array with the details of the scanned pattern
|
775
785
|
def emit(type, token, text)
|
776
786
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
@@ -873,15 +883,16 @@ class Regexp::Scanner
|
|
873
883
|
|
874
884
|
# Centralizes and unifies the handling of validation related
|
875
885
|
# errors.
|
876
|
-
def validation_error(type, what, reason)
|
877
|
-
|
878
|
-
|
879
|
-
|
880
|
-
|
881
|
-
|
882
|
-
|
883
|
-
|
884
|
-
|
886
|
+
def validation_error(type, what, reason = nil)
|
887
|
+
error =
|
888
|
+
case type
|
889
|
+
when :backref then InvalidBackrefError.new(what, reason)
|
890
|
+
when :group then InvalidGroupError.new(what, reason)
|
891
|
+
when :group_option then InvalidGroupOption.new(what, reason)
|
892
|
+
when :posix_class then UnknownPosixClassError.new(what)
|
893
|
+
when :property then UnknownUnicodePropertyError.new(what)
|
894
|
+
when :sequence then InvalidSequenceError.new(what, reason)
|
895
|
+
end
|
885
896
|
|
886
897
|
raise error # unless @@config.validation_ignore
|
887
898
|
end
|