regexp_parser 2.3.1 → 2.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +50 -7
- data/README.md +9 -5
- data/lib/regexp_parser/error.rb +1 -1
- data/lib/regexp_parser/expression/base.rb +9 -57
- data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -2
- data/lib/regexp_parser/expression/classes/character_set.rb +2 -2
- data/lib/regexp_parser/expression/classes/conditional.rb +2 -2
- data/lib/regexp_parser/expression/classes/free_space.rb +1 -1
- data/lib/regexp_parser/expression/classes/group.rb +6 -6
- data/lib/regexp_parser/expression/methods/tests.rb +10 -1
- data/lib/regexp_parser/expression/quantifier.rb +40 -23
- data/lib/regexp_parser/expression/sequence.rb +2 -2
- data/lib/regexp_parser/expression/sequence_operation.rb +2 -2
- data/lib/regexp_parser/expression/shared.rb +81 -0
- data/lib/regexp_parser/expression/subexpression.rb +11 -7
- data/lib/regexp_parser/expression.rb +1 -0
- data/lib/regexp_parser/parser.rb +12 -60
- data/lib/regexp_parser/scanner/property.rl +1 -1
- data/lib/regexp_parser/scanner/scanner.rl +42 -31
- data/lib/regexp_parser/scanner.rb +725 -793
- data/lib/regexp_parser/version.rb +1 -1
- metadata +3 -2
data/lib/regexp_parser/parser.rb
CHANGED
@@ -39,6 +39,9 @@ class Regexp::Parser
|
|
39
39
|
parse_token(token)
|
40
40
|
end
|
41
41
|
|
42
|
+
# Trigger recursive setting of #nesting_level, which reflects how deep
|
43
|
+
# a node is in the tree. Do this at the end to account for tree rewrites.
|
44
|
+
root.nesting_level = 0
|
42
45
|
assign_referenced_expressions
|
43
46
|
|
44
47
|
if block_given?
|
@@ -286,17 +289,9 @@ class Regexp::Parser
|
|
286
289
|
def nest(exp)
|
287
290
|
nesting.push(exp)
|
288
291
|
node << exp
|
289
|
-
update_transplanted_subtree(exp, node)
|
290
292
|
self.node = exp
|
291
293
|
end
|
292
294
|
|
293
|
-
# subtrees are transplanted to build Alternations, Intersections, Ranges
|
294
|
-
def update_transplanted_subtree(exp, new_parent)
|
295
|
-
exp.nesting_level = new_parent.nesting_level + 1
|
296
|
-
exp.respond_to?(:each) &&
|
297
|
-
exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
|
298
|
-
end
|
299
|
-
|
300
295
|
def escape(token)
|
301
296
|
case token.token
|
302
297
|
|
@@ -483,7 +478,7 @@ class Regexp::Parser
|
|
483
478
|
new_token = Regexp::Token.new(
|
484
479
|
:group,
|
485
480
|
:passive,
|
486
|
-
'', # text
|
481
|
+
'', # text (none because this group is implicit)
|
487
482
|
target_node.ts,
|
488
483
|
nil, # te (unused)
|
489
484
|
target_node.level,
|
@@ -493,66 +488,23 @@ class Regexp::Parser
|
|
493
488
|
new_group = Group::Passive.new(new_token, active_opts)
|
494
489
|
new_group.implicit = true
|
495
490
|
new_group << target_node
|
496
|
-
|
491
|
+
increase_group_level(target_node)
|
497
492
|
node.expressions[node.expressions.index(target_node)] = new_group
|
498
493
|
target_node = new_group
|
499
494
|
end
|
500
495
|
|
501
|
-
|
502
|
-
|
503
|
-
target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
|
504
|
-
when :zero_or_one_reluctant
|
505
|
-
target_node.quantify(:zero_or_one, token.text, 0, 1, :reluctant)
|
506
|
-
when :zero_or_one_possessive
|
507
|
-
target_node.quantify(:zero_or_one, token.text, 0, 1, :possessive)
|
508
|
-
|
509
|
-
when :zero_or_more
|
510
|
-
target_node.quantify(:zero_or_more, token.text, 0, -1, :greedy)
|
511
|
-
when :zero_or_more_reluctant
|
512
|
-
target_node.quantify(:zero_or_more, token.text, 0, -1, :reluctant)
|
513
|
-
when :zero_or_more_possessive
|
514
|
-
target_node.quantify(:zero_or_more, token.text, 0, -1, :possessive)
|
515
|
-
|
516
|
-
when :one_or_more
|
517
|
-
target_node.quantify(:one_or_more, token.text, 1, -1, :greedy)
|
518
|
-
when :one_or_more_reluctant
|
519
|
-
target_node.quantify(:one_or_more, token.text, 1, -1, :reluctant)
|
520
|
-
when :one_or_more_possessive
|
521
|
-
target_node.quantify(:one_or_more, token.text, 1, -1, :possessive)
|
522
|
-
|
523
|
-
when :interval
|
524
|
-
interval(target_node, token)
|
525
|
-
|
526
|
-
else
|
496
|
+
unless token.token =~ /\A(?:zero_or_one|zero_or_more|one_or_more|interval)
|
497
|
+
(?:_greedy|_reluctant|_possessive)?\z/x
|
527
498
|
raise UnknownTokenError.new('Quantifier', token)
|
528
499
|
end
|
500
|
+
|
501
|
+
target_node.quantify(token, active_opts)
|
529
502
|
end
|
530
503
|
|
531
|
-
def
|
504
|
+
def increase_group_level(exp)
|
532
505
|
exp.level += 1
|
533
|
-
exp.
|
534
|
-
|
535
|
-
|
536
|
-
def interval(target_node, token)
|
537
|
-
text = token.text
|
538
|
-
mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
|
539
|
-
case mchr
|
540
|
-
when '?'
|
541
|
-
range_text = text[0...-1]
|
542
|
-
mode = :reluctant
|
543
|
-
when '+'
|
544
|
-
range_text = text[0...-1]
|
545
|
-
mode = :possessive
|
546
|
-
else
|
547
|
-
range_text = text
|
548
|
-
mode = :greedy
|
549
|
-
end
|
550
|
-
|
551
|
-
range = range_text.gsub(/\{|\}/, '').split(',', 2)
|
552
|
-
min = range[0].empty? ? 0 : range[0]
|
553
|
-
max = range[1] ? (range[1].empty? ? -1 : range[1]) : min
|
554
|
-
|
555
|
-
target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
|
506
|
+
exp.quantifier.level += 1 if exp.quantifier
|
507
|
+
exp.terminal? || exp.each { |subexp| increase_group_level(subexp) }
|
556
508
|
end
|
557
509
|
|
558
510
|
def set(token)
|
@@ -20,7 +20,7 @@
|
|
20
20
|
name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
|
21
21
|
|
22
22
|
token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
|
23
|
-
|
23
|
+
validation_error(:property, name) unless token
|
24
24
|
|
25
25
|
self.emit(type, token.to_sym, text)
|
26
26
|
|
@@ -28,13 +28,7 @@
|
|
28
28
|
|
29
29
|
comment = ('#' . [^\n]* . '\n'?);
|
30
30
|
|
31
|
-
|
32
|
-
'cntrl' | 'digit' | 'graph' |
|
33
|
-
'lower' | 'print' | 'punct' |
|
34
|
-
'space' | 'upper' | 'xdigit' |
|
35
|
-
'word' | 'ascii';
|
36
|
-
|
37
|
-
class_posix = ('[:' . '^'? . class_name_posix . ':]');
|
31
|
+
class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
|
38
32
|
|
39
33
|
|
40
34
|
# these are not supported in ruby at the moment
|
@@ -74,8 +68,7 @@
|
|
74
68
|
quantity_maximum = ',' . (digit+);
|
75
69
|
quantity_range = (digit+) . ',' . (digit+);
|
76
70
|
quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
|
77
|
-
quantity_maximum | quantity_range ) . range_close
|
78
|
-
quantifier_mode?;
|
71
|
+
quantity_maximum | quantity_range ) . range_close;
|
79
72
|
|
80
73
|
quantifiers = quantifier_greedy | quantifier_reluctant |
|
81
74
|
quantifier_possessive | quantifier_interval;
|
@@ -223,24 +216,28 @@
|
|
223
216
|
fcall character_set;
|
224
217
|
};
|
225
218
|
|
226
|
-
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error)
|
219
|
+
class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
227
220
|
text = copy(data, ts, te)
|
228
221
|
|
229
222
|
type = :posixclass
|
230
223
|
class_name = text[2..-3]
|
231
|
-
if class_name[0]
|
224
|
+
if class_name[0] == '^'
|
232
225
|
class_name = class_name[1..-1]
|
233
226
|
type = :nonposixclass
|
234
227
|
end
|
235
228
|
|
229
|
+
unless self.class.posix_classes.include?(class_name)
|
230
|
+
validation_error(:posix_class, text)
|
231
|
+
end
|
232
|
+
|
236
233
|
emit(type, class_name.to_sym, text)
|
237
234
|
};
|
238
235
|
|
239
236
|
# These are not supported in ruby at the moment. Enable them if they are.
|
240
|
-
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error)
|
237
|
+
# collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
241
238
|
# emit(:set, :collation, copy(data, ts, te))
|
242
239
|
# };
|
243
|
-
# character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error)
|
240
|
+
# character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
|
244
241
|
# emit(:set, :equivalent, copy(data, ts, te))
|
245
242
|
# };
|
246
243
|
|
@@ -323,7 +320,7 @@
|
|
323
320
|
|
324
321
|
codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
|
325
322
|
text = copy(data, ts-1, te)
|
326
|
-
if text[2]
|
323
|
+
if text[2] == '{'
|
327
324
|
emit(:escape, :codepoint_list, text)
|
328
325
|
else
|
329
326
|
emit(:escape, :codepoint, text)
|
@@ -419,12 +416,12 @@
|
|
419
416
|
|
420
417
|
backslash . anchor_char > (backslashed, 3) {
|
421
418
|
case text = copy(data, ts, te)
|
422
|
-
when '
|
423
|
-
when '
|
424
|
-
when '
|
425
|
-
when '
|
426
|
-
when '
|
427
|
-
when '
|
419
|
+
when '\A'; emit(:anchor, :bos, text)
|
420
|
+
when '\z'; emit(:anchor, :eos, text)
|
421
|
+
when '\Z'; emit(:anchor, :eos_ob_eol, text)
|
422
|
+
when '\b'; emit(:anchor, :word_boundary, text)
|
423
|
+
when '\B'; emit(:anchor, :nonword_boundary, text)
|
424
|
+
when '\G'; emit(:anchor, :match_start, text)
|
428
425
|
end
|
429
426
|
};
|
430
427
|
|
@@ -477,7 +474,7 @@
|
|
477
474
|
group_open . group_options >group_opened {
|
478
475
|
text = copy(data, ts, te)
|
479
476
|
if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
|
480
|
-
|
477
|
+
validation_error(:group_option, $1 || "-#{$2}", text)
|
481
478
|
end
|
482
479
|
emit_options(text)
|
483
480
|
};
|
@@ -605,7 +602,7 @@
|
|
605
602
|
end
|
606
603
|
};
|
607
604
|
|
608
|
-
quantifier_interval
|
605
|
+
quantifier_interval {
|
609
606
|
emit(:quantifier, :interval, copy(data, ts, te))
|
610
607
|
};
|
611
608
|
|
@@ -686,6 +683,7 @@ class Regexp::Scanner
|
|
686
683
|
end
|
687
684
|
|
688
685
|
# Invalid groupOption. Used for inline options.
|
686
|
+
# TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
|
689
687
|
class InvalidGroupOption < ValidationError
|
690
688
|
def initialize(option, text)
|
691
689
|
super "Invalid group option #{option} in #{text}"
|
@@ -706,6 +704,13 @@ class Regexp::Scanner
|
|
706
704
|
end
|
707
705
|
end
|
708
706
|
|
707
|
+
# The POSIX class name was not recognized by the scanner.
|
708
|
+
class UnknownPosixClassError < ValidationError
|
709
|
+
def initialize(text)
|
710
|
+
super "Unknown POSIX class #{text}"
|
711
|
+
end
|
712
|
+
end
|
713
|
+
|
709
714
|
# Scans the given regular expression text, or Regexp object and collects the
|
710
715
|
# emitted token into an array that gets returned at the end. If a block is
|
711
716
|
# given, it gets called for each emitted token.
|
@@ -771,6 +776,11 @@ class Regexp::Scanner
|
|
771
776
|
File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
|
772
777
|
end
|
773
778
|
|
779
|
+
def self.posix_classes
|
780
|
+
%w[alnum alpha ascii blank cntrl digit graph
|
781
|
+
lower print punct space upper word xdigit]
|
782
|
+
end
|
783
|
+
|
774
784
|
# Emits an array with the details of the scanned pattern
|
775
785
|
def emit(type, token, text)
|
776
786
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
@@ -873,15 +883,16 @@ class Regexp::Scanner
|
|
873
883
|
|
874
884
|
# Centralizes and unifies the handling of validation related
|
875
885
|
# errors.
|
876
|
-
def validation_error(type, what, reason)
|
877
|
-
|
878
|
-
|
879
|
-
|
880
|
-
|
881
|
-
|
882
|
-
|
883
|
-
|
884
|
-
|
886
|
+
def validation_error(type, what, reason = nil)
|
887
|
+
error =
|
888
|
+
case type
|
889
|
+
when :backref then InvalidBackrefError.new(what, reason)
|
890
|
+
when :group then InvalidGroupError.new(what, reason)
|
891
|
+
when :group_option then InvalidGroupOption.new(what, reason)
|
892
|
+
when :posix_class then UnknownPosixClassError.new(what)
|
893
|
+
when :property then UnknownUnicodePropertyError.new(what)
|
894
|
+
when :sequence then InvalidSequenceError.new(what, reason)
|
895
|
+
end
|
885
896
|
|
886
897
|
raise error # unless @@config.validation_ignore
|
887
898
|
end
|