regexp_parser 0.5.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +242 -0
- data/Gemfile +1 -0
- data/README.md +21 -17
- data/Rakefile +31 -0
- data/lib/regexp_parser/expression.rb +11 -9
- data/lib/regexp_parser/expression/classes/alternation.rb +5 -28
- data/lib/regexp_parser/expression/classes/backref.rb +21 -16
- data/lib/regexp_parser/expression/classes/escape.rb +81 -10
- data/lib/regexp_parser/expression/classes/group.rb +20 -20
- data/lib/regexp_parser/expression/classes/{character_class.rb → posix_class.rb} +2 -2
- data/lib/regexp_parser/expression/classes/property.rb +6 -0
- data/lib/regexp_parser/expression/classes/set.rb +10 -93
- data/lib/regexp_parser/expression/classes/set/intersection.rb +9 -0
- data/lib/regexp_parser/expression/classes/set/range.rb +23 -0
- data/lib/regexp_parser/expression/methods/strfregexp.rb +6 -4
- data/lib/regexp_parser/expression/methods/tests.rb +4 -14
- data/lib/regexp_parser/expression/methods/traverse.rb +1 -1
- data/lib/regexp_parser/expression/quantifier.rb +3 -4
- data/lib/regexp_parser/expression/sequence_operation.rb +34 -0
- data/lib/regexp_parser/expression/subexpression.rb +6 -10
- data/lib/regexp_parser/lexer.rb +13 -17
- data/lib/regexp_parser/parser.rb +170 -116
- data/lib/regexp_parser/scanner.rb +952 -2431
- data/lib/regexp_parser/scanner/char_type.rl +31 -0
- data/lib/regexp_parser/scanner/properties/long.yml +561 -0
- data/lib/regexp_parser/scanner/properties/short.yml +225 -0
- data/lib/regexp_parser/scanner/property.rl +7 -806
- data/lib/regexp_parser/scanner/scanner.rl +112 -154
- data/lib/regexp_parser/syntax/base.rb +4 -4
- data/lib/regexp_parser/syntax/tokens.rb +1 -0
- data/lib/regexp_parser/syntax/tokens/backref.rb +2 -2
- data/lib/regexp_parser/syntax/tokens/character_set.rb +3 -38
- data/lib/regexp_parser/syntax/tokens/escape.rb +2 -3
- data/lib/regexp_parser/syntax/tokens/group.rb +5 -4
- data/lib/regexp_parser/syntax/tokens/{character_class.rb → posix_class.rb} +5 -5
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +519 -266
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -4
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +4 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +0 -2
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- data/regexp_parser.gemspec +2 -1
- data/test/expression/test_base.rb +2 -1
- data/test/expression/test_clone.rb +0 -57
- data/test/expression/test_set.rb +31 -8
- data/test/expression/test_strfregexp.rb +13 -4
- data/test/expression/test_subexpression.rb +25 -0
- data/test/expression/test_traverse.rb +25 -25
- data/test/helpers.rb +1 -0
- data/test/lexer/test_all.rb +1 -1
- data/test/lexer/test_conditionals.rb +9 -7
- data/test/lexer/test_nesting.rb +39 -21
- data/test/lexer/test_refcalls.rb +4 -4
- data/test/parser/set/test_intersections.rb +127 -0
- data/test/parser/set/test_ranges.rb +111 -0
- data/test/parser/test_all.rb +4 -1
- data/test/parser/test_escapes.rb +41 -9
- data/test/parser/test_groups.rb +22 -3
- data/test/parser/test_posix_classes.rb +27 -0
- data/test/parser/test_properties.rb +17 -290
- data/test/parser/test_refcalls.rb +66 -26
- data/test/parser/test_sets.rb +132 -129
- data/test/scanner/test_all.rb +1 -7
- data/test/scanner/test_conditionals.rb +16 -16
- data/test/scanner/test_errors.rb +0 -30
- data/test/scanner/test_escapes.rb +1 -2
- data/test/scanner/test_free_space.rb +28 -28
- data/test/scanner/test_groups.rb +35 -35
- data/test/scanner/test_meta.rb +1 -1
- data/test/scanner/test_properties.rb +87 -114
- data/test/scanner/test_refcalls.rb +18 -18
- data/test/scanner/test_scripts.rb +19 -351
- data/test/scanner/test_sets.rb +87 -60
- data/test/scanner/test_unicode_blocks.rb +4 -105
- data/test/support/warning_extractor.rb +1 -1
- data/test/syntax/test_syntax.rb +7 -0
- data/test/syntax/versions/test_1.8.rb +2 -4
- metadata +17 -7
- data/ChangeLog +0 -325
- data/test/scanner/test_emojis.rb +0 -31
@@ -1,8 +1,16 @@
|
|
1
1
|
module Regexp::Expression
|
2
|
-
|
3
2
|
module Backreference
|
4
3
|
class Base < Regexp::Expression::Base; end
|
5
4
|
|
5
|
+
class Number < Backreference::Base
|
6
|
+
attr_reader :number
|
7
|
+
|
8
|
+
def initialize(token, options = {})
|
9
|
+
@number = token.text[token.token.equal?(:number) ? 1..-1 : 3..-2].to_i
|
10
|
+
super
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
6
14
|
class Name < Backreference::Base
|
7
15
|
attr_reader :name
|
8
16
|
|
@@ -12,31 +20,28 @@ module Regexp::Expression
|
|
12
20
|
end
|
13
21
|
end
|
14
22
|
|
15
|
-
class
|
16
|
-
|
23
|
+
class NumberCall < Backreference::Number; end
|
24
|
+
class NumberRelative < Backreference::Number; end
|
25
|
+
class NumberCallRelative < Backreference::Number; end
|
26
|
+
class NameCall < Backreference::Name; end
|
27
|
+
|
28
|
+
class NumberRecursionLevel < Backreference::Base
|
29
|
+
attr_reader :number, :recursion_level
|
17
30
|
|
18
31
|
def initialize(token, options = {})
|
19
|
-
@number = token.text[
|
32
|
+
@number, @recursion_level = token.text[3..-2].split(/(?=[+-])/).map(&:to_i)
|
20
33
|
super
|
21
34
|
end
|
22
35
|
end
|
23
36
|
|
24
|
-
class
|
25
|
-
|
26
|
-
class NameNestLevel < Backreference::Base; end
|
27
|
-
class NumberNestLevel < Backreference::Base; end
|
28
|
-
|
29
|
-
class NameCall < Backreference::Base
|
30
|
-
attr_reader :name
|
37
|
+
class NameRecursionLevel < Backreference::Base
|
38
|
+
attr_reader :name, :recursion_level
|
31
39
|
|
32
40
|
def initialize(token, options = {})
|
33
|
-
@name = token.text[3..-2]
|
41
|
+
@name, recursion_level = token.text[3..-2].split(/(?=[+-])/)
|
42
|
+
@recursion_level = recursion_level.to_i
|
34
43
|
super
|
35
44
|
end
|
36
45
|
end
|
37
|
-
|
38
|
-
class NumberCall < Backreference::Base; end
|
39
|
-
class NumberCallRelative < Backreference::Base; end
|
40
46
|
end
|
41
|
-
|
42
47
|
end
|
@@ -1,9 +1,23 @@
|
|
1
1
|
module Regexp::Expression
|
2
|
-
|
3
2
|
module EscapeSequence
|
4
|
-
class Base
|
3
|
+
class Base < Regexp::Expression::Base
|
4
|
+
require 'yaml'
|
5
|
+
|
6
|
+
def char
|
7
|
+
# poor man's unescape without using eval
|
8
|
+
YAML.load(%Q(---\n"#{text}"\n))
|
9
|
+
end
|
5
10
|
|
6
|
-
|
11
|
+
def codepoint
|
12
|
+
char.ord
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
class Literal < EscapeSequence::Base
|
17
|
+
def char
|
18
|
+
text[1..-1]
|
19
|
+
end
|
20
|
+
end
|
7
21
|
|
8
22
|
class AsciiEscape < EscapeSequence::Base; end
|
9
23
|
class Backspace < EscapeSequence::Base; end
|
@@ -11,17 +25,74 @@ module Regexp::Expression
|
|
11
25
|
class FormFeed < EscapeSequence::Base; end
|
12
26
|
class Newline < EscapeSequence::Base; end
|
13
27
|
class Return < EscapeSequence::Base; end
|
14
|
-
class Space < EscapeSequence::Base; end
|
15
28
|
class Tab < EscapeSequence::Base; end
|
16
29
|
class VerticalTab < EscapeSequence::Base; end
|
17
30
|
|
18
|
-
class Octal < EscapeSequence::Base; end
|
19
31
|
class Hex < EscapeSequence::Base; end
|
20
|
-
class
|
32
|
+
class Codepoint < EscapeSequence::Base; end
|
21
33
|
|
22
|
-
class
|
23
|
-
|
24
|
-
|
25
|
-
|
34
|
+
class CodepointList < EscapeSequence::Base
|
35
|
+
def char
|
36
|
+
raise NoMethodError, 'CodepointList responds only to #chars'
|
37
|
+
end
|
38
|
+
|
39
|
+
def codepoint
|
40
|
+
raise NoMethodError, 'CodepointList responds only to #codepoints'
|
41
|
+
end
|
42
|
+
|
43
|
+
def chars
|
44
|
+
codepoints.map { |cp| cp.chr('utf-8') }
|
45
|
+
end
|
46
|
+
|
47
|
+
def codepoints
|
48
|
+
text.scan(/\h+/).map(&:hex)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
class Octal < EscapeSequence::Base
|
53
|
+
def char
|
54
|
+
text[1..-1].to_i(8).chr('utf-8')
|
55
|
+
end
|
56
|
+
end
|
26
57
|
|
58
|
+
class AbstractMetaControlSequence < EscapeSequence::Base
|
59
|
+
def char
|
60
|
+
codepoint.chr('utf-8')
|
61
|
+
end
|
62
|
+
|
63
|
+
def codepoint
|
64
|
+
raise NotImplementedError, 'implement in subclass'
|
65
|
+
end
|
66
|
+
|
67
|
+
private
|
68
|
+
|
69
|
+
def control_sequence_to_s(control_sequence)
|
70
|
+
five_lsb = control_sequence.unpack('B*').first[-5..-1]
|
71
|
+
["000#{five_lsb}"].pack('B*')
|
72
|
+
end
|
73
|
+
|
74
|
+
def meta_char_to_codepoint(meta_char)
|
75
|
+
byte_value = meta_char.ord
|
76
|
+
byte_value < 128 ? byte_value + 128 : byte_value
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
class Control < AbstractMetaControlSequence
|
81
|
+
def codepoint
|
82
|
+
control_sequence_to_s(text).ord
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
class Meta < AbstractMetaControlSequence
|
87
|
+
def codepoint
|
88
|
+
meta_char_to_codepoint(text[-1])
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
class MetaControl < AbstractMetaControlSequence
|
93
|
+
def codepoint
|
94
|
+
meta_char_to_codepoint(control_sequence_to_s(text))
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end
|
27
98
|
end
|
@@ -1,27 +1,27 @@
|
|
1
1
|
module Regexp::Expression
|
2
|
-
|
3
2
|
module Group
|
4
3
|
class Base < Regexp::Expression::Subexpression
|
5
|
-
def capturing?
|
6
|
-
[:capture, :named].include?(token)
|
7
|
-
end
|
8
|
-
|
9
|
-
def comment?
|
10
|
-
type == :comment
|
11
|
-
end
|
12
|
-
|
13
4
|
def to_s(format = :full)
|
14
5
|
"#{text}#{expressions.join})#{quantifier_affix(format)}"
|
15
6
|
end
|
7
|
+
|
8
|
+
def capturing?; false end
|
9
|
+
|
10
|
+
def comment?; false end
|
16
11
|
end
|
17
12
|
|
18
|
-
class Atomic
|
19
|
-
class
|
20
|
-
class
|
21
|
-
class
|
22
|
-
class Absence < Group::Base; end
|
13
|
+
class Atomic < Group::Base; end
|
14
|
+
class Passive < Group::Base; end
|
15
|
+
class Options < Group::Base; end
|
16
|
+
class Absence < Group::Base; end
|
23
17
|
|
24
|
-
class
|
18
|
+
class Capture < Group::Base
|
19
|
+
attr_accessor :number, :number_at_level
|
20
|
+
|
21
|
+
def capturing?; true end
|
22
|
+
end
|
23
|
+
|
24
|
+
class Named < Group::Capture
|
25
25
|
attr_reader :name
|
26
26
|
|
27
27
|
def initialize(token, options = {})
|
@@ -29,10 +29,9 @@ module Regexp::Expression
|
|
29
29
|
super
|
30
30
|
end
|
31
31
|
|
32
|
-
def
|
33
|
-
|
34
|
-
|
35
|
-
copy
|
32
|
+
def initialize_clone(other)
|
33
|
+
other.instance_variable_set(:@name, name.dup)
|
34
|
+
super
|
36
35
|
end
|
37
36
|
end
|
38
37
|
|
@@ -40,6 +39,8 @@ module Regexp::Expression
|
|
40
39
|
def to_s(_format = :full)
|
41
40
|
text.dup
|
42
41
|
end
|
42
|
+
|
43
|
+
def comment?; true end
|
43
44
|
end
|
44
45
|
end
|
45
46
|
|
@@ -52,5 +53,4 @@ module Regexp::Expression
|
|
52
53
|
class Lookbehind < Assertion::Base; end
|
53
54
|
class NegativeLookbehind < Assertion::Base; end
|
54
55
|
end
|
55
|
-
|
56
56
|
end
|
@@ -9,6 +9,10 @@ module Regexp::Expression
|
|
9
9
|
def name
|
10
10
|
text =~ /\A\\[pP]\{([^}]+)\}\z/; $1
|
11
11
|
end
|
12
|
+
|
13
|
+
def shortcut
|
14
|
+
(Regexp::Scanner.short_prop_map.rassoc(token.to_s) || []).first
|
15
|
+
end
|
12
16
|
end
|
13
17
|
|
14
18
|
class Alnum < Base; end
|
@@ -36,6 +40,7 @@ module Regexp::Expression
|
|
36
40
|
class Base < UnicodeProperty::Base; end
|
37
41
|
|
38
42
|
class Any < Letter::Base; end
|
43
|
+
class Cased < Letter::Base; end
|
39
44
|
class Uppercase < Letter::Base; end
|
40
45
|
class Lowercase < Letter::Base; end
|
41
46
|
class Titlecase < Letter::Base; end
|
@@ -47,6 +52,7 @@ module Regexp::Expression
|
|
47
52
|
class Base < UnicodeProperty::Base; end
|
48
53
|
|
49
54
|
class Any < Mark::Base; end
|
55
|
+
class Combining < Mark::Base; end
|
50
56
|
class Nonspacing < Mark::Base; end
|
51
57
|
class Spacing < Mark::Base; end
|
52
58
|
class Enclosing < Mark::Base; end
|
@@ -1,110 +1,27 @@
|
|
1
1
|
module Regexp::Expression
|
2
|
+
class CharacterSet < Regexp::Expression::Subexpression
|
3
|
+
attr_accessor :closed, :negative
|
2
4
|
|
3
|
-
|
4
|
-
|
5
|
+
alias :negative? :negative
|
6
|
+
alias :negated? :negative
|
7
|
+
alias :closed? :closed
|
5
8
|
|
6
9
|
def initialize(token, options = {})
|
7
|
-
|
8
|
-
|
9
|
-
@closed = false
|
10
|
+
self.negative = false
|
11
|
+
self.closed = false
|
10
12
|
super
|
11
13
|
end
|
12
14
|
|
13
|
-
# Override base method to clone set members as well.
|
14
|
-
def clone
|
15
|
-
copy = super
|
16
|
-
copy.members = @members.map {|m| m.clone }
|
17
|
-
copy
|
18
|
-
end
|
19
|
-
|
20
|
-
def <<(member)
|
21
|
-
if @members.last.is_a?(CharacterSubSet) and not @members.last.closed?
|
22
|
-
@members.last << member
|
23
|
-
else
|
24
|
-
@members << member
|
25
|
-
end
|
26
|
-
end
|
27
|
-
|
28
|
-
def include?(member, directly = false)
|
29
|
-
@members.each do |m|
|
30
|
-
if m.is_a?(CharacterSubSet) and not directly
|
31
|
-
return true if m.include?(member)
|
32
|
-
else
|
33
|
-
return true if member == m.to_s
|
34
|
-
end
|
35
|
-
end; false
|
36
|
-
end
|
37
|
-
|
38
|
-
def each(&block)
|
39
|
-
@members.each {|m| yield m}
|
40
|
-
end
|
41
|
-
|
42
|
-
def each_with_index(&block)
|
43
|
-
@members.each_with_index {|m, i| yield m, i}
|
44
|
-
end
|
45
|
-
|
46
|
-
def length
|
47
|
-
@members.length
|
48
|
-
end
|
49
|
-
|
50
15
|
def negate
|
51
|
-
|
52
|
-
@members.last.negate
|
53
|
-
else
|
54
|
-
@negative = true
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
def negative?
|
59
|
-
@negative
|
16
|
+
self.negative = true
|
60
17
|
end
|
61
|
-
alias :negated? :negative?
|
62
18
|
|
63
19
|
def close
|
64
|
-
|
65
|
-
@members.last.close
|
66
|
-
else
|
67
|
-
@closed = true
|
68
|
-
end
|
69
|
-
end
|
70
|
-
|
71
|
-
def closed?
|
72
|
-
@closed
|
73
|
-
end
|
74
|
-
|
75
|
-
# Returns an array of the members with any shorthand members like \d and \W
|
76
|
-
# expanded to either traditional form or unicode properties.
|
77
|
-
def expand_members(use_properties = false)
|
78
|
-
@members.map do |member|
|
79
|
-
case member
|
80
|
-
when "\\d"
|
81
|
-
use_properties ? '\p{Digit}' : '0-9'
|
82
|
-
when "\\D"
|
83
|
-
use_properties ? '\P{Digit}' : '^0-9'
|
84
|
-
when "\\w"
|
85
|
-
use_properties ? '\p{Word}' : 'A-Za-z0-9_'
|
86
|
-
when "\\W"
|
87
|
-
use_properties ? '\P{Word}' : '^A-Za-z0-9_'
|
88
|
-
when "\\s"
|
89
|
-
use_properties ? '\p{Space}' : ' \t\f\v\n\r'
|
90
|
-
when "\\S"
|
91
|
-
use_properties ? '\P{Space}' : '^ \t\f\v\n\r'
|
92
|
-
when "\\h"
|
93
|
-
use_properties ? '\p{Xdigit}' : '0-9A-Fa-f'
|
94
|
-
when "\\H"
|
95
|
-
use_properties ? '\P{Xdigit}' : '^0-9A-Fa-f'
|
96
|
-
else
|
97
|
-
member
|
98
|
-
end
|
99
|
-
end
|
20
|
+
self.closed = true
|
100
21
|
end
|
101
22
|
|
102
23
|
def to_s(format = :full)
|
103
|
-
"#{text}#{'^' if
|
24
|
+
"#{text}#{'^' if negated?}#{expressions.join}]#{quantifier_affix(format)}"
|
104
25
|
end
|
105
26
|
end
|
106
|
-
|
107
|
-
class CharacterSubSet < CharacterSet
|
108
|
-
end
|
109
|
-
|
110
27
|
end # module Regexp::Expression
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Regexp::Expression
|
2
|
+
class CharacterSet < Regexp::Expression::Subexpression
|
3
|
+
class Range < Regexp::Expression::Subexpression
|
4
|
+
def starts_at
|
5
|
+
expressions.first.starts_at
|
6
|
+
end
|
7
|
+
alias :ts :starts_at
|
8
|
+
|
9
|
+
def <<(exp)
|
10
|
+
complete? && raise("Can't add more than 2 expressions to a Range")
|
11
|
+
super
|
12
|
+
end
|
13
|
+
|
14
|
+
def complete?
|
15
|
+
count == 2
|
16
|
+
end
|
17
|
+
|
18
|
+
def to_s(_format = :full)
|
19
|
+
expressions.join(text)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -40,14 +40,16 @@ module Regexp::Expression
|
|
40
40
|
|
41
41
|
part = {}
|
42
42
|
|
43
|
+
print_level = nesting_level > 0 ? nesting_level - 1 : nil
|
44
|
+
|
43
45
|
# Order is important! Fields that use other fields in their
|
44
46
|
# definition must appear before the fields they use.
|
45
47
|
part_keys = %w{a m b o i l x s e S y k c q Q z Z t ~t T >}
|
46
48
|
part.keys.each {|k| part[k] = "<?#{k}?>"}
|
47
49
|
|
48
|
-
part['>'] =
|
50
|
+
part['>'] = print_level ? (' ' * (print_level + indent_offset)) : ''
|
49
51
|
|
50
|
-
part['l'] =
|
52
|
+
part['l'] = print_level ? "#{'%d' % print_level}" : 'root'
|
51
53
|
part['x'] = "#{'%d' % index}" if have_index
|
52
54
|
|
53
55
|
part['s'] = starts_at
|
@@ -101,9 +103,9 @@ module Regexp::Expression
|
|
101
103
|
def strfregexp_tree(format = '%a', include_self = true, separator = "\n")
|
102
104
|
output = include_self ? [self.strfregexp(format)] : []
|
103
105
|
|
104
|
-
output +=
|
106
|
+
output += flat_map do |exp, index|
|
105
107
|
exp.strfregexp(format, (include_self ? 1 : 0), index)
|
106
|
-
|
108
|
+
end
|
107
109
|
|
108
110
|
output.join(separator)
|
109
111
|
end
|