regexp_parser 0.5.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +242 -0
- data/Gemfile +1 -0
- data/README.md +21 -17
- data/Rakefile +31 -0
- data/lib/regexp_parser/expression.rb +11 -9
- data/lib/regexp_parser/expression/classes/alternation.rb +5 -28
- data/lib/regexp_parser/expression/classes/backref.rb +21 -16
- data/lib/regexp_parser/expression/classes/escape.rb +81 -10
- data/lib/regexp_parser/expression/classes/group.rb +20 -20
- data/lib/regexp_parser/expression/classes/{character_class.rb → posix_class.rb} +2 -2
- data/lib/regexp_parser/expression/classes/property.rb +6 -0
- data/lib/regexp_parser/expression/classes/set.rb +10 -93
- data/lib/regexp_parser/expression/classes/set/intersection.rb +9 -0
- data/lib/regexp_parser/expression/classes/set/range.rb +23 -0
- data/lib/regexp_parser/expression/methods/strfregexp.rb +6 -4
- data/lib/regexp_parser/expression/methods/tests.rb +4 -14
- data/lib/regexp_parser/expression/methods/traverse.rb +1 -1
- data/lib/regexp_parser/expression/quantifier.rb +3 -4
- data/lib/regexp_parser/expression/sequence_operation.rb +34 -0
- data/lib/regexp_parser/expression/subexpression.rb +6 -10
- data/lib/regexp_parser/lexer.rb +13 -17
- data/lib/regexp_parser/parser.rb +170 -116
- data/lib/regexp_parser/scanner.rb +952 -2431
- data/lib/regexp_parser/scanner/char_type.rl +31 -0
- data/lib/regexp_parser/scanner/properties/long.yml +561 -0
- data/lib/regexp_parser/scanner/properties/short.yml +225 -0
- data/lib/regexp_parser/scanner/property.rl +7 -806
- data/lib/regexp_parser/scanner/scanner.rl +112 -154
- data/lib/regexp_parser/syntax/base.rb +4 -4
- data/lib/regexp_parser/syntax/tokens.rb +1 -0
- data/lib/regexp_parser/syntax/tokens/backref.rb +2 -2
- data/lib/regexp_parser/syntax/tokens/character_set.rb +3 -38
- data/lib/regexp_parser/syntax/tokens/escape.rb +2 -3
- data/lib/regexp_parser/syntax/tokens/group.rb +5 -4
- data/lib/regexp_parser/syntax/tokens/{character_class.rb → posix_class.rb} +5 -5
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +519 -266
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -4
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +4 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +0 -2
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- data/regexp_parser.gemspec +2 -1
- data/test/expression/test_base.rb +2 -1
- data/test/expression/test_clone.rb +0 -57
- data/test/expression/test_set.rb +31 -8
- data/test/expression/test_strfregexp.rb +13 -4
- data/test/expression/test_subexpression.rb +25 -0
- data/test/expression/test_traverse.rb +25 -25
- data/test/helpers.rb +1 -0
- data/test/lexer/test_all.rb +1 -1
- data/test/lexer/test_conditionals.rb +9 -7
- data/test/lexer/test_nesting.rb +39 -21
- data/test/lexer/test_refcalls.rb +4 -4
- data/test/parser/set/test_intersections.rb +127 -0
- data/test/parser/set/test_ranges.rb +111 -0
- data/test/parser/test_all.rb +4 -1
- data/test/parser/test_escapes.rb +41 -9
- data/test/parser/test_groups.rb +22 -3
- data/test/parser/test_posix_classes.rb +27 -0
- data/test/parser/test_properties.rb +17 -290
- data/test/parser/test_refcalls.rb +66 -26
- data/test/parser/test_sets.rb +132 -129
- data/test/scanner/test_all.rb +1 -7
- data/test/scanner/test_conditionals.rb +16 -16
- data/test/scanner/test_errors.rb +0 -30
- data/test/scanner/test_escapes.rb +1 -2
- data/test/scanner/test_free_space.rb +28 -28
- data/test/scanner/test_groups.rb +35 -35
- data/test/scanner/test_meta.rb +1 -1
- data/test/scanner/test_properties.rb +87 -114
- data/test/scanner/test_refcalls.rb +18 -18
- data/test/scanner/test_scripts.rb +19 -351
- data/test/scanner/test_sets.rb +87 -60
- data/test/scanner/test_unicode_blocks.rb +4 -105
- data/test/support/warning_extractor.rb +1 -1
- data/test/syntax/test_syntax.rb +7 -0
- data/test/syntax/versions/test_1.8.rb +2 -4
- metadata +17 -7
- data/ChangeLog +0 -325
- data/test/scanner/test_emojis.rb +0 -31
@@ -1,8 +1,16 @@
|
|
1
1
|
module Regexp::Expression
|
2
|
-
|
3
2
|
module Backreference
|
4
3
|
class Base < Regexp::Expression::Base; end
|
5
4
|
|
5
|
+
class Number < Backreference::Base
|
6
|
+
attr_reader :number
|
7
|
+
|
8
|
+
def initialize(token, options = {})
|
9
|
+
@number = token.text[token.token.equal?(:number) ? 1..-1 : 3..-2].to_i
|
10
|
+
super
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
6
14
|
class Name < Backreference::Base
|
7
15
|
attr_reader :name
|
8
16
|
|
@@ -12,31 +20,28 @@ module Regexp::Expression
|
|
12
20
|
end
|
13
21
|
end
|
14
22
|
|
15
|
-
class
|
16
|
-
|
23
|
+
class NumberCall < Backreference::Number; end
|
24
|
+
class NumberRelative < Backreference::Number; end
|
25
|
+
class NumberCallRelative < Backreference::Number; end
|
26
|
+
class NameCall < Backreference::Name; end
|
27
|
+
|
28
|
+
class NumberRecursionLevel < Backreference::Base
|
29
|
+
attr_reader :number, :recursion_level
|
17
30
|
|
18
31
|
def initialize(token, options = {})
|
19
|
-
@number = token.text[
|
32
|
+
@number, @recursion_level = token.text[3..-2].split(/(?=[+-])/).map(&:to_i)
|
20
33
|
super
|
21
34
|
end
|
22
35
|
end
|
23
36
|
|
24
|
-
class
|
25
|
-
|
26
|
-
class NameNestLevel < Backreference::Base; end
|
27
|
-
class NumberNestLevel < Backreference::Base; end
|
28
|
-
|
29
|
-
class NameCall < Backreference::Base
|
30
|
-
attr_reader :name
|
37
|
+
class NameRecursionLevel < Backreference::Base
|
38
|
+
attr_reader :name, :recursion_level
|
31
39
|
|
32
40
|
def initialize(token, options = {})
|
33
|
-
@name = token.text[3..-2]
|
41
|
+
@name, recursion_level = token.text[3..-2].split(/(?=[+-])/)
|
42
|
+
@recursion_level = recursion_level.to_i
|
34
43
|
super
|
35
44
|
end
|
36
45
|
end
|
37
|
-
|
38
|
-
class NumberCall < Backreference::Base; end
|
39
|
-
class NumberCallRelative < Backreference::Base; end
|
40
46
|
end
|
41
|
-
|
42
47
|
end
|
@@ -1,9 +1,23 @@
|
|
1
1
|
module Regexp::Expression
|
2
|
-
|
3
2
|
module EscapeSequence
|
4
|
-
class Base
|
3
|
+
class Base < Regexp::Expression::Base
|
4
|
+
require 'yaml'
|
5
|
+
|
6
|
+
def char
|
7
|
+
# poor man's unescape without using eval
|
8
|
+
YAML.load(%Q(---\n"#{text}"\n))
|
9
|
+
end
|
5
10
|
|
6
|
-
|
11
|
+
def codepoint
|
12
|
+
char.ord
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
class Literal < EscapeSequence::Base
|
17
|
+
def char
|
18
|
+
text[1..-1]
|
19
|
+
end
|
20
|
+
end
|
7
21
|
|
8
22
|
class AsciiEscape < EscapeSequence::Base; end
|
9
23
|
class Backspace < EscapeSequence::Base; end
|
@@ -11,17 +25,74 @@ module Regexp::Expression
|
|
11
25
|
class FormFeed < EscapeSequence::Base; end
|
12
26
|
class Newline < EscapeSequence::Base; end
|
13
27
|
class Return < EscapeSequence::Base; end
|
14
|
-
class Space < EscapeSequence::Base; end
|
15
28
|
class Tab < EscapeSequence::Base; end
|
16
29
|
class VerticalTab < EscapeSequence::Base; end
|
17
30
|
|
18
|
-
class Octal < EscapeSequence::Base; end
|
19
31
|
class Hex < EscapeSequence::Base; end
|
20
|
-
class
|
32
|
+
class Codepoint < EscapeSequence::Base; end
|
21
33
|
|
22
|
-
class
|
23
|
-
|
24
|
-
|
25
|
-
|
34
|
+
class CodepointList < EscapeSequence::Base
|
35
|
+
def char
|
36
|
+
raise NoMethodError, 'CodepointList responds only to #chars'
|
37
|
+
end
|
38
|
+
|
39
|
+
def codepoint
|
40
|
+
raise NoMethodError, 'CodepointList responds only to #codepoints'
|
41
|
+
end
|
42
|
+
|
43
|
+
def chars
|
44
|
+
codepoints.map { |cp| cp.chr('utf-8') }
|
45
|
+
end
|
46
|
+
|
47
|
+
def codepoints
|
48
|
+
text.scan(/\h+/).map(&:hex)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
class Octal < EscapeSequence::Base
|
53
|
+
def char
|
54
|
+
text[1..-1].to_i(8).chr('utf-8')
|
55
|
+
end
|
56
|
+
end
|
26
57
|
|
58
|
+
class AbstractMetaControlSequence < EscapeSequence::Base
|
59
|
+
def char
|
60
|
+
codepoint.chr('utf-8')
|
61
|
+
end
|
62
|
+
|
63
|
+
def codepoint
|
64
|
+
raise NotImplementedError, 'implement in subclass'
|
65
|
+
end
|
66
|
+
|
67
|
+
private
|
68
|
+
|
69
|
+
def control_sequence_to_s(control_sequence)
|
70
|
+
five_lsb = control_sequence.unpack('B*').first[-5..-1]
|
71
|
+
["000#{five_lsb}"].pack('B*')
|
72
|
+
end
|
73
|
+
|
74
|
+
def meta_char_to_codepoint(meta_char)
|
75
|
+
byte_value = meta_char.ord
|
76
|
+
byte_value < 128 ? byte_value + 128 : byte_value
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
class Control < AbstractMetaControlSequence
|
81
|
+
def codepoint
|
82
|
+
control_sequence_to_s(text).ord
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
class Meta < AbstractMetaControlSequence
|
87
|
+
def codepoint
|
88
|
+
meta_char_to_codepoint(text[-1])
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
class MetaControl < AbstractMetaControlSequence
|
93
|
+
def codepoint
|
94
|
+
meta_char_to_codepoint(control_sequence_to_s(text))
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end
|
27
98
|
end
|
@@ -1,27 +1,27 @@
|
|
1
1
|
module Regexp::Expression
|
2
|
-
|
3
2
|
module Group
|
4
3
|
class Base < Regexp::Expression::Subexpression
|
5
|
-
def capturing?
|
6
|
-
[:capture, :named].include?(token)
|
7
|
-
end
|
8
|
-
|
9
|
-
def comment?
|
10
|
-
type == :comment
|
11
|
-
end
|
12
|
-
|
13
4
|
def to_s(format = :full)
|
14
5
|
"#{text}#{expressions.join})#{quantifier_affix(format)}"
|
15
6
|
end
|
7
|
+
|
8
|
+
def capturing?; false end
|
9
|
+
|
10
|
+
def comment?; false end
|
16
11
|
end
|
17
12
|
|
18
|
-
class Atomic
|
19
|
-
class
|
20
|
-
class
|
21
|
-
class
|
22
|
-
class Absence < Group::Base; end
|
13
|
+
class Atomic < Group::Base; end
|
14
|
+
class Passive < Group::Base; end
|
15
|
+
class Options < Group::Base; end
|
16
|
+
class Absence < Group::Base; end
|
23
17
|
|
24
|
-
class
|
18
|
+
class Capture < Group::Base
|
19
|
+
attr_accessor :number, :number_at_level
|
20
|
+
|
21
|
+
def capturing?; true end
|
22
|
+
end
|
23
|
+
|
24
|
+
class Named < Group::Capture
|
25
25
|
attr_reader :name
|
26
26
|
|
27
27
|
def initialize(token, options = {})
|
@@ -29,10 +29,9 @@ module Regexp::Expression
|
|
29
29
|
super
|
30
30
|
end
|
31
31
|
|
32
|
-
def
|
33
|
-
|
34
|
-
|
35
|
-
copy
|
32
|
+
def initialize_clone(other)
|
33
|
+
other.instance_variable_set(:@name, name.dup)
|
34
|
+
super
|
36
35
|
end
|
37
36
|
end
|
38
37
|
|
@@ -40,6 +39,8 @@ module Regexp::Expression
|
|
40
39
|
def to_s(_format = :full)
|
41
40
|
text.dup
|
42
41
|
end
|
42
|
+
|
43
|
+
def comment?; true end
|
43
44
|
end
|
44
45
|
end
|
45
46
|
|
@@ -52,5 +53,4 @@ module Regexp::Expression
|
|
52
53
|
class Lookbehind < Assertion::Base; end
|
53
54
|
class NegativeLookbehind < Assertion::Base; end
|
54
55
|
end
|
55
|
-
|
56
56
|
end
|
@@ -9,6 +9,10 @@ module Regexp::Expression
|
|
9
9
|
def name
|
10
10
|
text =~ /\A\\[pP]\{([^}]+)\}\z/; $1
|
11
11
|
end
|
12
|
+
|
13
|
+
def shortcut
|
14
|
+
(Regexp::Scanner.short_prop_map.rassoc(token.to_s) || []).first
|
15
|
+
end
|
12
16
|
end
|
13
17
|
|
14
18
|
class Alnum < Base; end
|
@@ -36,6 +40,7 @@ module Regexp::Expression
|
|
36
40
|
class Base < UnicodeProperty::Base; end
|
37
41
|
|
38
42
|
class Any < Letter::Base; end
|
43
|
+
class Cased < Letter::Base; end
|
39
44
|
class Uppercase < Letter::Base; end
|
40
45
|
class Lowercase < Letter::Base; end
|
41
46
|
class Titlecase < Letter::Base; end
|
@@ -47,6 +52,7 @@ module Regexp::Expression
|
|
47
52
|
class Base < UnicodeProperty::Base; end
|
48
53
|
|
49
54
|
class Any < Mark::Base; end
|
55
|
+
class Combining < Mark::Base; end
|
50
56
|
class Nonspacing < Mark::Base; end
|
51
57
|
class Spacing < Mark::Base; end
|
52
58
|
class Enclosing < Mark::Base; end
|
@@ -1,110 +1,27 @@
|
|
1
1
|
module Regexp::Expression
|
2
|
+
class CharacterSet < Regexp::Expression::Subexpression
|
3
|
+
attr_accessor :closed, :negative
|
2
4
|
|
3
|
-
|
4
|
-
|
5
|
+
alias :negative? :negative
|
6
|
+
alias :negated? :negative
|
7
|
+
alias :closed? :closed
|
5
8
|
|
6
9
|
def initialize(token, options = {})
|
7
|
-
|
8
|
-
|
9
|
-
@closed = false
|
10
|
+
self.negative = false
|
11
|
+
self.closed = false
|
10
12
|
super
|
11
13
|
end
|
12
14
|
|
13
|
-
# Override base method to clone set members as well.
|
14
|
-
def clone
|
15
|
-
copy = super
|
16
|
-
copy.members = @members.map {|m| m.clone }
|
17
|
-
copy
|
18
|
-
end
|
19
|
-
|
20
|
-
def <<(member)
|
21
|
-
if @members.last.is_a?(CharacterSubSet) and not @members.last.closed?
|
22
|
-
@members.last << member
|
23
|
-
else
|
24
|
-
@members << member
|
25
|
-
end
|
26
|
-
end
|
27
|
-
|
28
|
-
def include?(member, directly = false)
|
29
|
-
@members.each do |m|
|
30
|
-
if m.is_a?(CharacterSubSet) and not directly
|
31
|
-
return true if m.include?(member)
|
32
|
-
else
|
33
|
-
return true if member == m.to_s
|
34
|
-
end
|
35
|
-
end; false
|
36
|
-
end
|
37
|
-
|
38
|
-
def each(&block)
|
39
|
-
@members.each {|m| yield m}
|
40
|
-
end
|
41
|
-
|
42
|
-
def each_with_index(&block)
|
43
|
-
@members.each_with_index {|m, i| yield m, i}
|
44
|
-
end
|
45
|
-
|
46
|
-
def length
|
47
|
-
@members.length
|
48
|
-
end
|
49
|
-
|
50
15
|
def negate
|
51
|
-
|
52
|
-
@members.last.negate
|
53
|
-
else
|
54
|
-
@negative = true
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
def negative?
|
59
|
-
@negative
|
16
|
+
self.negative = true
|
60
17
|
end
|
61
|
-
alias :negated? :negative?
|
62
18
|
|
63
19
|
def close
|
64
|
-
|
65
|
-
@members.last.close
|
66
|
-
else
|
67
|
-
@closed = true
|
68
|
-
end
|
69
|
-
end
|
70
|
-
|
71
|
-
def closed?
|
72
|
-
@closed
|
73
|
-
end
|
74
|
-
|
75
|
-
# Returns an array of the members with any shorthand members like \d and \W
|
76
|
-
# expanded to either traditional form or unicode properties.
|
77
|
-
def expand_members(use_properties = false)
|
78
|
-
@members.map do |member|
|
79
|
-
case member
|
80
|
-
when "\\d"
|
81
|
-
use_properties ? '\p{Digit}' : '0-9'
|
82
|
-
when "\\D"
|
83
|
-
use_properties ? '\P{Digit}' : '^0-9'
|
84
|
-
when "\\w"
|
85
|
-
use_properties ? '\p{Word}' : 'A-Za-z0-9_'
|
86
|
-
when "\\W"
|
87
|
-
use_properties ? '\P{Word}' : '^A-Za-z0-9_'
|
88
|
-
when "\\s"
|
89
|
-
use_properties ? '\p{Space}' : ' \t\f\v\n\r'
|
90
|
-
when "\\S"
|
91
|
-
use_properties ? '\P{Space}' : '^ \t\f\v\n\r'
|
92
|
-
when "\\h"
|
93
|
-
use_properties ? '\p{Xdigit}' : '0-9A-Fa-f'
|
94
|
-
when "\\H"
|
95
|
-
use_properties ? '\P{Xdigit}' : '^0-9A-Fa-f'
|
96
|
-
else
|
97
|
-
member
|
98
|
-
end
|
99
|
-
end
|
20
|
+
self.closed = true
|
100
21
|
end
|
101
22
|
|
102
23
|
def to_s(format = :full)
|
103
|
-
"#{text}#{'^' if
|
24
|
+
"#{text}#{'^' if negated?}#{expressions.join}]#{quantifier_affix(format)}"
|
104
25
|
end
|
105
26
|
end
|
106
|
-
|
107
|
-
class CharacterSubSet < CharacterSet
|
108
|
-
end
|
109
|
-
|
110
27
|
end # module Regexp::Expression
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Regexp::Expression
|
2
|
+
class CharacterSet < Regexp::Expression::Subexpression
|
3
|
+
class Range < Regexp::Expression::Subexpression
|
4
|
+
def starts_at
|
5
|
+
expressions.first.starts_at
|
6
|
+
end
|
7
|
+
alias :ts :starts_at
|
8
|
+
|
9
|
+
def <<(exp)
|
10
|
+
complete? && raise("Can't add more than 2 expressions to a Range")
|
11
|
+
super
|
12
|
+
end
|
13
|
+
|
14
|
+
def complete?
|
15
|
+
count == 2
|
16
|
+
end
|
17
|
+
|
18
|
+
def to_s(_format = :full)
|
19
|
+
expressions.join(text)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -40,14 +40,16 @@ module Regexp::Expression
|
|
40
40
|
|
41
41
|
part = {}
|
42
42
|
|
43
|
+
print_level = nesting_level > 0 ? nesting_level - 1 : nil
|
44
|
+
|
43
45
|
# Order is important! Fields that use other fields in their
|
44
46
|
# definition must appear before the fields they use.
|
45
47
|
part_keys = %w{a m b o i l x s e S y k c q Q z Z t ~t T >}
|
46
48
|
part.keys.each {|k| part[k] = "<?#{k}?>"}
|
47
49
|
|
48
|
-
part['>'] =
|
50
|
+
part['>'] = print_level ? (' ' * (print_level + indent_offset)) : ''
|
49
51
|
|
50
|
-
part['l'] =
|
52
|
+
part['l'] = print_level ? "#{'%d' % print_level}" : 'root'
|
51
53
|
part['x'] = "#{'%d' % index}" if have_index
|
52
54
|
|
53
55
|
part['s'] = starts_at
|
@@ -101,9 +103,9 @@ module Regexp::Expression
|
|
101
103
|
def strfregexp_tree(format = '%a', include_self = true, separator = "\n")
|
102
104
|
output = include_self ? [self.strfregexp(format)] : []
|
103
105
|
|
104
|
-
output +=
|
106
|
+
output += flat_map do |exp, index|
|
105
107
|
exp.strfregexp(format, (include_self ? 1 : 0), index)
|
106
|
-
|
108
|
+
end
|
107
109
|
|
108
110
|
output.join(separator)
|
109
111
|
end
|