regexp_parser 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +242 -0
  3. data/Gemfile +1 -0
  4. data/README.md +21 -17
  5. data/Rakefile +31 -0
  6. data/lib/regexp_parser/expression.rb +11 -9
  7. data/lib/regexp_parser/expression/classes/alternation.rb +5 -28
  8. data/lib/regexp_parser/expression/classes/backref.rb +21 -16
  9. data/lib/regexp_parser/expression/classes/escape.rb +81 -10
  10. data/lib/regexp_parser/expression/classes/group.rb +20 -20
  11. data/lib/regexp_parser/expression/classes/{character_class.rb → posix_class.rb} +2 -2
  12. data/lib/regexp_parser/expression/classes/property.rb +6 -0
  13. data/lib/regexp_parser/expression/classes/set.rb +10 -93
  14. data/lib/regexp_parser/expression/classes/set/intersection.rb +9 -0
  15. data/lib/regexp_parser/expression/classes/set/range.rb +23 -0
  16. data/lib/regexp_parser/expression/methods/strfregexp.rb +6 -4
  17. data/lib/regexp_parser/expression/methods/tests.rb +4 -14
  18. data/lib/regexp_parser/expression/methods/traverse.rb +1 -1
  19. data/lib/regexp_parser/expression/quantifier.rb +3 -4
  20. data/lib/regexp_parser/expression/sequence_operation.rb +34 -0
  21. data/lib/regexp_parser/expression/subexpression.rb +6 -10
  22. data/lib/regexp_parser/lexer.rb +13 -17
  23. data/lib/regexp_parser/parser.rb +170 -116
  24. data/lib/regexp_parser/scanner.rb +952 -2431
  25. data/lib/regexp_parser/scanner/char_type.rl +31 -0
  26. data/lib/regexp_parser/scanner/properties/long.yml +561 -0
  27. data/lib/regexp_parser/scanner/properties/short.yml +225 -0
  28. data/lib/regexp_parser/scanner/property.rl +7 -806
  29. data/lib/regexp_parser/scanner/scanner.rl +112 -154
  30. data/lib/regexp_parser/syntax/base.rb +4 -4
  31. data/lib/regexp_parser/syntax/tokens.rb +1 -0
  32. data/lib/regexp_parser/syntax/tokens/backref.rb +2 -2
  33. data/lib/regexp_parser/syntax/tokens/character_set.rb +3 -38
  34. data/lib/regexp_parser/syntax/tokens/escape.rb +2 -3
  35. data/lib/regexp_parser/syntax/tokens/group.rb +5 -4
  36. data/lib/regexp_parser/syntax/tokens/{character_class.rb → posix_class.rb} +5 -5
  37. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +519 -266
  38. data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -4
  39. data/lib/regexp_parser/syntax/versions/1.9.1.rb +4 -10
  40. data/lib/regexp_parser/syntax/versions/2.0.0.rb +0 -2
  41. data/lib/regexp_parser/syntax/versions/2.4.1.rb +1 -1
  42. data/lib/regexp_parser/version.rb +1 -1
  43. data/regexp_parser.gemspec +2 -1
  44. data/test/expression/test_base.rb +2 -1
  45. data/test/expression/test_clone.rb +0 -57
  46. data/test/expression/test_set.rb +31 -8
  47. data/test/expression/test_strfregexp.rb +13 -4
  48. data/test/expression/test_subexpression.rb +25 -0
  49. data/test/expression/test_traverse.rb +25 -25
  50. data/test/helpers.rb +1 -0
  51. data/test/lexer/test_all.rb +1 -1
  52. data/test/lexer/test_conditionals.rb +9 -7
  53. data/test/lexer/test_nesting.rb +39 -21
  54. data/test/lexer/test_refcalls.rb +4 -4
  55. data/test/parser/set/test_intersections.rb +127 -0
  56. data/test/parser/set/test_ranges.rb +111 -0
  57. data/test/parser/test_all.rb +4 -1
  58. data/test/parser/test_escapes.rb +41 -9
  59. data/test/parser/test_groups.rb +22 -3
  60. data/test/parser/test_posix_classes.rb +27 -0
  61. data/test/parser/test_properties.rb +17 -290
  62. data/test/parser/test_refcalls.rb +66 -26
  63. data/test/parser/test_sets.rb +132 -129
  64. data/test/scanner/test_all.rb +1 -7
  65. data/test/scanner/test_conditionals.rb +16 -16
  66. data/test/scanner/test_errors.rb +0 -30
  67. data/test/scanner/test_escapes.rb +1 -2
  68. data/test/scanner/test_free_space.rb +28 -28
  69. data/test/scanner/test_groups.rb +35 -35
  70. data/test/scanner/test_meta.rb +1 -1
  71. data/test/scanner/test_properties.rb +87 -114
  72. data/test/scanner/test_refcalls.rb +18 -18
  73. data/test/scanner/test_scripts.rb +19 -351
  74. data/test/scanner/test_sets.rb +87 -60
  75. data/test/scanner/test_unicode_blocks.rb +4 -105
  76. data/test/support/warning_extractor.rb +1 -1
  77. data/test/syntax/test_syntax.rb +7 -0
  78. data/test/syntax/versions/test_1.8.rb +2 -4
  79. metadata +17 -7
  80. data/ChangeLog +0 -325
  81. data/test/scanner/test_emojis.rb +0 -31
@@ -1,8 +1,16 @@
1
1
  module Regexp::Expression
2
-
3
2
  module Backreference
4
3
  class Base < Regexp::Expression::Base; end
5
4
 
5
+ class Number < Backreference::Base
6
+ attr_reader :number
7
+
8
+ def initialize(token, options = {})
9
+ @number = token.text[token.token.equal?(:number) ? 1..-1 : 3..-2].to_i
10
+ super
11
+ end
12
+ end
13
+
6
14
  class Name < Backreference::Base
7
15
  attr_reader :name
8
16
 
@@ -12,31 +20,28 @@ module Regexp::Expression
12
20
  end
13
21
  end
14
22
 
15
- class Number < Backreference::Base
16
- attr_reader :number
23
+ class NumberCall < Backreference::Number; end
24
+ class NumberRelative < Backreference::Number; end
25
+ class NumberCallRelative < Backreference::Number; end
26
+ class NameCall < Backreference::Name; end
27
+
28
+ class NumberRecursionLevel < Backreference::Base
29
+ attr_reader :number, :recursion_level
17
30
 
18
31
  def initialize(token, options = {})
19
- @number = token.text[token.token.equal?(:number) ? 1..-1 : 3..-2]
32
+ @number, @recursion_level = token.text[3..-2].split(/(?=[+-])/).map(&:to_i)
20
33
  super
21
34
  end
22
35
  end
23
36
 
24
- class NumberRelative < Backreference::Number; end
25
-
26
- class NameNestLevel < Backreference::Base; end
27
- class NumberNestLevel < Backreference::Base; end
28
-
29
- class NameCall < Backreference::Base
30
- attr_reader :name
37
+ class NameRecursionLevel < Backreference::Base
38
+ attr_reader :name, :recursion_level
31
39
 
32
40
  def initialize(token, options = {})
33
- @name = token.text[3..-2]
41
+ @name, recursion_level = token.text[3..-2].split(/(?=[+-])/)
42
+ @recursion_level = recursion_level.to_i
34
43
  super
35
44
  end
36
45
  end
37
-
38
- class NumberCall < Backreference::Base; end
39
- class NumberCallRelative < Backreference::Base; end
40
46
  end
41
-
42
47
  end
@@ -1,9 +1,23 @@
1
1
  module Regexp::Expression
2
-
3
2
  module EscapeSequence
4
- class Base < Regexp::Expression::Base; end
3
+ class Base < Regexp::Expression::Base
4
+ require 'yaml'
5
+
6
+ def char
7
+ # poor man's unescape without using eval
8
+ YAML.load(%Q(---\n"#{text}"\n))
9
+ end
5
10
 
6
- class Literal < EscapeSequence::Base; end
11
+ def codepoint
12
+ char.ord
13
+ end
14
+ end
15
+
16
+ class Literal < EscapeSequence::Base
17
+ def char
18
+ text[1..-1]
19
+ end
20
+ end
7
21
 
8
22
  class AsciiEscape < EscapeSequence::Base; end
9
23
  class Backspace < EscapeSequence::Base; end
@@ -11,17 +25,74 @@ module Regexp::Expression
11
25
  class FormFeed < EscapeSequence::Base; end
12
26
  class Newline < EscapeSequence::Base; end
13
27
  class Return < EscapeSequence::Base; end
14
- class Space < EscapeSequence::Base; end
15
28
  class Tab < EscapeSequence::Base; end
16
29
  class VerticalTab < EscapeSequence::Base; end
17
30
 
18
- class Octal < EscapeSequence::Base; end
19
31
  class Hex < EscapeSequence::Base; end
20
- class HexWide < EscapeSequence::Base; end
32
+ class Codepoint < EscapeSequence::Base; end
21
33
 
22
- class Control < EscapeSequence::Base; end
23
- class Meta < EscapeSequence::Base; end
24
- class MetaControl < EscapeSequence::Base; end
25
- end
34
+ class CodepointList < EscapeSequence::Base
35
+ def char
36
+ raise NoMethodError, 'CodepointList responds only to #chars'
37
+ end
38
+
39
+ def codepoint
40
+ raise NoMethodError, 'CodepointList responds only to #codepoints'
41
+ end
42
+
43
+ def chars
44
+ codepoints.map { |cp| cp.chr('utf-8') }
45
+ end
46
+
47
+ def codepoints
48
+ text.scan(/\h+/).map(&:hex)
49
+ end
50
+ end
51
+
52
+ class Octal < EscapeSequence::Base
53
+ def char
54
+ text[1..-1].to_i(8).chr('utf-8')
55
+ end
56
+ end
26
57
 
58
+ class AbstractMetaControlSequence < EscapeSequence::Base
59
+ def char
60
+ codepoint.chr('utf-8')
61
+ end
62
+
63
+ def codepoint
64
+ raise NotImplementedError, 'implement in subclass'
65
+ end
66
+
67
+ private
68
+
69
+ def control_sequence_to_s(control_sequence)
70
+ five_lsb = control_sequence.unpack('B*').first[-5..-1]
71
+ ["000#{five_lsb}"].pack('B*')
72
+ end
73
+
74
+ def meta_char_to_codepoint(meta_char)
75
+ byte_value = meta_char.ord
76
+ byte_value < 128 ? byte_value + 128 : byte_value
77
+ end
78
+ end
79
+
80
+ class Control < AbstractMetaControlSequence
81
+ def codepoint
82
+ control_sequence_to_s(text).ord
83
+ end
84
+ end
85
+
86
+ class Meta < AbstractMetaControlSequence
87
+ def codepoint
88
+ meta_char_to_codepoint(text[-1])
89
+ end
90
+ end
91
+
92
+ class MetaControl < AbstractMetaControlSequence
93
+ def codepoint
94
+ meta_char_to_codepoint(control_sequence_to_s(text))
95
+ end
96
+ end
97
+ end
27
98
  end
@@ -1,27 +1,27 @@
1
1
  module Regexp::Expression
2
-
3
2
  module Group
4
3
  class Base < Regexp::Expression::Subexpression
5
- def capturing?
6
- [:capture, :named].include?(token)
7
- end
8
-
9
- def comment?
10
- type == :comment
11
- end
12
-
13
4
  def to_s(format = :full)
14
5
  "#{text}#{expressions.join})#{quantifier_affix(format)}"
15
6
  end
7
+
8
+ def capturing?; false end
9
+
10
+ def comment?; false end
16
11
  end
17
12
 
18
- class Atomic < Group::Base; end
19
- class Capture < Group::Base; end
20
- class Passive < Group::Base; end
21
- class Options < Group::Base; end
22
- class Absence < Group::Base; end
13
+ class Atomic < Group::Base; end
14
+ class Passive < Group::Base; end
15
+ class Options < Group::Base; end
16
+ class Absence < Group::Base; end
23
17
 
24
- class Named < Group::Capture
18
+ class Capture < Group::Base
19
+ attr_accessor :number, :number_at_level
20
+
21
+ def capturing?; true end
22
+ end
23
+
24
+ class Named < Group::Capture
25
25
  attr_reader :name
26
26
 
27
27
  def initialize(token, options = {})
@@ -29,10 +29,9 @@ module Regexp::Expression
29
29
  super
30
30
  end
31
31
 
32
- def clone
33
- copy = super
34
- copy.instance_variable_set(:@name, name.dup)
35
- copy
32
+ def initialize_clone(other)
33
+ other.instance_variable_set(:@name, name.dup)
34
+ super
36
35
  end
37
36
  end
38
37
 
@@ -40,6 +39,8 @@ module Regexp::Expression
40
39
  def to_s(_format = :full)
41
40
  text.dup
42
41
  end
42
+
43
+ def comment?; true end
43
44
  end
44
45
  end
45
46
 
@@ -52,5 +53,4 @@ module Regexp::Expression
52
53
  class Lookbehind < Assertion::Base; end
53
54
  class NegativeLookbehind < Assertion::Base; end
54
55
  end
55
-
56
56
  end
@@ -1,7 +1,7 @@
1
1
  module Regexp::Expression
2
- class CharacterClass < Regexp::Expression::Base
2
+ class PosixClass < Regexp::Expression::Base
3
3
  def negative?
4
- type == :nonclass
4
+ type == :nonposixclass
5
5
  end
6
6
 
7
7
  def name
@@ -9,6 +9,10 @@ module Regexp::Expression
9
9
  def name
10
10
  text =~ /\A\\[pP]\{([^}]+)\}\z/; $1
11
11
  end
12
+
13
+ def shortcut
14
+ (Regexp::Scanner.short_prop_map.rassoc(token.to_s) || []).first
15
+ end
12
16
  end
13
17
 
14
18
  class Alnum < Base; end
@@ -36,6 +40,7 @@ module Regexp::Expression
36
40
  class Base < UnicodeProperty::Base; end
37
41
 
38
42
  class Any < Letter::Base; end
43
+ class Cased < Letter::Base; end
39
44
  class Uppercase < Letter::Base; end
40
45
  class Lowercase < Letter::Base; end
41
46
  class Titlecase < Letter::Base; end
@@ -47,6 +52,7 @@ module Regexp::Expression
47
52
  class Base < UnicodeProperty::Base; end
48
53
 
49
54
  class Any < Mark::Base; end
55
+ class Combining < Mark::Base; end
50
56
  class Nonspacing < Mark::Base; end
51
57
  class Spacing < Mark::Base; end
52
58
  class Enclosing < Mark::Base; end
@@ -1,110 +1,27 @@
1
1
  module Regexp::Expression
2
+ class CharacterSet < Regexp::Expression::Subexpression
3
+ attr_accessor :closed, :negative
2
4
 
3
- class CharacterSet < Regexp::Expression::Base
4
- attr_accessor :members
5
+ alias :negative? :negative
6
+ alias :negated? :negative
7
+ alias :closed? :closed
5
8
 
6
9
  def initialize(token, options = {})
7
- @members = []
8
- @negative = false
9
- @closed = false
10
+ self.negative = false
11
+ self.closed = false
10
12
  super
11
13
  end
12
14
 
13
- # Override base method to clone set members as well.
14
- def clone
15
- copy = super
16
- copy.members = @members.map {|m| m.clone }
17
- copy
18
- end
19
-
20
- def <<(member)
21
- if @members.last.is_a?(CharacterSubSet) and not @members.last.closed?
22
- @members.last << member
23
- else
24
- @members << member
25
- end
26
- end
27
-
28
- def include?(member, directly = false)
29
- @members.each do |m|
30
- if m.is_a?(CharacterSubSet) and not directly
31
- return true if m.include?(member)
32
- else
33
- return true if member == m.to_s
34
- end
35
- end; false
36
- end
37
-
38
- def each(&block)
39
- @members.each {|m| yield m}
40
- end
41
-
42
- def each_with_index(&block)
43
- @members.each_with_index {|m, i| yield m, i}
44
- end
45
-
46
- def length
47
- @members.length
48
- end
49
-
50
15
  def negate
51
- if @members.last.is_a?(CharacterSubSet)
52
- @members.last.negate
53
- else
54
- @negative = true
55
- end
56
- end
57
-
58
- def negative?
59
- @negative
16
+ self.negative = true
60
17
  end
61
- alias :negated? :negative?
62
18
 
63
19
  def close
64
- if @members.last.is_a?(CharacterSubSet) and not @members.last.closed?
65
- @members.last.close
66
- else
67
- @closed = true
68
- end
69
- end
70
-
71
- def closed?
72
- @closed
73
- end
74
-
75
- # Returns an array of the members with any shorthand members like \d and \W
76
- # expanded to either traditional form or unicode properties.
77
- def expand_members(use_properties = false)
78
- @members.map do |member|
79
- case member
80
- when "\\d"
81
- use_properties ? '\p{Digit}' : '0-9'
82
- when "\\D"
83
- use_properties ? '\P{Digit}' : '^0-9'
84
- when "\\w"
85
- use_properties ? '\p{Word}' : 'A-Za-z0-9_'
86
- when "\\W"
87
- use_properties ? '\P{Word}' : '^A-Za-z0-9_'
88
- when "\\s"
89
- use_properties ? '\p{Space}' : ' \t\f\v\n\r'
90
- when "\\S"
91
- use_properties ? '\P{Space}' : '^ \t\f\v\n\r'
92
- when "\\h"
93
- use_properties ? '\p{Xdigit}' : '0-9A-Fa-f'
94
- when "\\H"
95
- use_properties ? '\P{Xdigit}' : '^0-9A-Fa-f'
96
- else
97
- member
98
- end
99
- end
20
+ self.closed = true
100
21
  end
101
22
 
102
23
  def to_s(format = :full)
103
- "#{text}#{'^' if negative?}#{members.join}]#{quantifier_affix(format)}"
24
+ "#{text}#{'^' if negated?}#{expressions.join}]#{quantifier_affix(format)}"
104
25
  end
105
26
  end
106
-
107
- class CharacterSubSet < CharacterSet
108
- end
109
-
110
27
  end # module Regexp::Expression
@@ -0,0 +1,9 @@
1
+ module Regexp::Expression
2
+ class CharacterSet < Regexp::Expression::Subexpression
3
+ class IntersectedSequence < Regexp::Expression::Sequence; end
4
+
5
+ class Intersection < Regexp::Expression::SequenceOperation
6
+ OPERAND = IntersectedSequence
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,23 @@
1
+ module Regexp::Expression
2
+ class CharacterSet < Regexp::Expression::Subexpression
3
+ class Range < Regexp::Expression::Subexpression
4
+ def starts_at
5
+ expressions.first.starts_at
6
+ end
7
+ alias :ts :starts_at
8
+
9
+ def <<(exp)
10
+ complete? && raise("Can't add more than 2 expressions to a Range")
11
+ super
12
+ end
13
+
14
+ def complete?
15
+ count == 2
16
+ end
17
+
18
+ def to_s(_format = :full)
19
+ expressions.join(text)
20
+ end
21
+ end
22
+ end
23
+ end
@@ -40,14 +40,16 @@ module Regexp::Expression
40
40
 
41
41
  part = {}
42
42
 
43
+ print_level = nesting_level > 0 ? nesting_level - 1 : nil
44
+
43
45
  # Order is important! Fields that use other fields in their
44
46
  # definition must appear before the fields they use.
45
47
  part_keys = %w{a m b o i l x s e S y k c q Q z Z t ~t T >}
46
48
  part.keys.each {|k| part[k] = "<?#{k}?>"}
47
49
 
48
- part['>'] = level ? (' ' * (level + indent_offset)) : ''
50
+ part['>'] = print_level ? (' ' * (print_level + indent_offset)) : ''
49
51
 
50
- part['l'] = level ? "#{'%d' % level}" : 'root'
52
+ part['l'] = print_level ? "#{'%d' % print_level}" : 'root'
51
53
  part['x'] = "#{'%d' % index}" if have_index
52
54
 
53
55
  part['s'] = starts_at
@@ -101,9 +103,9 @@ module Regexp::Expression
101
103
  def strfregexp_tree(format = '%a', include_self = true, separator = "\n")
102
104
  output = include_self ? [self.strfregexp(format)] : []
103
105
 
104
- output += map {|exp, index|
106
+ output += flat_map do |exp, index|
105
107
  exp.strfregexp(format, (include_self ? 1 : 0), index)
106
- }
108
+ end
107
109
 
108
110
  output.join(separator)
109
111
  end