regexp_parser 0.5.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (81)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +242 -0
  3. data/Gemfile +1 -0
  4. data/README.md +21 -17
  5. data/Rakefile +31 -0
  6. data/lib/regexp_parser/expression.rb +11 -9
  7. data/lib/regexp_parser/expression/classes/alternation.rb +5 -28
  8. data/lib/regexp_parser/expression/classes/backref.rb +21 -16
  9. data/lib/regexp_parser/expression/classes/escape.rb +81 -10
  10. data/lib/regexp_parser/expression/classes/group.rb +20 -20
  11. data/lib/regexp_parser/expression/classes/{character_class.rb → posix_class.rb} +2 -2
  12. data/lib/regexp_parser/expression/classes/property.rb +6 -0
  13. data/lib/regexp_parser/expression/classes/set.rb +10 -93
  14. data/lib/regexp_parser/expression/classes/set/intersection.rb +9 -0
  15. data/lib/regexp_parser/expression/classes/set/range.rb +23 -0
  16. data/lib/regexp_parser/expression/methods/strfregexp.rb +6 -4
  17. data/lib/regexp_parser/expression/methods/tests.rb +4 -14
  18. data/lib/regexp_parser/expression/methods/traverse.rb +1 -1
  19. data/lib/regexp_parser/expression/quantifier.rb +3 -4
  20. data/lib/regexp_parser/expression/sequence_operation.rb +34 -0
  21. data/lib/regexp_parser/expression/subexpression.rb +6 -10
  22. data/lib/regexp_parser/lexer.rb +13 -17
  23. data/lib/regexp_parser/parser.rb +170 -116
  24. data/lib/regexp_parser/scanner.rb +952 -2431
  25. data/lib/regexp_parser/scanner/char_type.rl +31 -0
  26. data/lib/regexp_parser/scanner/properties/long.yml +561 -0
  27. data/lib/regexp_parser/scanner/properties/short.yml +225 -0
  28. data/lib/regexp_parser/scanner/property.rl +7 -806
  29. data/lib/regexp_parser/scanner/scanner.rl +112 -154
  30. data/lib/regexp_parser/syntax/base.rb +4 -4
  31. data/lib/regexp_parser/syntax/tokens.rb +1 -0
  32. data/lib/regexp_parser/syntax/tokens/backref.rb +2 -2
  33. data/lib/regexp_parser/syntax/tokens/character_set.rb +3 -38
  34. data/lib/regexp_parser/syntax/tokens/escape.rb +2 -3
  35. data/lib/regexp_parser/syntax/tokens/group.rb +5 -4
  36. data/lib/regexp_parser/syntax/tokens/{character_class.rb → posix_class.rb} +5 -5
  37. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +519 -266
  38. data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -4
  39. data/lib/regexp_parser/syntax/versions/1.9.1.rb +4 -10
  40. data/lib/regexp_parser/syntax/versions/2.0.0.rb +0 -2
  41. data/lib/regexp_parser/syntax/versions/2.4.1.rb +1 -1
  42. data/lib/regexp_parser/version.rb +1 -1
  43. data/regexp_parser.gemspec +2 -1
  44. data/test/expression/test_base.rb +2 -1
  45. data/test/expression/test_clone.rb +0 -57
  46. data/test/expression/test_set.rb +31 -8
  47. data/test/expression/test_strfregexp.rb +13 -4
  48. data/test/expression/test_subexpression.rb +25 -0
  49. data/test/expression/test_traverse.rb +25 -25
  50. data/test/helpers.rb +1 -0
  51. data/test/lexer/test_all.rb +1 -1
  52. data/test/lexer/test_conditionals.rb +9 -7
  53. data/test/lexer/test_nesting.rb +39 -21
  54. data/test/lexer/test_refcalls.rb +4 -4
  55. data/test/parser/set/test_intersections.rb +127 -0
  56. data/test/parser/set/test_ranges.rb +111 -0
  57. data/test/parser/test_all.rb +4 -1
  58. data/test/parser/test_escapes.rb +41 -9
  59. data/test/parser/test_groups.rb +22 -3
  60. data/test/parser/test_posix_classes.rb +27 -0
  61. data/test/parser/test_properties.rb +17 -290
  62. data/test/parser/test_refcalls.rb +66 -26
  63. data/test/parser/test_sets.rb +132 -129
  64. data/test/scanner/test_all.rb +1 -7
  65. data/test/scanner/test_conditionals.rb +16 -16
  66. data/test/scanner/test_errors.rb +0 -30
  67. data/test/scanner/test_escapes.rb +1 -2
  68. data/test/scanner/test_free_space.rb +28 -28
  69. data/test/scanner/test_groups.rb +35 -35
  70. data/test/scanner/test_meta.rb +1 -1
  71. data/test/scanner/test_properties.rb +87 -114
  72. data/test/scanner/test_refcalls.rb +18 -18
  73. data/test/scanner/test_scripts.rb +19 -351
  74. data/test/scanner/test_sets.rb +87 -60
  75. data/test/scanner/test_unicode_blocks.rb +4 -105
  76. data/test/support/warning_extractor.rb +1 -1
  77. data/test/syntax/test_syntax.rb +7 -0
  78. data/test/syntax/versions/test_1.8.rb +2 -4
  79. metadata +17 -7
  80. data/ChangeLog +0 -325
  81. data/test/scanner/test_emojis.rb +0 -31
@@ -1,8 +1,16 @@
1
1
  module Regexp::Expression
2
-
3
2
  module Backreference
4
3
  class Base < Regexp::Expression::Base; end
5
4
 
5
+ class Number < Backreference::Base
6
+ attr_reader :number
7
+
8
+ def initialize(token, options = {})
9
+ @number = token.text[token.token.equal?(:number) ? 1..-1 : 3..-2].to_i
10
+ super
11
+ end
12
+ end
13
+
6
14
  class Name < Backreference::Base
7
15
  attr_reader :name
8
16
 
@@ -12,31 +20,28 @@ module Regexp::Expression
12
20
  end
13
21
  end
14
22
 
15
- class Number < Backreference::Base
16
- attr_reader :number
23
+ class NumberCall < Backreference::Number; end
24
+ class NumberRelative < Backreference::Number; end
25
+ class NumberCallRelative < Backreference::Number; end
26
+ class NameCall < Backreference::Name; end
27
+
28
+ class NumberRecursionLevel < Backreference::Base
29
+ attr_reader :number, :recursion_level
17
30
 
18
31
  def initialize(token, options = {})
19
- @number = token.text[token.token.equal?(:number) ? 1..-1 : 3..-2]
32
+ @number, @recursion_level = token.text[3..-2].split(/(?=[+-])/).map(&:to_i)
20
33
  super
21
34
  end
22
35
  end
23
36
 
24
- class NumberRelative < Backreference::Number; end
25
-
26
- class NameNestLevel < Backreference::Base; end
27
- class NumberNestLevel < Backreference::Base; end
28
-
29
- class NameCall < Backreference::Base
30
- attr_reader :name
37
+ class NameRecursionLevel < Backreference::Base
38
+ attr_reader :name, :recursion_level
31
39
 
32
40
  def initialize(token, options = {})
33
- @name = token.text[3..-2]
41
+ @name, recursion_level = token.text[3..-2].split(/(?=[+-])/)
42
+ @recursion_level = recursion_level.to_i
34
43
  super
35
44
  end
36
45
  end
37
-
38
- class NumberCall < Backreference::Base; end
39
- class NumberCallRelative < Backreference::Base; end
40
46
  end
41
-
42
47
  end
@@ -1,9 +1,23 @@
1
1
  module Regexp::Expression
2
-
3
2
  module EscapeSequence
4
- class Base < Regexp::Expression::Base; end
3
+ class Base < Regexp::Expression::Base
4
+ require 'yaml'
5
+
6
+ def char
7
+ # poor man's unescape without using eval
8
+ YAML.load(%Q(---\n"#{text}"\n))
9
+ end
5
10
 
6
- class Literal < EscapeSequence::Base; end
11
+ def codepoint
12
+ char.ord
13
+ end
14
+ end
15
+
16
+ class Literal < EscapeSequence::Base
17
+ def char
18
+ text[1..-1]
19
+ end
20
+ end
7
21
 
8
22
  class AsciiEscape < EscapeSequence::Base; end
9
23
  class Backspace < EscapeSequence::Base; end
@@ -11,17 +25,74 @@ module Regexp::Expression
11
25
  class FormFeed < EscapeSequence::Base; end
12
26
  class Newline < EscapeSequence::Base; end
13
27
  class Return < EscapeSequence::Base; end
14
- class Space < EscapeSequence::Base; end
15
28
  class Tab < EscapeSequence::Base; end
16
29
  class VerticalTab < EscapeSequence::Base; end
17
30
 
18
- class Octal < EscapeSequence::Base; end
19
31
  class Hex < EscapeSequence::Base; end
20
- class HexWide < EscapeSequence::Base; end
32
+ class Codepoint < EscapeSequence::Base; end
21
33
 
22
- class Control < EscapeSequence::Base; end
23
- class Meta < EscapeSequence::Base; end
24
- class MetaControl < EscapeSequence::Base; end
25
- end
34
+ class CodepointList < EscapeSequence::Base
35
+ def char
36
+ raise NoMethodError, 'CodepointList responds only to #chars'
37
+ end
38
+
39
+ def codepoint
40
+ raise NoMethodError, 'CodepointList responds only to #codepoints'
41
+ end
42
+
43
+ def chars
44
+ codepoints.map { |cp| cp.chr('utf-8') }
45
+ end
46
+
47
+ def codepoints
48
+ text.scan(/\h+/).map(&:hex)
49
+ end
50
+ end
51
+
52
+ class Octal < EscapeSequence::Base
53
+ def char
54
+ text[1..-1].to_i(8).chr('utf-8')
55
+ end
56
+ end
26
57
 
58
+ class AbstractMetaControlSequence < EscapeSequence::Base
59
+ def char
60
+ codepoint.chr('utf-8')
61
+ end
62
+
63
+ def codepoint
64
+ raise NotImplementedError, 'implement in subclass'
65
+ end
66
+
67
+ private
68
+
69
+ def control_sequence_to_s(control_sequence)
70
+ five_lsb = control_sequence.unpack('B*').first[-5..-1]
71
+ ["000#{five_lsb}"].pack('B*')
72
+ end
73
+
74
+ def meta_char_to_codepoint(meta_char)
75
+ byte_value = meta_char.ord
76
+ byte_value < 128 ? byte_value + 128 : byte_value
77
+ end
78
+ end
79
+
80
+ class Control < AbstractMetaControlSequence
81
+ def codepoint
82
+ control_sequence_to_s(text).ord
83
+ end
84
+ end
85
+
86
+ class Meta < AbstractMetaControlSequence
87
+ def codepoint
88
+ meta_char_to_codepoint(text[-1])
89
+ end
90
+ end
91
+
92
+ class MetaControl < AbstractMetaControlSequence
93
+ def codepoint
94
+ meta_char_to_codepoint(control_sequence_to_s(text))
95
+ end
96
+ end
97
+ end
27
98
  end
@@ -1,27 +1,27 @@
1
1
  module Regexp::Expression
2
-
3
2
  module Group
4
3
  class Base < Regexp::Expression::Subexpression
5
- def capturing?
6
- [:capture, :named].include?(token)
7
- end
8
-
9
- def comment?
10
- type == :comment
11
- end
12
-
13
4
  def to_s(format = :full)
14
5
  "#{text}#{expressions.join})#{quantifier_affix(format)}"
15
6
  end
7
+
8
+ def capturing?; false end
9
+
10
+ def comment?; false end
16
11
  end
17
12
 
18
- class Atomic < Group::Base; end
19
- class Capture < Group::Base; end
20
- class Passive < Group::Base; end
21
- class Options < Group::Base; end
22
- class Absence < Group::Base; end
13
+ class Atomic < Group::Base; end
14
+ class Passive < Group::Base; end
15
+ class Options < Group::Base; end
16
+ class Absence < Group::Base; end
23
17
 
24
- class Named < Group::Capture
18
+ class Capture < Group::Base
19
+ attr_accessor :number, :number_at_level
20
+
21
+ def capturing?; true end
22
+ end
23
+
24
+ class Named < Group::Capture
25
25
  attr_reader :name
26
26
 
27
27
  def initialize(token, options = {})
@@ -29,10 +29,9 @@ module Regexp::Expression
29
29
  super
30
30
  end
31
31
 
32
- def clone
33
- copy = super
34
- copy.instance_variable_set(:@name, name.dup)
35
- copy
32
+ def initialize_clone(other)
33
+ other.instance_variable_set(:@name, name.dup)
34
+ super
36
35
  end
37
36
  end
38
37
 
@@ -40,6 +39,8 @@ module Regexp::Expression
40
39
  def to_s(_format = :full)
41
40
  text.dup
42
41
  end
42
+
43
+ def comment?; true end
43
44
  end
44
45
  end
45
46
 
@@ -52,5 +53,4 @@ module Regexp::Expression
52
53
  class Lookbehind < Assertion::Base; end
53
54
  class NegativeLookbehind < Assertion::Base; end
54
55
  end
55
-
56
56
  end
@@ -1,7 +1,7 @@
1
1
  module Regexp::Expression
2
- class CharacterClass < Regexp::Expression::Base
2
+ class PosixClass < Regexp::Expression::Base
3
3
  def negative?
4
- type == :nonclass
4
+ type == :nonposixclass
5
5
  end
6
6
 
7
7
  def name
@@ -9,6 +9,10 @@ module Regexp::Expression
9
9
  def name
10
10
  text =~ /\A\\[pP]\{([^}]+)\}\z/; $1
11
11
  end
12
+
13
+ def shortcut
14
+ (Regexp::Scanner.short_prop_map.rassoc(token.to_s) || []).first
15
+ end
12
16
  end
13
17
 
14
18
  class Alnum < Base; end
@@ -36,6 +40,7 @@ module Regexp::Expression
36
40
  class Base < UnicodeProperty::Base; end
37
41
 
38
42
  class Any < Letter::Base; end
43
+ class Cased < Letter::Base; end
39
44
  class Uppercase < Letter::Base; end
40
45
  class Lowercase < Letter::Base; end
41
46
  class Titlecase < Letter::Base; end
@@ -47,6 +52,7 @@ module Regexp::Expression
47
52
  class Base < UnicodeProperty::Base; end
48
53
 
49
54
  class Any < Mark::Base; end
55
+ class Combining < Mark::Base; end
50
56
  class Nonspacing < Mark::Base; end
51
57
  class Spacing < Mark::Base; end
52
58
  class Enclosing < Mark::Base; end
@@ -1,110 +1,27 @@
1
1
  module Regexp::Expression
2
+ class CharacterSet < Regexp::Expression::Subexpression
3
+ attr_accessor :closed, :negative
2
4
 
3
- class CharacterSet < Regexp::Expression::Base
4
- attr_accessor :members
5
+ alias :negative? :negative
6
+ alias :negated? :negative
7
+ alias :closed? :closed
5
8
 
6
9
  def initialize(token, options = {})
7
- @members = []
8
- @negative = false
9
- @closed = false
10
+ self.negative = false
11
+ self.closed = false
10
12
  super
11
13
  end
12
14
 
13
- # Override base method to clone set members as well.
14
- def clone
15
- copy = super
16
- copy.members = @members.map {|m| m.clone }
17
- copy
18
- end
19
-
20
- def <<(member)
21
- if @members.last.is_a?(CharacterSubSet) and not @members.last.closed?
22
- @members.last << member
23
- else
24
- @members << member
25
- end
26
- end
27
-
28
- def include?(member, directly = false)
29
- @members.each do |m|
30
- if m.is_a?(CharacterSubSet) and not directly
31
- return true if m.include?(member)
32
- else
33
- return true if member == m.to_s
34
- end
35
- end; false
36
- end
37
-
38
- def each(&block)
39
- @members.each {|m| yield m}
40
- end
41
-
42
- def each_with_index(&block)
43
- @members.each_with_index {|m, i| yield m, i}
44
- end
45
-
46
- def length
47
- @members.length
48
- end
49
-
50
15
  def negate
51
- if @members.last.is_a?(CharacterSubSet)
52
- @members.last.negate
53
- else
54
- @negative = true
55
- end
56
- end
57
-
58
- def negative?
59
- @negative
16
+ self.negative = true
60
17
  end
61
- alias :negated? :negative?
62
18
 
63
19
  def close
64
- if @members.last.is_a?(CharacterSubSet) and not @members.last.closed?
65
- @members.last.close
66
- else
67
- @closed = true
68
- end
69
- end
70
-
71
- def closed?
72
- @closed
73
- end
74
-
75
- # Returns an array of the members with any shorthand members like \d and \W
76
- # expanded to either traditional form or unicode properties.
77
- def expand_members(use_properties = false)
78
- @members.map do |member|
79
- case member
80
- when "\\d"
81
- use_properties ? '\p{Digit}' : '0-9'
82
- when "\\D"
83
- use_properties ? '\P{Digit}' : '^0-9'
84
- when "\\w"
85
- use_properties ? '\p{Word}' : 'A-Za-z0-9_'
86
- when "\\W"
87
- use_properties ? '\P{Word}' : '^A-Za-z0-9_'
88
- when "\\s"
89
- use_properties ? '\p{Space}' : ' \t\f\v\n\r'
90
- when "\\S"
91
- use_properties ? '\P{Space}' : '^ \t\f\v\n\r'
92
- when "\\h"
93
- use_properties ? '\p{Xdigit}' : '0-9A-Fa-f'
94
- when "\\H"
95
- use_properties ? '\P{Xdigit}' : '^0-9A-Fa-f'
96
- else
97
- member
98
- end
99
- end
20
+ self.closed = true
100
21
  end
101
22
 
102
23
  def to_s(format = :full)
103
- "#{text}#{'^' if negative?}#{members.join}]#{quantifier_affix(format)}"
24
+ "#{text}#{'^' if negated?}#{expressions.join}]#{quantifier_affix(format)}"
104
25
  end
105
26
  end
106
-
107
- class CharacterSubSet < CharacterSet
108
- end
109
-
110
27
  end # module Regexp::Expression
@@ -0,0 +1,9 @@
1
+ module Regexp::Expression
2
+ class CharacterSet < Regexp::Expression::Subexpression
3
+ class IntersectedSequence < Regexp::Expression::Sequence; end
4
+
5
+ class Intersection < Regexp::Expression::SequenceOperation
6
+ OPERAND = IntersectedSequence
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,23 @@
1
+ module Regexp::Expression
2
+ class CharacterSet < Regexp::Expression::Subexpression
3
+ class Range < Regexp::Expression::Subexpression
4
+ def starts_at
5
+ expressions.first.starts_at
6
+ end
7
+ alias :ts :starts_at
8
+
9
+ def <<(exp)
10
+ complete? && raise("Can't add more than 2 expressions to a Range")
11
+ super
12
+ end
13
+
14
+ def complete?
15
+ count == 2
16
+ end
17
+
18
+ def to_s(_format = :full)
19
+ expressions.join(text)
20
+ end
21
+ end
22
+ end
23
+ end
@@ -40,14 +40,16 @@ module Regexp::Expression
40
40
 
41
41
  part = {}
42
42
 
43
+ print_level = nesting_level > 0 ? nesting_level - 1 : nil
44
+
43
45
  # Order is important! Fields that use other fields in their
44
46
  # definition must appear before the fields they use.
45
47
  part_keys = %w{a m b o i l x s e S y k c q Q z Z t ~t T >}
46
48
  part.keys.each {|k| part[k] = "<?#{k}?>"}
47
49
 
48
- part['>'] = level ? (' ' * (level + indent_offset)) : ''
50
+ part['>'] = print_level ? (' ' * (print_level + indent_offset)) : ''
49
51
 
50
- part['l'] = level ? "#{'%d' % level}" : 'root'
52
+ part['l'] = print_level ? "#{'%d' % print_level}" : 'root'
51
53
  part['x'] = "#{'%d' % index}" if have_index
52
54
 
53
55
  part['s'] = starts_at
@@ -101,9 +103,9 @@ module Regexp::Expression
101
103
  def strfregexp_tree(format = '%a', include_self = true, separator = "\n")
102
104
  output = include_self ? [self.strfregexp(format)] : []
103
105
 
104
- output += map {|exp, index|
106
+ output += flat_map do |exp, index|
105
107
  exp.strfregexp(format, (include_self ? 1 : 0), index)
106
- }
108
+ end
107
109
 
108
110
  output.join(separator)
109
111
  end