regexp_parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. data/ChangeLog +4 -0
  2. data/LICENSE +22 -0
  3. data/README.rdoc +307 -0
  4. data/Rakefile +91 -0
  5. data/lib/regexp_parser/ctype.rb +48 -0
  6. data/lib/regexp_parser/expression/property.rb +108 -0
  7. data/lib/regexp_parser/expression/set.rb +59 -0
  8. data/lib/regexp_parser/expression.rb +287 -0
  9. data/lib/regexp_parser/lexer.rb +105 -0
  10. data/lib/regexp_parser/parser.rb +417 -0
  11. data/lib/regexp_parser/scanner/property.rl +534 -0
  12. data/lib/regexp_parser/scanner/scanner.rl +712 -0
  13. data/lib/regexp_parser/scanner.rb +3325 -0
  14. data/lib/regexp_parser/syntax/ruby/1.8.6.rb +14 -0
  15. data/lib/regexp_parser/syntax/ruby/1.8.7.rb +14 -0
  16. data/lib/regexp_parser/syntax/ruby/1.8.rb +39 -0
  17. data/lib/regexp_parser/syntax/ruby/1.9.1.rb +39 -0
  18. data/lib/regexp_parser/syntax/ruby/1.9.2.rb +10 -0
  19. data/lib/regexp_parser/syntax/ruby/1.9.3.rb +24 -0
  20. data/lib/regexp_parser/syntax/ruby/1.9.rb +8 -0
  21. data/lib/regexp_parser/syntax/tokens.rb +332 -0
  22. data/lib/regexp_parser/syntax.rb +172 -0
  23. data/lib/regexp_parser.rb +45 -0
  24. data/test/helpers.rb +8 -0
  25. data/test/lexer/test_all.rb +26 -0
  26. data/test/lexer/test_literals.rb +120 -0
  27. data/test/lexer/test_nesting.rb +107 -0
  28. data/test/lexer/test_refcalls.rb +45 -0
  29. data/test/parser/test_all.rb +44 -0
  30. data/test/parser/test_alternation.rb +46 -0
  31. data/test/parser/test_anchors.rb +35 -0
  32. data/test/parser/test_errors.rb +59 -0
  33. data/test/parser/test_escapes.rb +48 -0
  34. data/test/parser/test_expression.rb +51 -0
  35. data/test/parser/test_groups.rb +69 -0
  36. data/test/parser/test_properties.rb +346 -0
  37. data/test/parser/test_quantifiers.rb +236 -0
  38. data/test/parser/test_refcalls.rb +101 -0
  39. data/test/parser/test_sets.rb +99 -0
  40. data/test/scanner/test_all.rb +30 -0
  41. data/test/scanner/test_anchors.rb +35 -0
  42. data/test/scanner/test_errors.rb +36 -0
  43. data/test/scanner/test_escapes.rb +49 -0
  44. data/test/scanner/test_groups.rb +41 -0
  45. data/test/scanner/test_literals.rb +85 -0
  46. data/test/scanner/test_meta.rb +36 -0
  47. data/test/scanner/test_properties.rb +315 -0
  48. data/test/scanner/test_quantifiers.rb +38 -0
  49. data/test/scanner/test_refcalls.rb +45 -0
  50. data/test/scanner/test_scripts.rb +314 -0
  51. data/test/scanner/test_sets.rb +80 -0
  52. data/test/scanner/test_types.rb +30 -0
  53. data/test/syntax/ruby/test_1.8.rb +57 -0
  54. data/test/syntax/ruby/test_1.9.1.rb +39 -0
  55. data/test/syntax/ruby/test_1.9.3.rb +38 -0
  56. data/test/syntax/ruby/test_all.rb +12 -0
  57. data/test/syntax/test_all.rb +19 -0
  58. data/test/test_all.rb +4 -0
  59. metadata +160 -0
@@ -0,0 +1,172 @@
1
+ module Regexp::Syntax
2
+ require File.expand_path('../syntax/tokens', __FILE__)
3
+
4
# Base class for every error raised by the syntax layer; it simply carries
# the message it is given.
class SyntaxError < StandardError
  def initialize(what)
    super(what)
  end
end
9
+
10
# Error whose message reports that the given syntax name is unknown.
class UnknownSyntaxNameError < SyntaxError
  def initialize(name)
    message = "Unknown syntax name '#{name}'"
    super(message)
  end
end
15
+
16
# Error whose message reports that no specification file exists for the
# given syntax name.
class MissingSyntaxSpecError < SyntaxError
  def initialize(name)
    message = "Missing syntax specification file for '#{name}'"
    super(message)
  end
end
21
+
22
# Error reporting that a syntax object does not implement a type/token pair.
# The message names the class of the syntax object and the offending pair.
class NotImplementedError < SyntaxError
  def initialize(syntax, type, token)
    syntax_name = syntax.class.name
    super("#{syntax_name} does not implement: [#{type}:#{token}]")
  end
end
27
+
28
+ SYNTAX_SPEC_ROOT = File.expand_path('../syntax', __FILE__)
29
+
30
# Loads the specification file for the given syntax flavor name and returns
# a new instance of the matching syntax class. The special names 'any' and
# '*' return an instance of Syntax::Any without loading any file.
#
# name - the syntax flavor name, e.g. 'ruby/1.9.3'.
#
# Returns the new syntax object.
# Raises UnknownSyntaxNameError when the specification file loads but no
# class is mapped to the name here. Errors raised by Syntax.load propagate.
def self.new(name)
  return Regexp::Syntax::Any.new if ['*', 'any'].include?(name.to_s)

  self.load(name)

  case name
  when 'ruby/1.8.6' then Regexp::Syntax::Ruby::V186.new
  when 'ruby/1.8.7' then Regexp::Syntax::Ruby::V187.new

  # alias for the latest 1.8 implementation
  when 'ruby/1.8'   then Regexp::Syntax::Ruby::V18.new

  when 'ruby/1.9.1' then Regexp::Syntax::Ruby::V191.new
  when 'ruby/1.9.2' then Regexp::Syntax::Ruby::V192.new
  when 'ruby/1.9.3' then Regexp::Syntax::Ruby::V193.new

  # alias for the latest 1.9 implementation
  when 'ruby/1.9'   then Regexp::Syntax::Ruby::V19.new

  else
    # The error class is UnknownSyntaxNameError; the previous reference to
    # UnknownSyntaxError named a constant that does not exist and surfaced
    # as a NameError instead of the intended error.
    raise UnknownSyntaxNameError.new(name)
  end
end
57
+
58
# Requires the specification file for the named syntax, if one exists under
# SYNTAX_SPEC_ROOT. The name is downcased, and the .rb extension is appended
# only when the name does not already end with it.
#
# name - the syntax flavor name, with or without a trailing '.rb'.
#
# Returns the result of require.
# Raises MissingSyntaxSpecError when no such file exists.
def self.load(name)
  full = "#{SYNTAX_SPEC_ROOT}/#{name.to_s.downcase}"

  # The previous check, full[-1, 3], only ever yielded the final character,
  # so names that already carried the extension got a second '.rb' appended.
  full = "#{full}.rb" unless full.end_with?('.rb')

  raise MissingSyntaxSpecError.new(name) unless File.exist?(full)
  require full
end
67
+
68
# A lookup map of the types and tokens supported by a given syntax.
#
# The map is a hash of type => array of tokens. A new instance implements
# only the :literal type with the :literal token.
class Base
  def initialize
    @implements = {}

    implements :literal, [:literal]
  end

  # Returns the underlying type => tokens hash.
  def implementation
    @implements
  end

  # Adds tokens to the list implemented for type.
  #
  # tokens - a single token or an array of tokens. The list is always
  #          stored as a new array, so later changes to the caller's array
  #          do not leak into the map.
  def implements(type, tokens)
    existing = @implements[type] || []
    @implements[type] = (existing + Array(tokens)).uniq
  end

  # Removes tokens from the list implemented for type.
  #
  # tokens - a single token, an array of tokens, or nil to remove the whole
  #          type. A type left with no tokens is set to nil.
  def excludes(type, tokens)
    current = @implements[type]
    return unless current

    remaining = tokens ? current - Array(tokens) : []
    @implements[type] = remaining.empty? ? nil : remaining
  end

  # Returns true if the type/token pair is implemented, false otherwise.
  def implements?(type, token)
    tokens = @implements[type]
    tokens ? tokens.include?(token) : false
  end

  # Same as implements?. Written as a delegating method rather than an
  # alias so that subclasses overriding implements? are honored.
  def check?(type, token)
    implements?(type, token)
  end

  # Raises NotImplementedError unless the type/token pair is implemented.
  def implements!(type, token)
    raise NotImplementedError.new(self, type, token) unless
      implements?(type, token)
  end

  # Same as implements!. Delegates for the same reason as check?.
  def check!(type, token)
    implements!(type, token)
  end

  # Maps delimiter-specific group and backref tokens to their generic form.
  # Returns a [type, token] pair; other types pass through unchanged.
  def normalize(type, token)
    case type
    when :group   then normalize_group(type, token)
    when :backref then normalize_backref(type, token)
    else [type, token]
    end
  end

  # Collapses both named group tokens into [:group, :named].
  def normalize_group(type, token)
    case token
    when :named_ab, :named_sq
      [:group, :named]
    else
      [type, token]
    end
  end

  # Collapses the _ab and _sq variants of each backref token into the token
  # name without that suffix.
  def normalize_backref(type, token)
    case token
    when :name_ref_ab, :name_ref_sq
      [:backref, :name_ref]
    when :name_call_ab, :name_call_sq
      [:backref, :name_call]
    when :name_nest_ref_ab, :name_nest_ref_sq
      [:backref, :name_nest_ref]
    when :number_ref_ab, :number_ref_sq
      [:backref, :number_ref]
    when :number_call_ab, :number_call_sq
      [:backref, :number_call]
    when :number_rel_ref_ab, :number_rel_ref_sq
      [:backref, :number_rel_ref]
    when :number_rel_call_ab, :number_rel_call_sq
      [:backref, :number_rel_call]
    when :number_nest_ref_ab, :number_nest_ref_sq
      [:backref, :number_nest_ref]
    else
      [type, token]
    end
  end
end
159
+
160
# A syntax that always returns true, passing all tokens as implemented. This
# is useful during development and testing, and should be useful for some
# types of transformations as well.
class Any < Base
  def initialize
    @implements = { :* => [:*] }
  end

  def implements?(type, token)
    true
  end

  def implements!(type, token)
    true
  end

  # An alias is bound to the method body that exists when the alias is
  # declared, so aliases inherited from the parent class would keep running
  # the parent's implementations. Declare them again against the overrides
  # above so check? and check! also accept everything.
  alias :check? :implements?
  alias :check! :implements!
end
171
+
172
+ end
@@ -0,0 +1,45 @@
1
class Regexp

  module Parser
    VERSION = '0.0.1'
  end

  # Field names of a token, in the order the Token struct takes them.
  TOKEN_KEYS = [:type, :token, :text, :ts, :te, :depth, :set_depth].freeze

  # A token struct with helpers for its position and for links to the
  # neighbouring tokens.
  Token = Struct.new(*TOKEN_KEYS) do
    # Returns the [ts, te] pair.
    def offset
      [ts, te]
    end

    # Returns te minus ts.
    def length
      te - ts
    end

    # Returns a hash of member name (as a symbol) => member value.
    def to_h
      members.inject({}) do |fields, member|
        key = member.to_sym
        fields[key] = send(key)
        fields
      end
    end

    # With an argument, stores it as the next token; always returns the
    # currently stored next token.
    def next(exp = nil)
      @next = exp if exp
      @next
    end

    # With an argument, stores it as the previous token; always returns the
    # currently stored previous token.
    def previous(exp = nil)
      @previous = exp if exp
      @previous
    end
  end

end
42
+
43
+ %w{ctype scanner syntax lexer parser}.each do |file|
44
+ require File.expand_path("../regexp_parser/#{file}", __FILE__)
45
+ end
data/test/helpers.rb ADDED
@@ -0,0 +1,8 @@
1
+ require "test/unit"
2
+ require File.expand_path("../../lib/regexp_parser", __FILE__)
3
+
4
+ RS = Regexp::Scanner
5
+ RL = Regexp::Lexer
6
+ RP = Regexp::Parser
7
+
8
+ include Regexp::Expression
@@ -0,0 +1,26 @@
1
+ require File.expand_path("../../helpers", __FILE__)
2
+
3
+ %w{
4
+ literals nesting refcalls
5
+ }.each do|tc|
6
+ require File.expand_path("../test_#{tc}", __FILE__)
7
+ end
8
+
9
# Basic checks on what the lexer returns for a scan.
class TestRegexpLexer < Test::Unit::TestCase

  # A scan returns an Array.
  def test_lexer_returns_an_array
    result = RL.scan('abc')
    assert_instance_of( Array, result )
  end

  # Every element of the result is a Regexp::Token.
  def test_lexer_returns_tokens
    tokens = RL.scan('^abc+[^one]{2,3}\b\d\\\C-C$')
    only_tokens = tokens.all? { |token| token.kind_of?(Regexp::Token) }
    assert( only_tokens, "Not all array members are tokens" )
  end

  # Scanning a Regexp object yields the expected number of tokens.
  def test_lexer_token_count
    tokens = RL.scan(/^(one|two){2,3}([^d\]efm-qz\,\-]*)(ghi)+$/i)
    assert_equal( 26, tokens.length )
  end

end
@@ -0,0 +1,120 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require File.expand_path("../../helpers", __FILE__)
4
+
5
# Checks the tokens the lexer emits for runs of literal characters,
# including multibyte characters. The ts/te values in the table are byte
# offsets, as the per-section comments on character width indicate.
class LexerLiterals < Test::Unit::TestCase

  # pattern => { token index => expected token as an array }
  cases = {
    # ascii, single byte characters
    'a' => {
      0     => [:literal,     :literal,       'a',        0, 1, 0, 0],
    },

    'ab+' => {
      0     => [:literal,     :literal,       'a',        0, 1, 0, 0],
      1     => [:literal,     :literal,       'b',        1, 2, 0, 0],
      2     => [:quantifier,  :one_or_more,   '+',        2, 3, 0, 0],
    },


    # 2 byte wide characters, Arabic
    'ا' => {
      0     => [:literal,     :literal,       'ا',        0, 2, 0, 0],
    },

    'aاbبcت' => {
      0     => [:literal,     :literal,       'aاbبcت',   0, 9, 0, 0],
    },

    'aاbبت?' => {
      0     => [:literal,     :literal,       'aاbب',     0, 6, 0, 0],
      1     => [:literal,     :literal,       'ت',        6, 8, 0, 0],
      2     => [:quantifier,  :zero_or_one,   '?',        8, 9, 0, 0],
    },

    'aا?bبcت+' => {
      0     => [:literal,     :literal,       'a',        0, 1, 0, 0],
      1     => [:literal,     :literal,       'ا',        1, 3, 0, 0],
      2     => [:quantifier,  :zero_or_one,   '?',        3, 4, 0, 0],
      3     => [:literal,     :literal,       'bبc',      4, 8, 0, 0],
      4     => [:literal,     :literal,       'ت',        8, 10, 0, 0],
      5     => [:quantifier,  :one_or_more,   '+',        10, 11, 0, 0],
    },

    'a(اbب+)cت?' => {
      0     => [:literal,     :literal,       'a',        0, 1, 0, 0],
      1     => [:group,       :capture,       '(',        1, 2, 0, 0],
      2     => [:literal,     :literal,       'اb',       2, 5, 1, 0],
      3     => [:literal,     :literal,       'ب',        5, 7, 1, 0],
      4     => [:quantifier,  :one_or_more,   '+',        7, 8, 1, 0],
      5     => [:group,       :close,         ')',        8, 9, 0, 0],
      6     => [:literal,     :literal,       'c',        9, 10, 0, 0],
      7     => [:literal,     :literal,       'ت',        10, 12, 0, 0],
      8     => [:quantifier,  :zero_or_one,   '?',        12, 13, 0, 0],
    },


    # 3 byte wide characters, Japanese
    'ab?れます+cd' => {
      0     => [:literal,     :literal,       'a',        0, 1, 0, 0],
      1     => [:literal,     :literal,       'b',        1, 2, 0, 0],
      2     => [:quantifier,  :zero_or_one,   '?',        2, 3, 0, 0],
      3     => [:literal,     :literal,       'れま',     3, 9, 0, 0],
      4     => [:literal,     :literal,       'す',       9, 12, 0, 0],
      5     => [:quantifier,  :one_or_more,   '+',        12, 13, 0, 0],
      6     => [:literal,     :literal,       'cd',       13, 15, 0, 0],
    },


    # 4 byte wide characters, Osmanya
    '𐒀𐒁?𐒂ab+𐒃' => {
      0     => [:literal,     :literal,       '𐒀',        0, 4, 0, 0],
      1     => [:literal,     :literal,       '𐒁',        4, 8, 0, 0],
      2     => [:quantifier,  :zero_or_one,   '?',        8, 9, 0, 0],
      3     => [:literal,     :literal,       '𐒂a',       9, 14, 0, 0],
      4     => [:literal,     :literal,       'b',        14, 15, 0, 0],
      5     => [:quantifier,  :one_or_more,   '+',        15, 16, 0, 0],
      6     => [:literal,     :literal,       '𐒃',        16, 20, 0, 0],
    },

    'mu𝄞?si*𝄫c+' => {
      0     => [:literal,     :literal,       'mu',       0, 2, 0, 0],
      1     => [:literal,     :literal,       '𝄞',        2, 6, 0, 0],
      2     => [:quantifier,  :zero_or_one,   '?',        6, 7, 0, 0],
      3     => [:literal,     :literal,       's',        7, 8, 0, 0],
      4     => [:literal,     :literal,       'i',        8, 9, 0, 0],
      5     => [:quantifier,  :zero_or_more,  '*',        9, 10, 0, 0],
      6     => [:literal,     :literal,       '𝄫',        10, 14, 0, 0],
      7     => [:literal,     :literal,       'c',        14, 15, 0, 0],
      8     => [:quantifier,  :one_or_more,   '+',        15, 16, 0, 0],
    },
  }

  # One generated test per pattern, numbered from 1 in table order.
  cases.each_with_index do |(pattern, checks), index|
    define_method "test_lex_literal_runs_#{index + 1}" do
      tokens = RL.scan(pattern)

      checks.each do |offset, token|
        assert_equal( token, tokens[offset].to_a )
      end
    end
  end

  # A quantified multibyte character yields two tokens: the character and
  # its quantifier.
  def test_lex_single_2_byte_char
    assert_equal( 2, RL.scan('ا+').length )
  end

  def test_lex_single_3_byte_char
    assert_equal( 2, RL.scan('れ+').length )
  end

  def test_lex_single_4_byte_char
    assert_equal( 2, RL.scan('𝄞+').length )
  end

end
@@ -0,0 +1,107 @@
1
+ require File.expand_path("../../helpers", __FILE__)
2
+
3
# Checks the last two fields of each lexed token (the :depth and :set_depth
# members of the token struct) for nested groups, assertions, comments and
# character sets.
class LexerNesting < Test::Unit::TestCase

  # pattern => { token index => expected token as an array }
  # Only the listed indexes are checked for each pattern.
  cases = {
    '(((b)))' => {
      0     => [:group,     :capture,     '(',    0,  1, 0, 0],
      1     => [:group,     :capture,     '(',    1,  2, 1, 0],
      2     => [:group,     :capture,     '(',    2,  3, 2, 0],
      3     => [:literal,   :literal,     'b',    3,  4, 3, 0],
      4     => [:group,     :close,       ')',    4,  5, 2, 0],
      5     => [:group,     :close,       ')',    5,  6, 1, 0],
      6     => [:group,     :close,       ')',    6,  7, 0, 0],
    },

    '(\((b)\))' => {
      0     => [:group,     :capture,     '(',    0,  1, 0, 0],
      1     => [:escape,    :group_open,  '\(',   1,  3, 1, 0],
      2     => [:group,     :capture,     '(',    3,  4, 1, 0],
      3     => [:literal,   :literal,     'b',    4,  5, 2, 0],
      4     => [:group,     :close,       ')',    5,  6, 1, 0],
      5     => [:escape,    :group_close, '\)',   6,  8, 1, 0],
      6     => [:group,     :close,       ')',    8,  9, 0, 0],
    },

    '(?>a(?>b(?>c)))' => {
      0     => [:group,     :atomic,      '(?>',  0,  3, 0, 0],
      2     => [:group,     :atomic,      '(?>',  4,  7, 1, 0],
      4     => [:group,     :atomic,      '(?>',  8, 11, 2, 0],
      6     => [:group,     :close,       ')',   12, 13, 2, 0],
      7     => [:group,     :close,       ')',   13, 14, 1, 0],
      8     => [:group,     :close,       ')',   14, 15, 0, 0],
    },

    '(?:a(?:b(?:c)))' => {
      0     => [:group,     :passive,     '(?:',  0,  3, 0, 0],
      2     => [:group,     :passive,     '(?:',  4,  7, 1, 0],
      4     => [:group,     :passive,     '(?:',  8, 11, 2, 0],
      6     => [:group,     :close,       ')',   12, 13, 2, 0],
      7     => [:group,     :close,       ')',   13, 14, 1, 0],
      8     => [:group,     :close,       ')',   14, 15, 0, 0],
    },

    '(?=a(?!b(?<=c(?<!d))))' => {
      0     => [:assertion, :lookahead,   '(?=',  0,  3, 0, 0],
      2     => [:assertion, :nlookahead,  '(?!',  4,  7, 1, 0],
      4     => [:assertion, :lookbehind,  '(?<=', 8, 12, 2, 0],
      6     => [:assertion, :nlookbehind, '(?<!', 13, 17, 3, 0],
      8     => [:group,     :close,       ')',   18, 19, 3, 0],
      9     => [:group,     :close,       ')',   19, 20, 2, 0],
      10    => [:group,     :close,       ')',   20, 21, 1, 0],
      11    => [:group,     :close,       ')',   21, 22, 0, 0],
    },

    '((?#a)b(?#c)d(?#e))' => {
      0     => [:group,     :capture,     '(',      0,  1, 0, 0],
      1     => [:group,     :comment,     '(?#a)',  1,  6, 1, 0],
      3     => [:group,     :comment,     '(?#c)',  7, 12, 1, 0],
      5     => [:group,     :comment,     '(?#e)', 13, 18, 1, 0],
      6     => [:group,     :close,       ')',     18, 19, 0, 0],
    },

    'a[b-e]f' => {
      1     => [:set,       :open,        '[',      1,  2, 0, 0],
      2     => [:set,       :range,       'b-e',    2,  5, 0, 1],
      3     => [:set,       :close,       ']',      5,  6, 0, 0],
    },

    '[a-w&&[^c-g]z]' => {
      0     => [:set,       :open,         '[',     0,  1, 0, 0],
      2     => [:set,       :intersection, '&&',    4,  6, 0, 1],
      3     => [:subset,    :open,         '[',     6,  7, 0, 1],
      4     => [:subset,    :negate,       '^',     7,  8, 0, 2],
      5     => [:subset,    :range,        'c-g',   8, 11, 0, 2],
      6     => [:subset,    :close,        ']',    11, 12, 0, 1],
      8     => [:set,       :close,        ']',    13, 14, 0, 0],
    },

    '[a[b[c[d-g]]]]' => {
      0     => [:set,       :open,        '[',      0,  1, 0, 0],
      1     => [:set,       :member,      'a',      1,  2, 0, 1],
      2     => [:subset,    :open,        '[',      2,  3, 0, 1],
      3     => [:subset,    :member,      'b',      3,  4, 0, 2],
      4     => [:subset,    :open,        '[',      4,  5, 0, 2],
      5     => [:subset,    :member,      'c',      5,  6, 0, 3],
      6     => [:subset,    :open,        '[',      6,  7, 0, 3],
      7     => [:subset,    :range,       'd-g',    7, 10, 0, 4],
      8     => [:subset,    :close,       ']',     10, 11, 0, 3],
      9     => [:subset,    :close,       ']',     11, 12, 0, 2],
      10    => [:subset,    :close,       ']',     12, 13, 0, 1],
      11    => [:set,       :close,       ']',     13, 14, 0, 0],
    },
  }

  # One generated test per pattern, numbered from 1 in table order. Each
  # pattern is lexed with the 'ruby/1.9' syntax.
  cases.each_with_index do |(pattern, checks), index|
    define_method "test_lex_nesting_#{index + 1}" do
      tokens = RL.scan(pattern, 'ruby/1.9')

      checks.each do |offset, token|
        assert_equal( token, tokens[offset].to_a )
      end
    end
  end

end