regexp_parser 0.1.0

Files changed (59)
  1. data/ChangeLog +4 -0
  2. data/LICENSE +22 -0
  3. data/README.rdoc +307 -0
  4. data/Rakefile +91 -0
  5. data/lib/regexp_parser/ctype.rb +48 -0
  6. data/lib/regexp_parser/expression/property.rb +108 -0
  7. data/lib/regexp_parser/expression/set.rb +59 -0
  8. data/lib/regexp_parser/expression.rb +287 -0
  9. data/lib/regexp_parser/lexer.rb +105 -0
  10. data/lib/regexp_parser/parser.rb +417 -0
  11. data/lib/regexp_parser/scanner/property.rl +534 -0
  12. data/lib/regexp_parser/scanner/scanner.rl +712 -0
  13. data/lib/regexp_parser/scanner.rb +3325 -0
  14. data/lib/regexp_parser/syntax/ruby/1.8.6.rb +14 -0
  15. data/lib/regexp_parser/syntax/ruby/1.8.7.rb +14 -0
  16. data/lib/regexp_parser/syntax/ruby/1.8.rb +39 -0
  17. data/lib/regexp_parser/syntax/ruby/1.9.1.rb +39 -0
  18. data/lib/regexp_parser/syntax/ruby/1.9.2.rb +10 -0
  19. data/lib/regexp_parser/syntax/ruby/1.9.3.rb +24 -0
  20. data/lib/regexp_parser/syntax/ruby/1.9.rb +8 -0
  21. data/lib/regexp_parser/syntax/tokens.rb +332 -0
  22. data/lib/regexp_parser/syntax.rb +172 -0
  23. data/lib/regexp_parser.rb +45 -0
  24. data/test/helpers.rb +8 -0
  25. data/test/lexer/test_all.rb +26 -0
  26. data/test/lexer/test_literals.rb +120 -0
  27. data/test/lexer/test_nesting.rb +107 -0
  28. data/test/lexer/test_refcalls.rb +45 -0
  29. data/test/parser/test_all.rb +44 -0
  30. data/test/parser/test_alternation.rb +46 -0
  31. data/test/parser/test_anchors.rb +35 -0
  32. data/test/parser/test_errors.rb +59 -0
  33. data/test/parser/test_escapes.rb +48 -0
  34. data/test/parser/test_expression.rb +51 -0
  35. data/test/parser/test_groups.rb +69 -0
  36. data/test/parser/test_properties.rb +346 -0
  37. data/test/parser/test_quantifiers.rb +236 -0
  38. data/test/parser/test_refcalls.rb +101 -0
  39. data/test/parser/test_sets.rb +99 -0
  40. data/test/scanner/test_all.rb +30 -0
  41. data/test/scanner/test_anchors.rb +35 -0
  42. data/test/scanner/test_errors.rb +36 -0
  43. data/test/scanner/test_escapes.rb +49 -0
  44. data/test/scanner/test_groups.rb +41 -0
  45. data/test/scanner/test_literals.rb +85 -0
  46. data/test/scanner/test_meta.rb +36 -0
  47. data/test/scanner/test_properties.rb +315 -0
  48. data/test/scanner/test_quantifiers.rb +38 -0
  49. data/test/scanner/test_refcalls.rb +45 -0
  50. data/test/scanner/test_scripts.rb +314 -0
  51. data/test/scanner/test_sets.rb +80 -0
  52. data/test/scanner/test_types.rb +30 -0
  53. data/test/syntax/ruby/test_1.8.rb +57 -0
  54. data/test/syntax/ruby/test_1.9.1.rb +39 -0
  55. data/test/syntax/ruby/test_1.9.3.rb +38 -0
  56. data/test/syntax/ruby/test_all.rb +12 -0
  57. data/test/syntax/test_all.rb +19 -0
  58. data/test/test_all.rb +4 -0
  59. metadata +160 -0
data/lib/regexp_parser/syntax.rb ADDED
@@ -0,0 +1,172 @@
+ module Regexp::Syntax
+   require File.expand_path('../syntax/tokens', __FILE__)
+
+   class SyntaxError < StandardError
+     def initialize(what)
+       super what
+     end
+   end
+
+   class UnknownSyntaxNameError < SyntaxError
+     def initialize(name)
+       super "Unknown syntax name '#{name}'"
+     end
+   end
+
+   class MissingSyntaxSpecError < SyntaxError
+     def initialize(name)
+       super "Missing syntax specification file for '#{name}'"
+     end
+   end
+
+   class NotImplementedError < SyntaxError
+     def initialize(syntax, type, token)
+       super "#{syntax.class.name} does not implement: [#{type}:#{token}]"
+     end
+   end
+
+   SYNTAX_SPEC_ROOT = File.expand_path('../syntax', __FILE__)
+
+   # Loads and instantiates an instance of the syntax specification class for
+   # the given syntax flavor name. The special names 'any' and '*' return an
+   # instance of Syntax::Any. See below for more details.
+   def self.new(name)
+     return Regexp::Syntax::Any.new if
+       ['*', 'any'].include?( name.to_s )
+
+     self.load(name)
+
+     case name
+     when 'ruby/1.8.6'; syntax = Regexp::Syntax::Ruby::V186.new
+     when 'ruby/1.8.7'; syntax = Regexp::Syntax::Ruby::V187.new
+
+     # alias for the latest 1.8 implementation
+     when 'ruby/1.8'; syntax = Regexp::Syntax::Ruby::V18.new
+
+     when 'ruby/1.9.1'; syntax = Regexp::Syntax::Ruby::V191.new
+     when 'ruby/1.9.2'; syntax = Regexp::Syntax::Ruby::V192.new
+     when 'ruby/1.9.3'; syntax = Regexp::Syntax::Ruby::V193.new
+
+     # alias for the latest 1.9 implementation
+     when 'ruby/1.9'; syntax = Regexp::Syntax::Ruby::V19.new
+
+     else
+       raise UnknownSyntaxNameError.new(name)
+     end
+   end
+
+   # Checks if the named syntax has a specification class file, and requires
+   # it if it does. Downcases names, and adds the .rb extension if omitted.
+   def self.load(name)
+     full = "#{SYNTAX_SPEC_ROOT}/#{name.downcase}"
+     full = (full[-3, 3] == '.rb') ? full : "#{full}.rb"
+
+     raise MissingSyntaxSpecError.new(name) unless File.exist?(full)
+     require full
+   end
+
+   # A lookup map of supported types and tokens in a given syntax
+   class Base
+     def initialize
+       @implements = {}
+
+       implements :literal, [:literal]
+     end
+
+     def implementation
+       @implements
+     end
+
+     def implements(type, tokens)
+       if @implements[type]
+         @implements[type] = (@implements[type] + tokens).uniq
+       else
+         @implements[type] = tokens
+       end
+     end
+
+     # removes the given tokens from a type, or the whole type when tokens is nil
+     def excludes(type, tokens)
+       if tokens
+         tokens = [tokens] unless tokens.is_a?(Array)
+       end
+
+       if @implements[type]
+         if tokens
+           @implements[type] = @implements[type] - tokens
+           @implements[type] = nil if @implements[type].empty?
+         else
+           @implements[type] = nil
+         end
+       end
+     end
+
+     def implements?(type, token)
+       return true if @implements[type] and @implements[type].include?(token)
+       false
+     end
+     alias :check? :implements?
+
+     def implements!(type, token)
+       raise NotImplementedError.new(self, type, token) unless
+         implements?(type, token)
+     end
+     alias :check! :implements!
+
+     def normalize(type, token)
+       case type
+       when :group
+         normalize_group(type, token)
+       when :backref
+         normalize_backref(type, token)
+       else
+         [type, token]
+       end
+     end
+
+     def normalize_group(type, token)
+       case token
+       when :named_ab, :named_sq
+         [:group, :named]
+       else
+         [type, token]
+       end
+     end
+
+     def normalize_backref(type, token)
+       case token
+       when :name_ref_ab, :name_ref_sq
+         [:backref, :name_ref]
+       when :name_call_ab, :name_call_sq
+         [:backref, :name_call]
+       when :name_nest_ref_ab, :name_nest_ref_sq
+         [:backref, :name_nest_ref]
+       when :number_ref_ab, :number_ref_sq
+         [:backref, :number_ref]
+       when :number_call_ab, :number_call_sq
+         [:backref, :number_call]
+       when :number_rel_ref_ab, :number_rel_ref_sq
+         [:backref, :number_rel_ref]
+       when :number_rel_call_ab, :number_rel_call_sq
+         [:backref, :number_rel_call]
+       when :number_nest_ref_ab, :number_nest_ref_sq
+         [:backref, :number_nest_ref]
+       else
+         [type, token]
+       end
+     end
+   end
+
+   # A syntax that always returns true, passing all tokens as implemented. This
+   # is useful during development and testing, and should be useful for some
+   # types of transformations as well.
+   class Any < Base
+     def initialize
+       @implements = { :* => [:*] }
+     end
+
+     def implements?(type, token) true end
+     def implements!(type, token) true end
+   end
+
+ end
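For orientation, here is a minimal usage sketch of the Regexp::Syntax API added in this file. The method names come from the diff above; the exact per-flavor token lists live in the syntax/ruby/*.rb files, so the return values shown in the comments depend on what each flavor declares.

    require 'regexp_parser'

    # 'ruby/1.9' is one of the flavor names handled by Regexp::Syntax.new above;
    # 'any' or '*' would return the permissive Syntax::Any instead.
    syntax = Regexp::Syntax.new('ruby/1.9')

    syntax.implements?(:literal, :literal)  # true if the flavor declares the pair in its lookup map
    syntax.check!(:literal, :literal)       # alias of implements!; raises NotImplementedError for unsupported pairs
    syntax.normalize(:group, :named_ab)     # => [:group, :named]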
data/lib/regexp_parser.rb ADDED
@@ -0,0 +1,45 @@
+ class Regexp
+
+   module Parser
+     VERSION = '0.0.1'
+   end
+
+   TOKEN_KEYS = [:type, :token, :text, :ts, :te, :depth, :set_depth].freeze
+   Token = Struct.new(*TOKEN_KEYS) do
+     def offset
+       [self.ts, self.te]
+     end
+
+     def length
+       self.te - self.ts
+     end
+
+     def to_h
+       hash = {}
+       members.each do |member|
+         hash[member.to_sym] = self.send(member.to_sym)
+       end; hash
+     end
+
+     def next(exp = nil)
+       if exp
+         @next = exp
+       else
+         @next
+       end
+     end
+
+     def previous(exp = nil)
+       if exp
+         @previous = exp
+       else
+         @previous
+       end
+     end
+   end
+
+ end
+
+ %w{ctype scanner syntax lexer parser}.each do |file|
+   require File.expand_path("../regexp_parser/#{file}", __FILE__)
+ end
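To illustrate the Token struct defined above, a short sketch of the lexer output for 'ab+'; the field values are taken from the lexer literal tests further down, and Regexp::Lexer.scan is the entry point those tests exercise via the RL shorthand.

    tokens = Regexp::Lexer.scan('ab+')   # => array of Regexp::Token structs

    t = tokens.last
    t.type    # => :quantifier
    t.token   # => :one_or_more
    t.text    # => '+'
    t.offset  # => [2, 3]   ([ts, te])
    t.length  # => 1        (te - ts)
    t.to_h    # => {:type=>:quantifier, :token=>:one_or_more, :text=>'+',
              #     :ts=>2, :te=>3, :depth=>0, :set_depth=>0}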
data/test/helpers.rb ADDED
@@ -0,0 +1,8 @@
+ require "test/unit"
+ require File.expand_path("../../lib/regexp_parser", __FILE__)
+
+ RS = Regexp::Scanner
+ RL = Regexp::Lexer
+ RP = Regexp::Parser
+
+ include Regexp::Expression
data/test/lexer/test_all.rb ADDED
@@ -0,0 +1,26 @@
+ require File.expand_path("../../helpers", __FILE__)
+
+ %w{
+   literals nesting refcalls
+ }.each do|tc|
+   require File.expand_path("../test_#{tc}", __FILE__)
+ end
+
+ class TestRegexpLexer < Test::Unit::TestCase
+
+   def test_lexer_returns_an_array
+     assert_instance_of( Array, RL.scan('abc'))
+   end
+
+   def test_lexer_returns_tokens
+     tokens = RL.scan('^abc+[^one]{2,3}\b\d\\\C-C$')
+     assert( tokens.all?{|token| token.kind_of?(Regexp::Token)},
+       "Not all array members are tokens")
+   end
+
+   def test_lexer_token_count
+     tokens = RL.scan(/^(one|two){2,3}([^d\]efm-qz\,\-]*)(ghi)+$/i)
+     assert_equal( 26, tokens.length )
+   end
+
+ end
data/test/lexer/test_literals.rb ADDED
@@ -0,0 +1,120 @@
+ # -*- encoding: utf-8 -*-
+
+ require File.expand_path("../../helpers", __FILE__)
+
+ class LexerLiterals < Test::Unit::TestCase
+
+   tests = {
+     # ascii, single byte characters
+     'a' => {
+       0 => [:literal, :literal, 'a', 0, 1, 0, 0],
+     },
+
+     'ab+' => {
+       0 => [:literal, :literal, 'a', 0, 1, 0, 0],
+       1 => [:literal, :literal, 'b', 1, 2, 0, 0],
+       2 => [:quantifier, :one_or_more, '+', 2, 3, 0, 0],
+     },
+
+
+     # 2 byte wide characters, Arabic
+     'ا' => {
+       0 => [:literal, :literal, 'ا', 0, 2, 0, 0],
+     },
+
+     'aاbبcت' => {
+       0 => [:literal, :literal, 'aاbبcت', 0, 9, 0, 0],
+     },
+
+     'aاbبت?' => {
+       0 => [:literal, :literal, 'aاbب', 0, 6, 0, 0],
+       1 => [:literal, :literal, 'ت', 6, 8, 0, 0],
+       2 => [:quantifier, :zero_or_one, '?', 8, 9, 0, 0],
+     },
+
+     'aا?bبcت+' => {
+       0 => [:literal, :literal, 'a', 0, 1, 0, 0],
+       1 => [:literal, :literal, 'ا', 1, 3, 0, 0],
+       2 => [:quantifier, :zero_or_one, '?', 3, 4, 0, 0],
+       3 => [:literal, :literal, 'bبc', 4, 8, 0, 0],
+       4 => [:literal, :literal, 'ت', 8, 10, 0, 0],
+       5 => [:quantifier, :one_or_more, '+', 10, 11, 0, 0],
+     },
+
+     'a(اbب+)cت?' => {
+       0 => [:literal, :literal, 'a', 0, 1, 0, 0],
+       1 => [:group, :capture, '(', 1, 2, 0, 0],
+       2 => [:literal, :literal, 'اb', 2, 5, 1, 0],
+       3 => [:literal, :literal, 'ب', 5, 7, 1, 0],
+       4 => [:quantifier, :one_or_more, '+', 7, 8, 1, 0],
+       5 => [:group, :close, ')', 8, 9, 0, 0],
+       6 => [:literal, :literal, 'c', 9, 10, 0, 0],
+       7 => [:literal, :literal, 'ت', 10, 12, 0, 0],
+       8 => [:quantifier, :zero_or_one, '?', 12, 13, 0, 0],
+     },
+
+
+     # 3 byte wide characters, Japanese
+     'ab?れます+cd' => {
+       0 => [:literal, :literal, 'a', 0, 1, 0, 0],
+       1 => [:literal, :literal, 'b', 1, 2, 0, 0],
+       2 => [:quantifier, :zero_or_one, '?', 2, 3, 0, 0],
+       3 => [:literal, :literal, 'れま', 3, 9, 0, 0],
+       4 => [:literal, :literal, 'す', 9, 12, 0, 0],
+       5 => [:quantifier, :one_or_more, '+', 12, 13, 0, 0],
+       6 => [:literal, :literal, 'cd', 13, 15, 0, 0],
+     },
+
+
+     # 4 byte wide characters, Osmanya
+     '𐒀𐒁?𐒂ab+𐒃' => {
+       0 => [:literal, :literal, '𐒀', 0, 4, 0, 0],
+       1 => [:literal, :literal, '𐒁', 4, 8, 0, 0],
+       2 => [:quantifier, :zero_or_one, '?', 8, 9, 0, 0],
+       3 => [:literal, :literal, '𐒂a', 9, 14, 0, 0],
+       4 => [:literal, :literal, 'b', 14, 15, 0, 0],
+       5 => [:quantifier, :one_or_more, '+', 15, 16, 0, 0],
+       6 => [:literal, :literal, '𐒃', 16, 20, 0, 0],
+     },
+
+     'mu𝄞?si*𝄫c+' => {
+       0 => [:literal, :literal, 'mu', 0, 2, 0, 0],
+       1 => [:literal, :literal, '𝄞', 2, 6, 0, 0],
+       2 => [:quantifier, :zero_or_one, '?', 6, 7, 0, 0],
+       3 => [:literal, :literal, 's', 7, 8, 0, 0],
+       4 => [:literal, :literal, 'i', 8, 9, 0, 0],
+       5 => [:quantifier, :zero_or_more, '*', 9, 10, 0, 0],
+       6 => [:literal, :literal, '𝄫', 10, 14, 0, 0],
+       7 => [:literal, :literal, 'c', 14, 15, 0, 0],
+       8 => [:quantifier, :one_or_more, '+', 15, 16, 0, 0],
+     },
+   }
+
+   count = 0
+   tests.each do |pattern, checks|
+     define_method "test_lex_literal_runs_#{count+=1}" do
+
+       tokens = RL.scan(pattern)
+       checks.each do |offset, token|
+         assert_equal( token, tokens[offset].to_a )
+       end
+
+     end
+   end
+
+   def test_lex_single_2_byte_char
+     tokens = RL.scan('ا+')
+     assert_equal( 2, tokens.length )
+   end
+
+   def test_lex_single_3_byte_char
+     tokens = RL.scan('れ+')
+     assert_equal( 2, tokens.length )
+   end
+
+   def test_lex_single_4_byte_char
+     tokens = RL.scan('𝄞+')
+     assert_equal( 2, tokens.length )
+   end
+
+ end
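The ts/te columns in these fixtures are byte offsets, which is why the multibyte literals above advance by 2, 3, or 4 positions per character. A hypothetical console session consistent with the offsets asserted in these tests (the single-character tests only assert the token count, so the exact arrays below are inferred from the byte widths shown above):

    tokens = Regexp::Lexer.scan('れ+')   # the 3-byte character from test_lex_single_3_byte_char
    tokens.map(&:to_a)
    # => [[:literal,    :literal,     'れ', 0, 3, 0, 0],
    #     [:quantifier, :one_or_more, '+',  3, 4, 0, 0]]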
data/test/lexer/test_nesting.rb ADDED
@@ -0,0 +1,107 @@
+ require File.expand_path("../../helpers", __FILE__)
+
+ class LexerNesting < Test::Unit::TestCase
+
+   tests = {
+     '(((b)))' => {
+       0 => [:group, :capture, '(', 0, 1, 0, 0],
+       1 => [:group, :capture, '(', 1, 2, 1, 0],
+       2 => [:group, :capture, '(', 2, 3, 2, 0],
+       3 => [:literal, :literal, 'b', 3, 4, 3, 0],
+       4 => [:group, :close, ')', 4, 5, 2, 0],
+       5 => [:group, :close, ')', 5, 6, 1, 0],
+       6 => [:group, :close, ')', 6, 7, 0, 0],
+     },
+
+     '(\((b)\))' => {
+       0 => [:group, :capture, '(', 0, 1, 0, 0],
+       1 => [:escape, :group_open, '\(', 1, 3, 1, 0],
+       2 => [:group, :capture, '(', 3, 4, 1, 0],
+       3 => [:literal, :literal, 'b', 4, 5, 2, 0],
+       4 => [:group, :close, ')', 5, 6, 1, 0],
+       5 => [:escape, :group_close, '\)', 6, 8, 1, 0],
+       6 => [:group, :close, ')', 8, 9, 0, 0],
+     },
+
+     '(?>a(?>b(?>c)))' => {
+       0 => [:group, :atomic, '(?>', 0, 3, 0, 0],
+       2 => [:group, :atomic, '(?>', 4, 7, 1, 0],
+       4 => [:group, :atomic, '(?>', 8, 11, 2, 0],
+       6 => [:group, :close, ')', 12, 13, 2, 0],
+       7 => [:group, :close, ')', 13, 14, 1, 0],
+       8 => [:group, :close, ')', 14, 15, 0, 0],
+     },
+
+     '(?:a(?:b(?:c)))' => {
+       0 => [:group, :passive, '(?:', 0, 3, 0, 0],
+       2 => [:group, :passive, '(?:', 4, 7, 1, 0],
+       4 => [:group, :passive, '(?:', 8, 11, 2, 0],
+       6 => [:group, :close, ')', 12, 13, 2, 0],
+       7 => [:group, :close, ')', 13, 14, 1, 0],
+       8 => [:group, :close, ')', 14, 15, 0, 0],
+     },
+
+     '(?=a(?!b(?<=c(?<!d))))' => {
+       0 => [:assertion, :lookahead, '(?=', 0, 3, 0, 0],
+       2 => [:assertion, :nlookahead, '(?!', 4, 7, 1, 0],
+       4 => [:assertion, :lookbehind, '(?<=', 8, 12, 2, 0],
+       6 => [:assertion, :nlookbehind, '(?<!', 13, 17, 3, 0],
+       8 => [:group, :close, ')', 18, 19, 3, 0],
+       9 => [:group, :close, ')', 19, 20, 2, 0],
+       10 => [:group, :close, ')', 20, 21, 1, 0],
+       11 => [:group, :close, ')', 21, 22, 0, 0],
+     },
+
+     '((?#a)b(?#c)d(?#e))' => {
+       0 => [:group, :capture, '(', 0, 1, 0, 0],
+       1 => [:group, :comment, '(?#a)', 1, 6, 1, 0],
+       3 => [:group, :comment, '(?#c)', 7, 12, 1, 0],
+       5 => [:group, :comment, '(?#e)', 13, 18, 1, 0],
+       6 => [:group, :close, ')', 18, 19, 0, 0],
+     },
+
+     'a[b-e]f' => {
+       1 => [:set, :open, '[', 1, 2, 0, 0],
+       2 => [:set, :range, 'b-e', 2, 5, 0, 1],
+       3 => [:set, :close, ']', 5, 6, 0, 0],
+     },
+
+     '[a-w&&[^c-g]z]' => {
+       0 => [:set, :open, '[', 0, 1, 0, 0],
+       2 => [:set, :intersection, '&&', 4, 6, 0, 1],
+       3 => [:subset, :open, '[', 6, 7, 0, 1],
+       4 => [:subset, :negate, '^', 7, 8, 0, 2],
+       5 => [:subset, :range, 'c-g', 8, 11, 0, 2],
+       6 => [:subset, :close, ']', 11, 12, 0, 1],
+       8 => [:set, :close, ']', 13, 14, 0, 0],
+     },
+
+     '[a[b[c[d-g]]]]' => {
+       0 => [:set, :open, '[', 0, 1, 0, 0],
+       1 => [:set, :member, 'a', 1, 2, 0, 1],
+       2 => [:subset, :open, '[', 2, 3, 0, 1],
+       3 => [:subset, :member, 'b', 3, 4, 0, 2],
+       4 => [:subset, :open, '[', 4, 5, 0, 2],
+       5 => [:subset, :member, 'c', 5, 6, 0, 3],
+       6 => [:subset, :open, '[', 6, 7, 0, 3],
+       7 => [:subset, :range, 'd-g', 7, 10, 0, 4],
+       8 => [:subset, :close, ']', 10, 11, 0, 3],
+       9 => [:subset, :close, ']', 11, 12, 0, 2],
+       10 => [:subset, :close, ']', 12, 13, 0, 1],
+       11 => [:set, :close, ']', 13, 14, 0, 0],
+     },
+   }
+
+   count = 0
+   tests.each do |pattern, checks|
+     define_method "test_lex_nesting_#{count+=1}" do
+
+       tokens = RL.scan(pattern, 'ruby/1.9')
+       checks.each do |offset, token|
+         assert_equal( token, tokens[offset].to_a )
+       end
+
+     end
+   end
+
+ end
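The sixth and seventh columns in these fixtures are the Token struct's depth (group nesting) and set_depth (character set nesting) fields. As a reading aid, the first case above expanded into the token arrays the lexer is expected to return, using the same scan call and syntax name the tests pass:

    Regexp::Lexer.scan('(((b)))', 'ruby/1.9').map(&:to_a)
    # => [[:group,   :capture, '(', 0, 1, 0, 0],
    #     [:group,   :capture, '(', 1, 2, 1, 0],
    #     [:group,   :capture, '(', 2, 3, 2, 0],
    #     [:literal, :literal, 'b', 3, 4, 3, 0],
    #     [:group,   :close,   ')', 4, 5, 2, 0],
    #     [:group,   :close,   ')', 5, 6, 1, 0],
    #     [:group,   :close,   ')', 6, 7, 0, 0]]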