regexador 0.4.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,79 @@
1
# Forward declaration of the top-level class. The parser and transform files
# abort with "Require out of order" unless this constant is already defined
# when they are loaded, so it has to exist before they are required.
class Regexador
end
4
+
5
+ require_relative './regexador_parser'
6
+ require_relative './regexador_xform'
7
+
8
+ require 'parslet/convenience'
9
+
10
# Compiles a Regexador program into a Ruby Regexp and matches strings with it.
class Regexador
  # Parses +str+, transforms the parse tree and compiles the resulting
  # regular-expression source.
  #
  # str   - the Regexador program text.
  # debug - when true, prints the source, the parse tree and the transformed
  #         tree, and parses with +parse_with_debug+ instead of +parse+.
  def initialize(str, debug=false)
    @code = str
    if debug
      puts
      puts "---- Code: ------"
      puts str
      puts "-----------------"
    end

    @parser = Parser.new
    meth = debug ? :parse_with_debug : :parse
    @tree = @parser.send(meth, str)

    xform = Transform.new
    if debug
      puts "\n\nParser gives:"
      pp @tree
    end

    @regex_tree = xform.apply(@tree)
    # Kept unmodified for the lifetime of the object: it is the template that
    # parameter placeholders of the form "(name){0}" are substituted into.
    @regex_str = @regex_tree.to_s
    if debug
      puts "\n\nTransform gives:"
      pp @regex_tree
    end

    @bindings = {}
    @regex = Regexp.compile(@regex_str)
  end

  # Returns the currently compiled Regexp (including any parameters bound by
  # earlier calls to #match).
  def to_regex
    @regex
  end

  # Matches +str+ against the compiled expression.
  #
  # hash - optional parameter bindings (name => regex fragment). Bindings are
  #        remembered across calls; passing a name again re-binds it.
  #
  # Returns nil when there is no match. Otherwise returns an object with one
  # reader method per named capture, plus #[] which takes any number of
  # capture names and returns an Array of the corresponding captured values.
  def match(str, hash={})
    unless hash.empty?
      @bindings ||= {}
      hash.each_pair { |var, val| @bindings[var.to_s] = val.to_s }
      # Always rebuild from the pristine template so that a parameter can be
      # given a new value on a later call.
      @regex = Regexp.compile(bind_parameters(@regex_str, @bindings))
    end

    result = @regex.match(str)
    return nil if result.nil?

    # Logic below may change...
    build_match_object(result)
  end

  # Boolean form of #match.
  def match?(str, hash={})
    !!match(str, hash)
  end

  # Matches against +other+, which must be a String or respond to #to_str.
  # Note that this returns the same match object as #match (or nil), not a
  # character index.
  #
  # Raises ArgumentError when +other+ cannot be converted to a String.
  def =~(other)
    other = stringify(other)
    raise ArgumentError unless String === other
    match(other)
  end

  private

  # Replaces every "(name){0}" placeholder in +template+ with the value bound
  # to that name and returns the new string; +template+ is left untouched.
  def bind_parameters(template, bindings)
    bindings.reduce(template) do |text, (var, val)|
      # Block form: the replacement is inserted literally, so backslashes in
      # a regex fragment such as '\d+' are not reinterpreted by gsub.
      text.gsub(/\(#{Regexp.escape(var)}\)\{0\}/) { val }
    end
  end

  # Wraps a MatchData in a plain object exposing the named captures.
  def build_match_object(result)
    obj = Object.new
    result.names.each do |name|
      obj.define_singleton_method(name) { result[name] }
    end
    obj.define_singleton_method(:[]) { |*args| args.map { |cvar| result[cvar] } }
    obj
  end

  # Returns +obj+ as a String when it is one or converts via #to_str;
  # anything else is returned unchanged for the caller to reject.
  def stringify(obj)
    return obj if String === obj
    obj.respond_to?(:to_str) ? obj.to_str : obj
  end
end
@@ -0,0 +1,113 @@
1
+ require 'parslet'
2
+
3
+ abort "Require out of order" if ! defined? Regexador
4
+
5
# Declared empty first so that the chars, predefs and keywords files required
# next can reopen the class; the grammar rules are added after those requires.
class Regexador::Parser < Parslet::Parser
end
7
+
8
+ require_relative './chars' # These three files
9
+ require_relative './predefs' # reopen the class
10
+ require_relative './keywords' # Regexador::Parser
11
+
12
# Grammar rules for the Regexador language.
#
# The character atoms (cHASH, cTICK, ...), keyword atoms (kANY, kMATCH, ...)
# and the +predef+ / +keyword+ rules are used here but not defined here;
# presumably they come from the chars, predefs and keywords files that reopen
# this class.
class Regexador::Parser
  # Horizontal whitespace: spaces and tabs only.
  rule(:space)  { match[" \t"].repeat(1) }
  rule(:space?) { space.maybe }

  # Any whitespace, including line ends with their optional trailing comment.
  # The class is single-quoted on purpose: in a double-quoted Ruby string
  # "\s" is the escape for a literal space, not the regexp class \s.
  rule(:white)  { (endofline | match('\s')).repeat(1) }
  rule(:white?) { white.maybe }

  rule(:lower) { match('[a-z]') }
  rule(:upper) { match('[A-Z]') }

  # A comment is '#', at least one space/tab, then everything up to (but not
  # including) the newline.
  rule(:comment)   { cHASH >> space >> (cNEWLINE.absent? >> any).repeat(0) }
  rule(:endofline) { space? >> comment.maybe >> cNEWLINE }

  rule(:digit)         { match('[0-9]') }
  rule(:digits)        { digit.repeat(1) }
  rule(:hexdigit)      { digit | match("[abcdef]") }
  rule(:quoted)        { match('[^"]').repeat(0) }
  rule(:single_quoted) { match("[^']").repeat(0) }
  rule(:graph_char)    { match("[[:graph:]]") }   # { match('[!-~]') }
  # Identifiers start with a lowercase letter and must not begin with a keyword.
  rule(:name)          { keyword.absent? >> lower >> (lower | cUNDERSCORE | digit).repeat(0) }

  rule(:variable)    { name.as(:var) }
  rule(:capture_var) { (cAT >> name.as(:cvar)) }
  rule(:parameter)   { (cCOLON >> name.as(:param)) }

  rule(:posix_class) { cPERCENT >> name.as(:pclass) }

  rule(:string) { cQUOTE >> quoted.as(:string) >> cQUOTE }

  rule(:simple_class)  { cSQUOTE >> single_quoted.as(:char_class) >> cSQUOTE }
  rule(:negated_class) { cTILDE >> cSQUOTE >> single_quoted.as(:neg_class) >> cSQUOTE }
  rule(:char_class)    { simple_class | negated_class }

  rule(:number)  { digits }
  rule(:numeric) { number | variable | parameter }

  # '&' followed by exactly four (lowercase) hex digits.
  rule(:codepoint) { cAMPERSAND >> (hexdigit >> hexdigit >> hexdigit >> hexdigit).as(:unicode) }

  rule(:char) { (cTICK >> graph_char.as(:char)) | codepoint }

  rule(:simple_range)  { char.as(:c1) >> cHYPHEN >> char.as(:c2) }
  rule(:negated_range) { char.as(:nr1) >> cTILDE >> char.as(:nr2) }
  rule(:range)         { negated_range | simple_range }

  rule(:negated_char) { cTILDE >> char.as(:nchar) }   # ~`x means /[^x]/

  # '@name' alone, or '@name = pattern'.
  rule(:capture) { capture_var.as(:lhs) >> space? >> (cEQUAL >> space? >> pattern.as(:rhs)).maybe }

  # Alternatives are tried in this order; the comment on each line shows an
  # example of the syntax it accepts.
  rule(:simple_pattern) do
    predef |        # X
    range |         # `a-`c
    negated_char |  # ~`a
    posix_class |   # %name
    string |        # "abc"
    char_class |    # 'abc'
    char |          # `a
    parameter |     # :param
    variable |      # xyz
    capture         # @xyz = ...
  end

  rule(:qualifier) do
    (kANY | kMANY | kMAYBE | kNOCASE | kWITHIN | kESCAPING).as(:qualifier) >>
      fancy_pattern.as(:match_item)
  end

  # FIXME above: within and escaping can't really take an arbitrary pattern

  ###
  rule(:pos_lookahead) do
    kFIND >> space >> simple_pattern.as(:findpat_ahead) >> space >>
      kWITH >> space >> simple_pattern.as(:pospat)
  end
  rule(:neg_lookahead) do
    kFIND >> space >> simple_pattern.as(:findpat_ahead) >> space >>
      kWITHOUT >> space >> simple_pattern.as(:negpat)
  end
  rule(:pos_lookbehind) do
    kWITH >> space >> simple_pattern.as(:pospat) >> space >>
      kFIND >> space >> simple_pattern.as(:findpat_behind)
  end
  rule(:neg_lookbehind) do
    kWITHOUT >> space >> simple_pattern.as(:negpat) >> space >>
      kFIND >> space >> simple_pattern.as(:findpat_behind)
  end
  rule(:lookaround) { pos_lookahead | neg_lookahead | pos_lookbehind | neg_lookbehind }
  ###

  # 'n * pattern' or 'n,m * pattern'.
  rule(:repeat1)    { numeric.as(:num1) }
  rule(:repeat2)    { repeat1 >> cCOMMA >> numeric.as(:num2) }
  rule(:repetition) { (repeat2 | repeat1) >> space? >> cTIMES >> space? >> fancy_pattern.as(:match_item) }

  rule(:parenthesized) { cLPAREN >> space? >> pattern >> space? >> cRPAREN }

  # What each alternative starts with, in order:
  #   repetition: a number; simple_pattern: ` ~ " ' etc.; qualifier: a keyword;
  #   lookaround: find/with; parenthesized: (
  rule(:fancy_pattern) { space? >> (repetition | simple_pattern | qualifier | lookaround | parenthesized) >> space? }

  rule(:concat) { (fancy_pattern >> (space? >> fancy_pattern).repeat(0)).as(:sequence) }

  rule(:pattern) { (concat >> space? >> (cBAR >> space? >> concat).repeat(0)).as(:alternation) }

  rule(:rvalue) { pattern | numeric }   # a string is-a pattern

  rule(:assignment) { space? >> name.as(:var) >> space? >> cEQUAL >> space? >> rvalue.as(:rvalue) }

  # Zero or more lines, each either blank/comment-only or an assignment.
  rule(:definitions) { (endofline | assignment >> endofline).repeat(0) }

  rule(:oneline_clause) { space? >> kMATCH >> space? >> pattern >> kEND >> endofline.maybe }

  rule(:single_line) { endofline | space? >> pattern >> endofline }

  rule(:multiline_clause) do
    space? >> kMATCH >> endofline >> single_line.repeat(1).as(:lines) >> space? >>
      kEND >> endofline.maybe
  end

  rule(:match_clause) { multiline_clause | oneline_clause }

  rule(:program) { definitions.as(:definitions) >> match_clause.as(:match) >> endofline.repeat(0) }

  root(:program)
end
113
+
@@ -0,0 +1,180 @@
1
+ require 'parslet'
2
+
3
+ abort "Require out of order" if ! defined? Regexador
4
+
5
# Turns the parser's output tree into a tree of Node objects whose #to_s
# yields regular-expression source text.
class Regexador::Transform < Parslet::Transform
  # Base class for all tree nodes. Concrete node classes are generated with
  # Node.make and render themselves as regex source through #to_s.
  class Node
    # Builds a subclass of the receiver.
    #
    # fields - names of the attributes; each gets an accessor, and positional
    #          arguments to .new are assigned to them in this order.
    # block  - becomes the subclass's #to_s, evaluated on the instance. When
    #          omitted, Node#to_s (which raises NotImplementedError) applies.
    def self.make(*fields, &block)
      ::Class.new(self) do
        fields.each { |field| attr_accessor field }
        define_method(:fields) { fields.dup }
        define_method(:to_s, &block) if block
      end
    end

    # Assigns +values+ to the declared fields positionally; missing values
    # leave the corresponding field nil.
    def initialize(*values)
      fields.zip(values) { |field, value| send("#{field}=", value) }
    end

    def to_s
      raise NotImplementedError,
        "Please implement #to_s for #{short_name}."
    end

    # Lets nodes be used where an implicit String conversion is expected.
    def to_str
      to_s
    end

    # Class name without its enclosing namespaces.
    def short_name
      self.class.name.split('::').last
    end

    def inspect
      data = fields.map { |f| "#{f}=#{send(f).inspect}" }.join(', ')
      "#{short_name}(#{data})"
    end
  end

  # Later: Remember escaping for chars (char, c1, c2, nchar, ...)

  XChar = Node.make(:char) do
    Regexp.escape(char)
  end

  CharRange    = Node.make(:c1, :c2)     { "[#{@c1}-#{@c2}]" }
  NegatedRange = Node.make(:nr1, :nr2)   { "[^#{@nr1}-#{@nr2}]" }
  NegatedChar  = Node.make(:nchar)       { "[^#{@nchar}]" }   # More like a range really
  POSIXClass   = Node.make(:pclass)      { "[[:#{@pclass}:]]" }
  CharClass    = Node.make(:char_class)  { "[#{@char_class}]" }
  NegatedClass = Node.make(:neg_class)   { "[^#{@neg_class}]" }

  # Looks the predefined token up in Regexador::Parser::Predef2Regex under
  # the key :p<TOKEN>.
  Predefined = Node.make(:pre) do
    sym = "p#{@pre}".to_sym
    str = Regexador::Parser::Predef2Regex[sym]
    raise "#{@pre} is not handled yet" if str.nil?
    str
  end

  StringNode = Node.make(:string) { Regexp.escape(string.to_s) }
  Repeat1    = Node.make(:num1, :match_item)        { "(#{@match_item}){#{@num1}}" }
  Repeat2    = Node.make(:num1, :num2, :match_item) { "(#{@match_item}){#{@num1},#{@num2}}" }
  Any        = Node.make(:match_item) { "(#{@match_item})*" }
  Many       = Node.make(:match_item) { "(#{@match_item})+" }
  Maybe      = Node.make(:match_item) { "(#{@match_item})?" }
  Nocase     = Node.make(:match_item) { "((?i)#{@match_item})" }

  # Parenthesized lookaround variants. No rule below instantiates these (the
  # rules use PosAhead/NegAhead/PosBehind/NegBehind); each one interpolates
  # only its own declared fields.
  FindWith    = Node.make(:findpat_ahead, :pospat)  { "((?=#{@findpat_ahead}#{@pospat})#{@findpat_ahead})" }
  FindWithout = Node.make(:findpat_ahead, :negpat)  { "((?!#{@findpat_ahead}#{@negpat})#{@findpat_ahead})" }
  WithFind    = Node.make(:pospat, :findpat_behind) { "((?<=#{@pospat})#{@findpat_behind})" }
  WithoutFind = Node.make(:negpat, :findpat_behind) { "((?<!#{@negpat})#{@findpat_behind})" }

  Within   = Node.make(:delim) { "(#{@delim}.*?#{@delim})" }   # /x[^y]*?y/
  # NOTE(review): the original comment gives the intended regex for
  # escaping `" as /"(\\"|[^"])*?"/, which is not what this emits -- confirm
  # before relying on it.
  Escaping = Node.make(:delim) { "\\#{@delim}|[^#{@delim}]*?#{@delim}" }

  Sequence    = Node.make(:elements) { elements.map(&:to_s).join }
  Alternation = Node.make(:elements) { '(' + elements.map(&:to_s).join('|') + ')' }

  Assignment = Node.make(:var, :rvalue) { "" }   # Doesn't actually translate directly.
  # A variable reference renders whatever is currently stored for its name.
  Usage      = Node.make(:var) { Assignment.bindings[var.to_s].to_s }

  Program = Node.make(:definitions, :match) do
    # NOTE Since we're using to_s for conversion to regular expression,
    # debugging cannot be done using string interpolation, otherwise we
    # call things out of order just by debug-printing them!
    #
    # puts "In Program: #{match}"          # Don't do this
    # puts "In Program: #{match.inspect}"  # But this is OK
    definitions.each { |d| d.store }
    match.to_s
  end

  class Assignment < Node   # For clarity: Really already is-a Node
    class << self
      # Hash of variable name => rvalue node, shared by all assignments
      # (class-level state); nil until the first #store.
      attr_accessor :bindings
    end

    # Records this assignment in the class-level bindings table.
    def store
      # puts "Storing #@var = #{@rvalue.inspect}"
      hash = self.class.bindings ||= {}

      hash[@var.to_s] = @rvalue          # Late binding
      # hash[@var.to_s] = @rvalue.to_s   # Early binding
      # Think about the difference... :)
    end
  end

  Captured = Node.make(:cname, :pattern) { "(?<#{@cname}>#{@pattern})" }
  Backref  = Node.make(:name)            { "\\k<#{@name}>" }

  # Placeholder that matches nothing by itself; Regexador#match substitutes
  # the "(name){0}" text when a value for the parameter is supplied.
  Parameter = Node.make(:param) { "(#{param}){0}" }

  PosAhead  = Node.make(:pla1, :pla2) { "(?=#{@pla1}#{@pla2})#{@pla1}" }
  NegAhead  = Node.make(:nla1, :nla2) { "(?!#{@nla1}#{@nla2})#{@nla1}" }
  PosBehind = Node.make(:plb1, :plb2) { "(?<=#{@plb1})#{@plb2}" }
  NegBehind = Node.make(:nlb1, :nlb2) { "(?<!#{@nlb1})#{@nlb2}" }

  # Actual transformation rules

  rule(:char => simple(:ch))      { XChar.new(ch) }
  rule(:unicode => simple(:hex4)) { StringNode.new("" << Integer("0x#{hex4}")) }

  rule(:string => simple(:string)) { StringNode.new(string) }
  # When the string is empty, parslet returns an empty array for lack of content.
  # Map that to the empty string node.
  rule(:string => sequence(:string)) { StringNode.new('') }

  rule(:c1 => simple(:c1), :c2 => simple(:c2)) { CharRange.new(c1, c2) }

  rule(:nr1 => simple(:nr1), :nr2 => simple(:nr2)) { NegatedRange.new(nr1, nr2) }
  rule(:nchar => simple(:nchar)) { NegatedChar.new(nchar) }   # Don't forget escaping

  rule(:pclass => simple(:pclass)) { POSIXClass.new(pclass) }

  rule(:char_class => simple(:char_class)) { CharClass.new(char_class) }
  rule(:neg_class => simple(:neg_class))   { NegatedClass.new(neg_class) }

  rule(:predef => simple(:content)) { Predefined.new(content) }

  rule(:num1 => simple(:num1), :match_item => simple(:match_item)) { Repeat1.new(num1, match_item) }

  rule(:num1 => simple(:num1), :num2 => simple(:num2), :match_item => simple(:match_item)) { Repeat2.new(num1, num2, match_item) }

  rule(:qualifier => 'any',      :match_item => simple(:match_item)) { Any.new(match_item) }
  rule(:qualifier => 'many',     :match_item => simple(:match_item)) { Many.new(match_item) }
  rule(:qualifier => 'maybe',    :match_item => simple(:match_item)) { Maybe.new(match_item) }
  rule(:qualifier => 'nocase',   :match_item => simple(:match_item)) { Nocase.new(match_item) }
  rule(:qualifier => 'within',   :match_item => simple(:match_item)) { Within.new(match_item) }
  rule(:qualifier => 'escaping', :match_item => simple(:match_item)) { Escaping.new(match_item) }

  rule(:findpat_ahead => simple(:pla1), :pospat => simple(:pla2)) { PosAhead.new(pla1, pla2) }
  rule(:findpat_ahead => simple(:nla1), :negpat => simple(:nla2)) { NegAhead.new(nla1, nla2) }
  rule(:pospat => simple(:plb1), :findpat_behind => simple(:plb2)) { PosBehind.new(plb1, plb2) }
  rule(:negpat => simple(:nlb1), :findpat_behind => simple(:nlb2)) { NegBehind.new(nlb1, nlb2) }

  # Uses the binding methods like every other rule here, rather than
  # instance variables.
  rule(:var => simple(:var), :rvalue => simple(:rvalue)) { Assignment.new(var, rvalue) }

  rule(:param => simple(:param)) { Parameter.new(param) }

  rule(:alternation => simple(:pattern))        { pattern }
  rule(:alternation => sequence(:alternatives)) { Alternation.new(alternatives) }

  rule(:sequence => simple(:element))    { element }
  rule(:sequence => sequence(:elements)) { Sequence.new(elements) }

  # A series of statements on different lines is also a sequence.
  rule(:lines => sequence(:lines)) { Sequence.new(lines) }

  rule(:var => simple(:name)) { Usage.new(name) }

  rule(:definitions => sequence(:definitions), :match => simple(:match))   { Program.new(definitions, match) }
  rule(:definitions => sequence(:definitions), :match => sequence(:match)) { Program.new(definitions, match) }

  # An expression of the form '@variable'
  rule(:lhs => {:cvar => simple(:backref)}) { Backref.new(backref) }

  # An expression of the form '@variable = expr'
  rule(:lhs => {:cvar => simple(:cname)}, :rhs => simple(:pattern)) { Captured.new(cname, pattern) }
end
180
+
@@ -0,0 +1,174 @@
1
+ # Encoding: UTF-8
2
+ require './spec/testing'
3
+
4
# Spec helper patched onto every object: asserts that the receiver (here, a
# parse result) is not nil.
class Object
  def succeeds
    should_not == nil
  end
end
9
+
10
# Parser-level specs: each example feeds source text to a single grammar rule
# (or to the whole parser) and asserts, via Object#succeeds, that the parse
# result is not nil.
describe Regexador do
  before(:all) do
    @parser = Regexador::Parser.new
    @pattern = @parser.pattern
  end

  describe "A special character" do
    it "can be matched correctly" do
      @parser.cSQUOTE.parse_with_debug("'").succeeds
      { :cHASH => '#', :cNEWLINE => "\n", :cEQUAL => '=' }.each do |rule_name, text|
        @parser.send(rule_name).parse(text).succeeds
      end
    end
  end

  describe "An international character" do
    it "can follow a backtick" do #
      @parser.char.parse_with_debug("`æ").succeeds
      ["`ß", "`ç", "`ö", "`ñ"].each do |source|
        @parser.char.parse(source).succeeds
      end
    end
  end

  describe "A Unicode codepoint expression" do
    it "can be matched" do
      @parser.codepoint.parse_with_debug("&1234").succeeds
      @parser.codepoint.parse('&beef').succeeds
    end
  end

  describe "A predefined token" do
    ["BOS", "EOS", "START", "END"].each do |token|
      describe token do
        it 'matches using pattern' do
          @parser.pattern.parse_with_debug(token).succeeds
        end
      end
    end
  end

  describe "An assignment" do
    it "can be parsed" do
      # Spacing around '=' is optional, and the right-hand side may be a
      # number, a character class, a string or a range.
      sources = [
        "a = 5",
        "a= 5",
        "a =5",
        "a=5",
        "myvar = 'xyz'",
        'var2 = "hello"',
        'this_var = `x-`z'
      ]
      sources.each { |source| @parser.assignment.parse(source).succeeds }
      @parser.assignment.parse_with_debug('pat = maybe many `x-`z').succeeds
    end
  end

  describe "A keyword used as a variable name" do
    it "will not parse" do
      @parser.assignment.should_not parse("end = 'hello'")
      # @parser.assignment.parse("endx = 'hello'")
    end
  end

  describe "A definition section" do
    it "can be parsed" do
      two_definitions = "a = 5\nstr = \"hello\"\n"
      @parser.definitions.parse_with_debug(two_definitions).succeeds

      # Heredoc bodies are kept flush-left so the text handed to the parser
      # is unchanged.
      with_comments = <<-EOF
a = 5
# comment...
pat = maybe many `a-`c
# empty line follows:

str = "hello"
# another comment...
      EOF
      @parser.definitions.parse_with_debug(with_comments).succeeds
    end
  end

  describe "A capture variable" do
    it "can be parsed" do
      @parser.capture_var.parse("@myvar").succeeds
    end
  end

  describe "A captured pattern" do
    let(:prog) { "@myvar = maybe 'abc'" }

    it "can be parsed (#capture)" do
      @parser.capture.parse(prog).succeeds
    end

    it "can be parsed (#program)" do
      @parser.parse("match #{prog} end").succeeds
    end
  end

  describe "A back reference" do
    let(:prog) { '@myvar' }

    it 'can be parsed (#capture)' do
      @parser.capture.parse(prog).succeeds
    end

    it 'can be parsed' do
      @parser.parse("match #{prog} end").succeeds
    end
  end

  describe "A one-line match clause" do
    it "can be parsed" do
      clause = <<-EOF
match `a~`x end
      EOF
      @parser.match_clause.parse_with_debug(clause).succeeds
    end
  end

  describe "A multiline match clause" do
    it "can be parsed" do
      clause = <<-EOF
match
`< "tag" WB
any ~`>
# blah blah blah
"</" "tag" `>
end
      EOF
      @parser.multiline_clause.parse_with_debug(clause).succeeds
    end
  end

  describe "An entire one-line program" do
    it "can be parsed" do
      @parser.parse_with_debug("match `a-`f end").succeeds
    end
  end

  describe "An entire program" do
    it "can be parsed" do
      dotted_quad = <<-EOF
dot = "."
num = "25" D5 | `2 D4 D | maybe D1 1,2*D
match WB num dot num dot num dot num WB end
      EOF
      @parser.program.parse_with_debug(dotted_quad).succeeds

      card_numbers = <<-EOF
# Warning: This one likely has errors!

visa = `4 12*D maybe 3*D
mc = `5 D5 14*D
amex = `3 '47' 13*D
diners = `3 (`0 D5 | '68' D) 11*D
discover = `6 ("011" | `5 2*D) 12*D
jcb = ("2131"|"1800"|"35" 3*D) 11*D

match visa | mc | amex | diners | discover | jcb end
      EOF
      @parser.program.parse_with_debug(card_numbers).succeeds
    end
  end
end
174
+