regexador 0.4.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,79 @@
1
# Forward declaration of Regexador. The parser and transform files required
# right after this abort with "Require out of order" unless the constant is
# already defined; the actual implementation is added when the class is
# reopened further down in this file.
class Regexador
  # Only a skeleton...
end
4
+
5
+ require_relative './regexador_parser'
6
+ require_relative './regexador_xform'
7
+
8
+ require 'parslet/convenience'
9
+
10
# Compiles a program written in the Regexador language into a Ruby Regexp and
# offers matching helpers on top of the compiled expression.
class Regexador
  # Parses +str+, transforms the parse tree into a regex tree and compiles it.
  #
  # @param str [String] Regexador source code
  # @param debug [Boolean] when true, echoes the source, the parse tree and
  #   the transformed tree, and parses with +parse_with_debug+ instead of
  #   +parse+
  def initialize(str, debug=false)
    @code = str
    if debug
      puts
      puts "---- Code: ------"
      puts str
      puts "-----------------"
    end

    @parser = Parser.new
    meth = debug ? :parse_with_debug : :parse
    @tree = @parser.send(meth, str)

    xform = Transform.new
    if debug
      puts "\n\nParser gives:"
      pp @tree
    end

    @regex_tree = xform.apply(@tree)
    # Stringify once: this string is the template that #match substitutes
    # parameters into, and it is what gets compiled below.
    @regex_str = @regex_tree.to_s
    if debug
      puts "\n\nTransform gives:"
      pp @regex_tree
    end

    @regex = Regexp.compile(@regex_str)
  end

  # @return [Regexp] the most recently compiled expression (reflects any
  #   parameters bound through #match)
  def to_regex
    @regex
  end

  # Matches +str+ against the compiled expression.
  #
  # @param str [String] text to search
  # @param hash [Hash] parameter bindings; each +(name){0}+ placeholder in the
  #   generated regex is replaced by the corresponding value. Bindings are
  #   remembered across calls and may be overridden by a later call.
  # @return [Object, nil] nil when nothing matches; otherwise an object with
  #   one reader per named capture plus +[]+, which maps each argument to its
  #   capture and returns the values as an Array
  def match(str, hash={})
    unless hash.empty?
      @params ||= {}
      hash.each_pair {|var, val| @params[var.to_s] = val }

      # Substitute into a copy so the template keeps its placeholders and a
      # parameter can be bound to a different value on a later call. The block
      # form inserts the value verbatim (no backslash-sequence expansion).
      source = @params.reduce(@regex_str) do |text, (var, val)|
        text.gsub(/\(#{Regexp.escape(var)}\)\{0\}/) { val.to_s }
      end
      @regex = Regexp.compile(source)
    end

    result = @regex.match(str)
    return nil if result.nil?

    # Logic below may change...

    obj = Object.new
    result.names.each do |name|
      obj.define_singleton_method(name) { result[name] }
    end
    obj.define_singleton_method(:[]) {|*args| args.map {|cvar| result[cvar] } }
    obj
  end

  # Same as #match but returns only whether a match was found.
  #
  # @return [Boolean]
  def match?(str, hash={})
    !!match(str, hash)    # Return Boolean
  end

  # Operator form of #match without parameters.
  #
  # @param other [String, #to_str]
  # @raise [ArgumentError] when +other+ cannot be converted to a String
  def =~(other)
    other = stringify(other)
    raise ArgumentError, "can't match against #{other.class}" unless String === other
    match(other)
  end

  private

  # Returns +obj+ as a String when it is one or responds to +to_str+;
  # otherwise returns it unchanged so the caller can reject it.
  def stringify(obj)
    return obj if String === obj
    return obj.to_str if obj.respond_to?(:to_str)
    obj
  end
end
@@ -0,0 +1,113 @@
1
+ require 'parslet'
2
+
3
+ abort "Require out of order" if ! defined? Regexador
4
+
5
# Declare the parser class together with its Parslet superclass up front.
# The files required right below, and the class body further down, reopen
# Regexador::Parser without restating the superclass.
class Regexador::Parser < Parslet::Parser
end
7
+
8
+ require_relative './chars' # These three files
9
+ require_relative './predefs' # reopen the class
10
+ require_relative './keywords' # Regexador::Parser
11
+
12
# Grammar of the Regexador language, written as Parslet rules.
#
# The cXXX atoms (single characters), kXXX atoms (keywords), +keyword+ and
# +predef+ are not defined in this class body; presumably they come from the
# chars, keywords and predefs files required before it — verify there.
# Labels attached with +.as(...)+ are the keys the transform rules match on.
class Regexador::Parser
  # Horizontal whitespace: one or more spaces or tabs.
  rule(:space) { match[" \t"].repeat(1) }
  rule(:space?) { space.maybe }
  # NOTE(review): "\s" in a double-quoted Ruby string is a literal space
  # character, not the regex class \s, so the second alternative matches only
  # a single space — confirm that is what was intended.
  rule(:white) { (endofline | match("\s")).repeat(1) }
  rule(:white?) { white.maybe }

  rule(:lower) { match('[a-z]') }
  rule(:upper) { match('[A-Z]') }

  # A comment is the hash atom, at least one space/tab, then everything up to
  # (not including) the newline atom. A hash immediately followed by text does
  # not satisfy this rule because +space+ requires one character or more.
  rule(:comment) { cHASH >> space >> (cNEWLINE.absent? >> any).repeat(0) }
  # End of a line: optional blanks, optional comment, then the newline atom.
  rule(:endofline) { space? >> comment.maybe >> cNEWLINE }

  rule(:digit) { match('[0-9]') }
  rule(:digits) { digit.repeat(1) }
  # NOTE(review): only lowercase a-f are accepted as hex digits here.
  rule(:hexdigit) { digit | match("[abcdef]") }
  # Bodies of double- and single-quoted literals; no escape mechanism, the
  # body simply stops at the first closing quote character.
  rule(:quoted) { match('[^"]').repeat(0) }
  rule(:single_quoted) { match("[^']").repeat(0) }
  rule(:graph_char) { match ("[[:graph:]]") } # { match('[!-~]') }
  # An identifier: starts with a lowercase letter, continues with lowercase
  # letters, the underscore atom or digits, and must not start with a keyword.
  rule(:name) { keyword.absent? >> lower >> (lower | cUNDERSCORE | digit).repeat(0) }

  # The three ways a name can appear: bare (variable reference), prefixed by
  # the at atom (capture variable), or prefixed by the colon atom (parameter).
  rule(:variable) { name.as(:var) }
  rule(:capture_var) { (cAT >> name.as(:cvar)) }
  rule(:parameter) { (cCOLON >> name.as(:param)) }

  rule(:posix_class) { cPERCENT >> name.as(:pclass) }

  rule(:string) { cQUOTE >> quoted.as(:string) >> cQUOTE }

  # Character classes: 'abc' and the negated form ~'abc'.
  rule(:simple_class) { cSQUOTE >> single_quoted.as(:char_class) >> cSQUOTE }
  rule(:negated_class) { cTILDE >> cSQUOTE >> single_quoted.as(:neg_class) >> cSQUOTE }
  rule(:char_class) { simple_class | negated_class }

  rule(:number) { digits }
  # Anything usable as a repetition count.
  rule(:numeric) { number | variable | parameter }

  # The ampersand atom followed by exactly four hex digits.
  rule(:codepoint) { cAMPERSAND >> (hexdigit >> hexdigit >> hexdigit >> hexdigit).as(:unicode) }

  # A single character: the tick atom plus one graphic character, or a codepoint.
  rule(:char) { (cTICK >> graph_char.as(:char)) | codepoint }

  # Two chars joined by the hyphen atom (range) or the tilde atom (negated range).
  rule(:simple_range) { char.as(:c1) >> cHYPHEN >> char.as(:c2) }
  rule(:negated_range) { char.as(:nr1) >> cTILDE >> char.as(:nr2) }
  rule(:range) { negated_range | simple_range }

  rule(:negated_char) { cTILDE >> char.as(:nchar) } # ~`x means /[^x]/

  # "@name" alone, or "@name = pattern"; the right-hand side is optional.
  rule(:capture) { capture_var.as(:lhs) >> space? >> (cEQUAL >> space? >> pattern.as(:rhs)).maybe }

  # Alternatives are tried in the order listed; the comment lines inside the
  # rule give an example of each alternative, in the same order.
  rule(:simple_pattern) { predef | range | negated_char | posix_class | string |
  # X `a-`c ~`a %name "abc"
  char_class | char | parameter | variable | capture }
  # 'abc' `a :param xyz @xyz = ...

  # A qualifier keyword followed by the pattern it applies to.
  rule(:qualifier) { (kANY | kMANY | kMAYBE | kNOCASE | kWITHIN | kESCAPING).as(:qualifier) >>
  fancy_pattern.as(:match_item) }

  # FIXME above: within and escaping can't really take an arbitrary pattern

  ###
  # Lookarounds, as sequences of the find / with / without keyword atoms:
  #   find X with Y, find X without Y, with Y find X, without Y find X
  rule(:pos_lookahead) { kFIND >> space >> simple_pattern.as(:findpat_ahead) >> space >>
  kWITH >> space >> simple_pattern.as(:pospat) }
  rule(:neg_lookahead) { kFIND >> space >> simple_pattern.as(:findpat_ahead) >> space >>
  kWITHOUT >> space >> simple_pattern.as(:negpat) }
  rule(:pos_lookbehind) { kWITH >> space >> simple_pattern.as(:pospat) >> space >>
  kFIND >> space >> simple_pattern.as(:findpat_behind) }
  rule(:neg_lookbehind) { kWITHOUT >> space >> simple_pattern.as(:negpat) >> space >>
  kFIND >> space >> simple_pattern.as(:findpat_behind) }
  rule(:lookaround) { pos_lookahead | neg_lookahead | pos_lookbehind | neg_lookbehind }
  ###

  # Repetition: "n * pattern" or "n,m * pattern". repeat2 is tried first so
  # the two-count form is not cut short by the one-count form.
  rule(:repeat1) { numeric.as(:num1) }
  rule(:repeat2) { repeat1 >> cCOMMA >> numeric.as(:num2) }
  rule(:repetition) { (repeat2 | repeat1) >> space? >> cTIMES >> space? >> fancy_pattern.as(:match_item) }

  rule(:parenthesized) { cLPAREN >> space? >> pattern >> space? >> cRPAREN }

  # One pattern element with optional surrounding blanks; repetition is tried
  # before simple_pattern.
  rule(:fancy_pattern) { space? >> (repetition | simple_pattern | qualifier | lookaround | parenthesized) >> space? }
  # num `~"' keyword find/with (

  # One or more elements in a row.
  rule(:concat) { (fancy_pattern >> (space? >> fancy_pattern).repeat(0)).as(:sequence) }

  # One or more sequences separated by the bar atom.
  rule(:pattern) { (concat >> space? >> (cBAR >> space? >> concat).repeat(0)).as(:alternation) }

  rule(:rvalue) { pattern | numeric } # a string is-a pattern

  rule(:assignment) { space? >> name.as(:var) >> space? >> cEQUAL >> space? >> rvalue.as(:rvalue) }

  # Zero or more lines, each either empty/comment-only or an assignment.
  rule(:definitions) { (endofline | assignment >> endofline).repeat(0) }

  # "match <pattern> end" on one line.
  rule(:oneline_clause) { space? >> kMATCH >> space? >> pattern >> kEND >> endofline.maybe }

  rule(:single_line) { endofline | space? >> pattern >> endofline }

  # "match", then one or more pattern/empty lines, then "end".
  rule(:multiline_clause) { space? >> kMATCH >> endofline >> single_line.repeat(1).as(:lines) >> space? >>
  kEND >> endofline.maybe }

  rule(:match_clause) { multiline_clause | oneline_clause }

  # A whole program: definitions, one match clause, optional trailing lines.
  rule(:program) { definitions.as(:definitions) >> match_clause.as(:match) >> endofline.repeat(0) }

  root(:program)
end
113
+
@@ -0,0 +1,180 @@
1
+ require 'parslet'
2
+
3
+ abort "Require out of order" if ! defined? Regexador
4
+
5
+ class Regexador::Transform < Parslet::Transform
6
# Base class of every node in the regex tree. Concrete node classes are
# produced by Node.make; calling #to_s on a node yields its regex fragment.
class Node
  # Builds a Node subclass with one accessor per field.
  #
  # @param fields [Array<Symbol>] attribute names, in constructor order
  # @param block [Proc, nil] body of the subclass's #to_s; when omitted the
  #   subclass keeps the base #to_s, which raises NotImplementedError
  # @return [Class] the new subclass
  def self.make(*fields, &block)
    ::Class.new(self) do
      fields.each {|field| attr_accessor field }
      define_method(:fields) { fields.dup }
      # define_method rejects a nil block, so only override when one is given.
      define_method(:to_s, &block) if block
    end
  end

  # Assigns +values+ to the fields positionally; missing values become nil
  # and surplus values are ignored.
  def initialize(*values)
    fields.zip(values) {|f, v| send("#{f}=", v) }
  end

  # @raise [NotImplementedError] unless the subclass supplies its own #to_s
  def to_s
    raise NotImplementedError,
          "Please implement #to_s for #{short_name}."
  end

  # Lets a node be used wherever an implicit String conversion happens.
  def to_str
    to_s
  end

  # Class name without its namespace. Also works for classes that are not
  # namespaced or have no name at all (falls back to the class's #to_s).
  def short_name
    (self.class.name || self.class.to_s).split('::').last
  end

  # Renders as ShortName(field=value, ...) without calling #to_s on the
  # fields, so inspecting a tree does not trigger regex generation.
  def inspect
    data = fields.map {|f| "#{f}=#{send(f).inspect}" }.join(', ')
    "#{short_name}(#{data})"
  end
end
39
+
40
+ # Later: Remember escaping for chars (char, c1, c2, nchar, ...)
41
+
42
# Node classes of the regex tree. Each block passed to Node.make is the
# node's #to_s and returns the regex fragment for that node; interpolating a
# child node calls the child's #to_s in turn.

# A single character; escaped so regex metacharacters stay literal.
XChar = Node.make(:char) do
  Regexp.escape(char)
end

CharRange = Node.make(:c1, :c2) { "[#@c1-#@c2]" }
NegatedRange = Node.make(:nr1, :nr2) { "[^#@nr1-#@nr2]" }
NegatedChar = Node.make(:nchar) { "[^#@nchar]" } # More like a range really
POSIXClass = Node.make(:pclass) { "[[:#@pclass:]]" }
CharClass = Node.make(:char_class) { "[#@char_class]" }
NegatedClass = Node.make(:neg_class) { "[^#@neg_class]" }

# A predefined token; its regex text is looked up in the parser's
# Predef2Regex table under the key "p<token>".
Predefined = Node.make(:pre) do
  sym = "p#@pre".to_sym
  str = Regexador::Parser::Predef2Regex[sym]
  raise "#@pre is not handled yet" if str.nil?
  str
end

StringNode = Node.make(:string) { Regexp.escape(string.to_s) }
Repeat1 = Node.make(:num1, :match_item) { "(#@match_item){#@num1}" }
Repeat2 = Node.make(:num1, :num2, :match_item) { "(#@match_item){#@num1,#@num2}" }
Any = Node.make(:match_item) { "(#@match_item)*" }
Many = Node.make(:match_item) { "(#@match_item)+" }
Maybe = Node.make(:match_item) { "(#@match_item)?" }
Nocase = Node.make(:match_item) { "((?i)#@match_item)" }

# Parenthesized lookaround variants. Each fragment interpolates exactly the
# fields the node declares (previously three of them referenced instance
# variables that are never set, which silently produced empty fragments).
FindWith = Node.make(:findpat_ahead, :pospat) { "((?=#@findpat_ahead#@pospat)#@findpat_ahead)" }
FindWithout = Node.make(:findpat_ahead, :negpat) { "((?!#@findpat_ahead#@negpat)#@findpat_ahead)" }
WithFind = Node.make(:pospat, :findpat_behind) { "((?<=#@pospat)#@findpat_behind)" }
WithoutFind = Node.make(:negpat, :findpat_behind) { "((?<!#@negpat)#@findpat_behind)" }

Within = Node.make(:delim) { "(#@delim.*?#@delim)" } # /x[^y]*?y/
# NOTE(review): the fragment generated here does not have the shape shown in
# the example comment below (no opening delimiter, no group) — confirm which
# one is intended before relying on "escaping".
Escaping = Node.make(:delim) { "\\#@delim|[^#@delim]*?#@delim" }
# escaping `" # /"(\\"|[^"])*?"/

Sequence = Node.make(:elements) { elements.map(&:to_s).join }
Alternation = Node.make(:elements) { '(' + elements.map(&:to_s).join('|') + ')' }

Assignment = Node.make(:var, :rvalue) { "" } # Doesn't actually translate directly.

# A variable reference: expands to whatever is currently bound to the name.
# An unbound name, or a program in which nothing was ever stored, yields "".
Usage = Node.make(:var) do
  bindings = Assignment.bindings || {}
  bindings[var.to_s].to_s
end

Program = Node.make(:definitions, :match) do
  # NOTE Since we're using to_s for conversion to regular expression,
  # debugging cannot be done using string interpolation, otherwise we
  # call things out of order just by debug-printing them!
  #
  # puts "In Program: #{match}" # Don't do this
  # puts "In Program: #{match.inspect}" # But this is OK
  definitions.each {|d| d.store }
  match.to_s
end

class Assignment < Node # For clarity: Really already is-a Node
  class << self
    # Hash of variable name => bound node. It lives on the class, so it is
    # shared by every program that gets transformed.
    attr_accessor :bindings
  end

  # Records this assignment in the shared bindings table.
  def store
    # puts "Storing #@var = #{@rvalue.inspect}"
    hash = self.class.bindings ||= {}

    hash[@var.to_s] = @rvalue # Late binding
    # hash[@var.to_s] = @rvalue.to_s # Early binding
    # Think about the difference... :)
  end
end

Captured = Node.make(:cname, :pattern) { "(?<#@cname>#@pattern)" }
Backref = Node.make(:name) { "\\k<#@name>" }

# Emits a "(name){0}" placeholder that matches the empty string until a
# value is substituted for it.
Parameter = Node.make(:param) { "(#{param}){0}" }

PosAhead = Node.make(:pla1, :pla2) { "(?=#@pla1#@pla2)#@pla1" }
NegAhead = Node.make(:nla1, :nla2) { "(?!#@nla1#@nla2)#@nla1" }
PosBehind = Node.make(:plb1, :plb2) { "(?<=#@plb1)#@plb2" }
NegBehind = Node.make(:nlb1, :nlb2) { "(?<!#@nlb1)#@nlb2" }
117
+
118
+ # Actual transformation rules
119
+
120
# Each rule matches one hash shape produced by the parser (the keys are the
# parser's .as(...) labels) and replaces it with the corresponding node.
# Inside a rule block the matched values are read through methods named
# after the bindings.

rule(:char => simple(:ch)) { XChar.new(ch) }
# Four hex digits -> the character with that codepoint. A fresh string is
# built with pack instead of appending to a string literal.
rule(:unicode => simple(:hex4)) { StringNode.new([Integer(hex4.to_s, 16)].pack('U')) }

rule(:string => simple(:string)) { StringNode.new(string) }
# When the string is empty, parslet returns an empty array for lack of content.
# Map that to the empty string node.
rule(:string => sequence(:string)) { StringNode.new('') }

rule(:c1 => simple(:c1), :c2 => simple(:c2)) { CharRange.new(c1, c2) }

rule(:nr1 => simple(:nr1), :nr2 => simple(:nr2)) { NegatedRange.new(nr1, nr2) }
rule(:nchar => simple(:nchar)) { NegatedChar.new(nchar) } # Don't forget escaping

rule(:pclass => simple(:pclass)) { POSIXClass.new(pclass) }

rule(:char_class => simple(:char_class)) { CharClass.new(char_class) }
rule(:neg_class => simple(:neg_class)) { NegatedClass.new(neg_class) }

rule(:predef => simple(:content)) { Predefined.new(content) }

rule(:num1 => simple(:num1), :match_item => simple(:match_item)) { Repeat1.new(num1, match_item) }

rule(:num1 => simple(:num1), :num2 => simple(:num2), :match_item => simple(:match_item)) { Repeat2.new(num1, num2, match_item) }

# One rule per qualifier keyword.
rule(:qualifier => 'any', :match_item => simple(:match_item)) { Any.new(match_item) }
rule(:qualifier => 'many', :match_item => simple(:match_item)) { Many.new(match_item) }
rule(:qualifier => 'maybe', :match_item => simple(:match_item)) { Maybe.new(match_item) }
rule(:qualifier => 'nocase', :match_item => simple(:match_item)) { Nocase.new(match_item) }
rule(:qualifier => 'within', :match_item => simple(:match_item)) { Within.new(match_item) }
rule(:qualifier => 'escaping', :match_item => simple(:match_item)) { Escaping.new(match_item) }

# Lookaheads and lookbehinds.
rule(:findpat_ahead => simple(:pla1), :pospat => simple(:pla2)) { PosAhead.new(pla1, pla2) }
rule(:findpat_ahead => simple(:nla1), :negpat => simple(:nla2)) { NegAhead.new(nla1, nla2) }
rule(:pospat => simple(:plb1), :findpat_behind => simple(:plb2)) { PosBehind.new(plb1, plb2) }
rule(:negpat => simple(:nlb1), :findpat_behind => simple(:nlb2)) { NegBehind.new(nlb1, nlb2) }

# Uses the binding methods like every other rule, rather than instance
# variables of the block's evaluation context.
rule(:var => simple(:var), :rvalue => simple(:rvalue)) { Assignment.new(var, rvalue) }

rule(:param => simple(:param)) { Parameter.new(param) }

# A single alternative or element is passed through unwrapped.
rule(:alternation => simple(:pattern)) { pattern }
rule(:alternation => sequence(:alternatives)) { Alternation.new(alternatives) }

rule(:sequence => simple(:element)) { element }
rule(:sequence => sequence(:elements)) { Sequence.new(elements) }

# A series of statements on different lines is also a sequence.
rule(:lines => sequence(:lines)) { Sequence.new(lines) }

rule(:var => simple(:name)) { Usage.new(name) }

rule(:definitions => sequence(:definitions), :match => simple(:match)) { Program.new(definitions, match) }
rule(:definitions => sequence(:definitions), :match => sequence(:match)) { Program.new(definitions, match) }

# An expression of the form '@variable'
rule(:lhs => {:cvar => simple(:backref)}) { Backref.new(backref) }

# An expression of the form '@variable = expr'
rule(:lhs => {:cvar => simple(:cname)}, :rhs => simple(:pattern)) { Captured.new(cname, pattern) }
179
+ end
180
+
@@ -0,0 +1,174 @@
1
+ # Encoding: UTF-8
2
+ require './spec/testing'
3
+
4
# Spec helper added to every object so a parse result can be followed by
# `.succeeds`: the expectation holds when the receiver is not nil.
class Object
  def succeeds
    should_not == nil
  end
end
9
+
10
# Parser-level specs: each example feeds source text to one grammar rule (or
# to the whole parser) and expects a non-nil parse result via `succeeds`.
describe Regexador do
  # One parser instance is shared by all examples. @pattern is assigned here
  # but not referenced by any example in this file.
  before(:all) do
    @parser = Regexador::Parser.new
    @pattern = @parser.pattern
  end

  # Single-character atoms.
  describe "A special character" do
    it "can be matched correctly" do
      @parser.cSQUOTE.parse_with_debug("'").succeeds
      @parser.cHASH.parse('#').succeeds
      @parser.cNEWLINE.parse("\n").succeeds
      @parser.cEQUAL.parse('=').succeeds
    end
  end

  # Non-ASCII characters after the backtick.
  describe "An international character" do
    it "can follow a backtick" do
      @parser.char.parse_with_debug("`æ").succeeds
      @parser.char.parse("`ß").succeeds
      @parser.char.parse("`ç").succeeds
      @parser.char.parse("`ö").succeeds
      @parser.char.parse("`ñ").succeeds
    end
  end

  # Ampersand plus four hex digits (lowercase letters only in these inputs).
  describe "A Unicode codepoint expression" do
    it "can be matched" do
      @parser.codepoint.parse_with_debug("&1234").succeeds
      @parser.codepoint.parse('&beef').succeeds
    end
  end

  # Generates one example per listed predefined token.
  describe "A predefined token" do
    %w(BOS EOS START END).each do |token|
      describe token do
        it 'matches using pattern' do
          @parser.pattern.parse_with_debug(token).succeeds
        end
      end
    end
  end

  # Assignments with varying spacing around "=" and different right-hand sides.
  describe "An assignment" do
    it "can be parsed" do
      @parser.assignment.parse("a = 5").succeeds
      @parser.assignment.parse("a= 5").succeeds
      @parser.assignment.parse("a =5").succeeds
      @parser.assignment.parse("a=5").succeeds
      @parser.assignment.parse("myvar = 'xyz'").succeeds
      @parser.assignment.parse('var2 = "hello"').succeeds
      @parser.assignment.parse('this_var = `x-`z').succeeds
      @parser.assignment.parse_with_debug('pat = maybe many `x-`z').succeeds
    end
  end

  # The only negative example: a keyword is rejected as a variable name.
  describe "A keyword used as a variable name" do
    it "will not parse" do
      @parser.assignment.should_not parse("end = 'hello'")
      # @parser.assignment.parse("endx = 'hello'")
    end
  end

  # Definitions mixed with comment lines and an empty line.
  describe "A definition section" do
    it "can be parsed" do
      defs1 = "a = 5\nstr = \"hello\"\n"
      @parser.definitions.parse_with_debug(defs1).succeeds
      defs2 = <<-EOF
a = 5
# comment...
pat = maybe many `a-`c
# empty line follows:

str = "hello"
# another comment...
      EOF
      @parser.definitions.parse_with_debug(defs2).succeeds
    end
  end

  describe "A capture variable" do
    it "can be parsed" do
      str1 = "@myvar"
      @parser.capture_var.parse(str1).succeeds
    end
  end

  # "@name = pattern", through the capture rule and as a whole program.
  describe "A captured pattern" do
    let(:prog) { "@myvar = maybe 'abc'" }

    it "can be parsed (#capture)" do
      @parser.capture.parse(prog).succeeds
    end
    it "can be parsed (#program)" do
      @parser.parse("match #{prog} end").succeeds
    end
  end
  # "@name" alone, through the same capture rule and as a whole program.
  describe "A back reference" do
    let(:prog) { '@myvar' }

    it 'can be parsed (#capture)' do
      @parser.capture.parse(prog).succeeds
    end
    it 'can be parsed' do
      @parser.parse("match #{prog} end").succeeds
    end
  end


  describe "A one-line match clause" do
    it "can be parsed" do
      mc1 = <<-EOF
match `a~`x end
      EOF
      @parser.match_clause.parse_with_debug(mc1).succeeds
    end
  end

  # Includes a comment line inside the clause.
  describe "A multiline match clause" do
    it "can be parsed" do
      mc2 = <<-EOF
match
`< "tag" WB
any ~`>
# blah blah blah
"</" "tag" `>
end
      EOF
      @parser.multiline_clause.parse_with_debug(mc2).succeeds
    end
  end

  describe "An entire one-line program" do
    it "can be parsed" do
      prog = "match `a-`f end"
      @parser.parse_with_debug(prog).succeeds
    end
  end


  # Full programs: definitions followed by a match clause. Only parsing is
  # checked; nothing here runs the transform or the resulting regex.
  describe "An entire program" do
    it "can be parsed" do
      prog1 = <<-EOF
dot = "."
num = "25" D5 | `2 D4 D | maybe D1 1,2*D
match WB num dot num dot num dot num WB end
      EOF
      @parser.program.parse_with_debug(prog1).succeeds

      prog2 = <<-EOF
# Warning: This one likely has errors!

visa = `4 12*D maybe 3*D
mc = `5 D5 14*D
amex = `3 '47' 13*D
diners = `3 (`0 D5 | '68' D) 11*D
discover = `6 ("011" | `5 2*D) 12*D
jcb = ("2131"|"1800"|"35" 3*D) 11*D

match visa | mc | amex | diners | discover | jcb end
      EOF
      @parser.program.parse_with_debug(prog2).succeeds
    end
  end
end
174
+