reg 0.4.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,49 @@
1
+ =begin copyright
2
+ reg - the ruby extended grammar
3
+ Copyright (C) 2005 Caleb Clausen
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18
+ =end
19
module Kernel
  # Forward (delegate) one or more methods to something else. This is
  # sort of like aliasing a method, but the receiver can be changed as well.
  #
  # target is where the methods are forwarded to (the new receiver).
  # It may be a Module or Class, or a String or Symbol containing the
  # name of an instance, class, or global variable. Actually, the target
  # string may contain any code; it is evaluated inside the generated
  # method, i.e. in the context of the object the forwarder is called on.
  # Because it is evaluated as code, never build it from untrusted input.
  #
  # names are the methods to forward. If the last argument is a Hash, it
  # lists methods whose names change while forwarding, in the form
  #   :localname=>:targetname
  #
  # When the receiver is a Module or Class the forwarders become its
  # instance methods; for any other receiver they are defined as
  # singleton methods of that one object. Returns nil.
  protected
  def forward_to(target,*names)
    # a Module/Class target is referred to by its absolute constant path
    target = "::#{target}" if Module === target

    # optional trailing hash: local name => name on the target
    renamed = Hash === names.last ? names.pop : {}

    code = renamed.map { |myname, targetname|
      "def #{myname}(*args,&block) (#{target}).#{targetname}(*args,&block) end\n"
    }
    code.concat(names.map { |name|
      "def #{name}(*args,&block) (#{target}).#{name}(*args,&block) end\n"
    })

    # Array#to_s stopped concatenating in ruby 1.9, so join explicitly.
    # Evaluate inside the receiver: a bare eval here would define the
    # forwarders on Kernel (the lexical scope) and leak them everywhere.
    definee = Module === self ? self : (class << self; self; end)
    definee.module_eval code.join, __FILE__, __LINE__
    nil
  end
end

class Module
  # classes and modules may call forward_to with an explicit receiver
  public :forward_to
end
@@ -0,0 +1,47 @@
1
+ =begin copyright
2
+ reg - the ruby extended grammar
3
+ Copyright (C) 2005 Caleb Clausen
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18
+ =end
19
+ require 'test/unit'
20
class Object
  # Define a more stable version of inspect (for testing purposes):
  # objects that use the default "#<Class:0x... ...>" form are shown as
  # "#<Class: @a=... @b=...>" with instance variables sorted by name and
  # no object address. Anything whose inspect does not start with "#<"
  # is left untouched.

  # keep the original implementation reachable; the guard prevents the
  # alias from pointing at the override if this file is loaded twice
  alias_method :pristine_inspect, :inspect unless method_defined?(:pristine_inspect)

  def inspect
    res = pristine_inspect
    return res unless res =~ /^#</

    # one "@name=value" field per instance variable, in name order
    fields = instance_variables.sort.map { |v|
      "#{v}=#{instance_variable_get(v).inspect}"
    }
    # join explicitly (Array#to_s no longer concatenates) and keep every
    # variable; only the separator after the last field is omitted
    "#<#{self.class}: #{fields.join(' ')}>"
  end
end
34
class T411 < Test::Unit::TestCase
  # Smoke test for unnamed item_that matchers: a comparison matcher must
  # answer === with plain true/false, and a respond_to? matcher must not
  # throw when applied to a string.
  def test_unnamed
    # results are assigned to locals only so the expressions are "used"
    _ = require 'reg'

    matched = ((item_that < 4) === 3)
    assert_equal 'true', matched.inspect

    matched = ((item_that < 4) === 5)
    assert_equal 'false', matched.inspect

    assert_nothing_thrown do
      _ = (item_that.respond_to?(false) === 'ddd')
    end
  end
end
47
+
@@ -0,0 +1,200 @@
1
+ =begin copyright
2
+ reg - the ruby extended grammar
3
+ Copyright (C) 2005 Caleb Clausen
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18
+ =end
19
+ require "forward_to"
20
+
21
# Patch applied to the built-in numeric classes so that comparing a number
# with a NumberSet::Part works: the question is handed to the Part and the
# sign of its answer is flipped (nil stays nil). Every other comparison goes
# to the original <=>, kept under the name spaceship__withoutpirates.
SpaceshipPirate=proc{
  # applying the patch twice would alias the patched <=> onto itself and
  # recurse forever, so skip classes that already carry it
  unless method_defined?(:spaceship__withoutpirates)
    alias_method :spaceship__withoutpirates, :<=>
    def <=>(other)
      if NumberSet::Part===other
        res=other<=>self
        res and -res
      else
        spaceship__withoutpirates other
      end
    end
  end
}

# Take the classes from literals instead of naming Fixnum/Bignum, which no
# longer exist on current rubies; uniq collapses them to Integer where the
# two are unified. class_eval with a block is required here: instance_eval
# does not accept a Proc as a positional argument.
[1.class, (2**100).class, 1.0.class].uniq.each{|cl| cl.class_eval(&SpaceshipPirate) }
35
+
36
+ class NumberSet
37
+ #a set of numbers kept as a list of pieces in @pieces; membership is tested with ===. NOTE(review): this class looks unfinished -- insert has no body or closing end, and === ends on a dangling 'and'
38
+ def initialize(*pieces) #hands every argument to insert
39
+ pieces.map {|r| self.insert r} #NOTE(review): insert is declared without parameters below, and @pieces is never assigned in this class
40
+ end
41
+ class<<self
42
+ alias [] new #NumberSet[...] is shorthand for NumberSet.new(...)
43
+ end
44
+
45
+ forward_to :@pieces, :[] #delegate NumberSet#[] to @pieces (forward_to comes from the file required above)
46
+
47
+ def insert #NOTE(review): unfinished -- takes no argument, low/high are undefined here, the case has no when clauses and the def is never closed
48
+ mid=high-low/2 #NOTE(review): parses as high-(low/2); a midpoint would be (high+low)/2 -- confirm intent
49
+ case
50
+ end
51
+ #binary search of @pieces for num using <=>: true on an exact match, false when <=> answers nil
52
+ def ===(num)
53
+ @pieces.empty? and return #an empty set matches nothing (returns nil)
54
+
55
+ low,high=0,@pieces.size-1
56
+
57
+ loop {
58
+ case num <=> @pieces[mid=high-low/2] #NOTE(review): same precedence question as in insert
59
+ when 1: low=mid+1 #'when x:' with a colon is ruby 1.8 syntax
60
+ when 0: return true
61
+ when -1: high=mid-1
62
+ when nil: return false
63
+ else fail "didn't expect anything else from <=>"
64
+ end
65
+ (-1..1)===high-low and #NOTE(review): expression is incomplete -- nothing follows 'and', so no exit is visible for an exhausted search
66
+ }
67
+ end
68
+ #abstract base for the pieces of a NumberSet; the subclasses below supply ===, first and last
69
+ class Part
70
+ def initialize
71
+ abstract #NOTE(review): 'abstract' is not defined in this file -- presumably raises; confirm
72
+ end
73
+
74
+ def === #NOTE(review): declared without a parameter, unlike ===(num) in the subclasses
75
+ abstract
76
+ end
77
+
78
+ def first
79
+ abstract
80
+ end
81
+
82
+ def last
83
+ abstract
84
+ end
85
+ #compare this piece with a number or with another Part; falls through to nil when other lies within first..last but self===other is false
86
+ def <=>(other)
87
+ if Part===other
88
+ result=(self<=>other.first)
89
+ return(result == (self<=>other.last) and result) #the shared answer when both ends of other compare alike, otherwise false
90
+ end
91
+
92
+ if first> other: -1 #NOTE(review): answers -1 when other is below first, the reverse of the usual <=> sign -- check against the negation in SpaceshipPirate
93
+ elsif last < other: 1
94
+ elsif self===other: 0
95
+ end
96
+ #else other's in our range, but not in the bitset, what else to do?
97
+ end
98
+
99
+ end
100
+ #a run of numbers from first to last, with the end left out when exclude_end is set; inside NumberSet this name shadows ::Range
101
+ class Range < Part
102
+ include Enumerable
103
+
104
+ def initialize(first,last=nil,exclude_end=nil)
105
+ last or first,last,exclude_end=first.first,first.last,first.exclude_end? #with a single argument, unpack it as a range-like object. NOTE(review): check that 'or' followed by a multiple assignment parses
106
+ @first,@last,@exclude_end=first,last,exclude_end||nil #a false exclude_end is stored as nil
107
+ end
108
+ class <<self; alias [] new; end
109
+
110
+ attr_reader :first,:last
111
+ alias begin first
112
+ alias end last
113
+
114
+ def exclude_end?; @exclude_end end
115
+ #true when num is at least @first and below @last (or equal to it, unless the end is excluded)
116
+ def ===(num)
117
+ lt=@exclude_end && :< || :<= #operator used for the upper bound
118
+ num>=@first and num.send lt,@last
119
+ end
120
+ alias member? ===
121
+ alias include? ===
122
+ #"first..last", with three dots when the end is excluded
123
+ def to_s
124
+ "#{@first}..#{@exclude_end && "."}#{@last}"
125
+ end
126
+ alias inspect to_s
127
+ #true when other is a NumberSet::Range or a ::Range with eql? ends and the same exclude_end? value
128
+ def eql?(other)
129
+ Range===other||::Range===other and
130
+ @first.eql? other.first and
131
+ @last.eql? other.last and
132
+ @exclude_end==other.exclude_end? #NOTE(review): @exclude_end is nil, never false, so a ::Range answering false here compares unequal -- confirm
133
+ end
134
+ #yield each item from @first onwards; @last itself only when the end is included. returns self
135
+ def each
136
+ item=@first
137
+ until item==@last #NOTE(review): never terminates if stepping passes @last without ever equalling it
138
+ yield item
139
+ item=item.succ! #NOTE(review): relies on succ!; Integer has succ but no succ! -- confirm the expected item types
140
+ end
141
+ yield item unless @exclude_end
142
+ return self
143
+ end
144
+ #like each, but yields only every skipcnt-th item, starting with @first. returns self
145
+ def step(skipcnt)
146
+ item=@first
147
+ cnt=1 #counts down to the next item to yield; 1 means "yield the next one"
148
+ until item==@last
149
+ if (cnt-=1).zero?
150
+ cnt=skipcnt
151
+ yield item
152
+ end
153
+ item=item.succ!
154
+ end
155
+ yield item unless @exclude_end || cnt!=1 #@last is yielded only if it is included and falls on a step
156
+ return self
157
+ end
158
+ end
159
+
160
+ #a bitmap piece: members are set bits in @bits, 8 numbers per entry, counted from @base. NOTE(review): no initialize here, so nothing visible assigns @base/@bits and the inherited Part#initialize calls abstract
161
+ class Fragment < Part
162
+ include Enumerable
163
+
164
+ attr_reader :base, :bits
165
+
166
+ #lowest member. NOTE(review): assert and ffs are not defined in this file; ffs is presumably a 1-based "find first set bit" -- confirm
167
+ def begin
168
+ assert @bits[0].nonzero?
169
+ @base+@bits[0].ffs-1
170
+ end
171
+ #highest member. NOTE(review): fls is not defined in this file; presumably a 1-based "find last set bit" -- confirm
172
+ def end
173
+ assert @bits[-1].fls.nonzero?
174
+ @base+ (@bits.length-1)*8 + @bits[-1].fls-1
175
+ end
176
+
177
+ alias first begin
178
+ alias last end
179
+ #test the bit for num: false below @base, otherwise the masked bit (non-zero) or nil
180
+ def ===(num)
181
+ num-=@base
182
+ num<0 and return false
183
+ bitidx=num&7 #position within the entry
184
+ byteidx=(num&~7)>>3 #which entry of @bits
185
+ (@bits[byteidx]&(1<<bitidx)).nonzero? #NOTE(review): there is no upper bound check on byteidx -- confirm what @bits[byteidx] yields past the end
186
+ end
187
+ #yield every member, entry by entry, clearing each bit once reported (assuming ffs finds the lowest set bit). returns self
188
+ def each
189
+ (0...@bits.size).each{|idx|
190
+ bits=@bits[idx]
191
+ until bits.zero?
192
+ bit=bits.ffs-1
193
+ yield @base + idx*8 + bit
194
+ bits &= ~(1<<bit)
195
+ end
196
+ }
197
+ return self
198
+ end
199
+ end
200
+ end
@@ -0,0 +1,188 @@
1
+ Lalr(n) parsing with reg
2
+
3
+ Yesterday, I introduced the Ruby Extended Grammar, a pattern matching
4
+ library for ruby data. Astute readers may have noticed a slight
5
+ misnomer. Reg is not a grammar (parser), nor a tool for grammars. It's
6
+ really just a very fancy regular expression engine. Regular expressions
7
+ are equivalent to state machines. State machines are not powerful
8
+ enough by themselves to solve interesting parsing problems -- that is,
9
+ how to parse a language like ruby with infix operators of different
10
+ precedence and associativity.
11
+
12
+ Handling precedence and associativity requires a lalr(1) parser. Let me
13
+ explain briefly the lalr algorithm:
14
+
15
+ The important lalr data structures are the stack and input. The input
16
+ is simply a stream of tokens fed into the parser, as it requests them. The
17
+ next token(s) waiting to be taken off the input is called the lookahead.
18
+ The stack contains the results of partially parsed expressions. At each step
19
+ of the parse process, the parser decides (based on what's found at the top
20
+ of the stack and in the lookahead) whether to shift another token off the
21
+ input onto the stack or to reduce some of the tokens at the top of the stack
22
+ using the rules of the language's grammar. At the end, we expect to see the
23
+ input empty and on the stack a single token, which represents the parse tree
24
+ of the entire program.
25
+
26
+ Normal parsers (also called compiler compilers) use a big complicated
27
+ table to decide at runtime whether to shift or reduce and, if reducing, which
28
+ rule to reduce by. This table represents the compiled form of the language
29
+ grammar. That's why they're called compiler compilers. My approach is rather
30
+ different, and might best be described as an interpreter interpreter. (Or, if
31
+ it's to be used in a compiler, it would be a compiler interpreter.)
32
+
33
+ Instead of shifting or choosing one rule to match at each step, each rule is
34
+ given a chance to match, and when none can, then the input is shifted. Reg
35
+ is used as the pattern matching engine, and a small wrapper layer manages
36
+ the parser data structures and invokes reg at each step to do a match
37
+ attempt. I believe this approach is in general equivalent to the normal lalr
38
+ algorithm.
39
+
40
+ Yesterday's reg release contained a sketch of these ideas in the form of a parser for a
41
+ small, bc-like calculator language, in calc.reg. I've also reproduced it below. Basically, it's
42
+ a subset of ruby with only local variables, numbers, a few operators (+, -, *, /,
43
+ =, ;), parentheses, and p as the sole function. Although small,
44
+ parsing this language is a representative problem because it requires solving
45
+ precedence and associativity.
46
+
47
+ The heart of the parser is its set of grammar rules, reproduced here:
48
+
49
+ #last element is always lookahead
50
+ Reduce=
51
+ -[ -[:p, '(', exp, ')'].sub {PrintExp.new BR[2]}, OB ] | # p(exp)
52
+ -[ -['(', exp, ')'] .sub {BR[1]}, OB ] | # (exp)
53
+ -[ -[exp, leftop, exp] .sub {OpExp.new *BR[0..2]}, regproc{lowerop(BR[1])} ] | # exp+exp
54
+ -[ exp, -[';'] .sub [], :EOI ] | #elide final trailing ;
55
+ -[ -[name, '=', exp] .sub {AssignExp.new BR[0],BR[2]}, lowerop('=') ] #name=exp
56
+
57
+ Precedence is handled by the middle rule. This rule reduces infix operator
58
+ expressions (except =). It only matches if the lookahead does not contain a
59
+ higher precedence operator. This ensures that expressions like '3+4*5' will
60
+ parse correctly.
61
+
62
+ Associativity is handled by the last rule. = is the only right-associative
63
+ operator, so it's the only one that has to be handled specially. Again, it
64
+ allows a reduce only if the lookahead is not also right-associative (and lower
65
+ precedence...). This ensures that expressions like 'a=b=c' will parse
66
+ correctly.
67
+
68
+ The great advantage of the interpreter interpreter is flexibility. It would
69
+ be quite easy to extend this parser -- even at runtime -- by adding things
70
+ at the right place in Reduce. The disadvantage is performance, which is
71
+ likely to be very bad currently. The current implementation of reg is not
72
+ optimized to any great extent. Many regexp-type optimizations could be
73
+ applied to reg. Optimized regexp engines can actually be quite fast, so,
74
+ (aside from performance issues with ruby itself) an optimized reg might
75
+ actually be competitive with a table-based parser in terms of performance.
76
+ Keep in mind that table-based parsers are not actually the fastest; the
77
+ gold standard is hand-coded or direct execution parsers.
78
+
79
+ Error detection is an area that might be troublesome. I haven't given this
80
+ a lot of thought yet, but I think it's approachable, without
81
+ causing too much pain. One way might be to wait until a synchronizing
82
+ token, then report errors.
83
+
84
+
85
+ Some comments made by florian pflug have clarified things for me:
86
+
87
+ Hm.. I believe it's not that different. The tables of an LR(k) parser
88
+ specify for each input symbol, and each top-of-stack
89
+ a) An action (either shift, or "reduce p" where p is a rule
90
+ ( a production) of your grammar
91
+ b) A "goto" - the new state the parser shall transition to.
92
+
93
+ You represent the "action" table implicitly - you scan
94
+ the rules for every symbol you read, and decide to shift
95
+ or to reduce based on that, instead of looking into a predefined
96
+ table. Therefore, you just trade compiler-compile time for runtime -
97
+ but the mechanism is the same.
98
+
99
+ The goto table is entirely absent in your approach - but this
100
+ stems from the fact that you don't _need_ to remember a state.
101
+ The state of a table-based LR(k) parser is just an "abbreviation"
102
+ for the current state of the stack. A table-based LR(k) parser
103
+ decides whether to shift or to reduce _solely_ based on the current
104
+ input symbol, and the top-of-the-stack. It therefore needs a state,
105
+ to "remember" what it put on the stack previously. Each state
106
+ of a LR(k) parser represents a _single_ production (or rule) - but
107
+ a rule can be represented by more than one state.
108
+
109
+ I believe that you could improve the performance of your parser by
110
+ just-in-time compiling of the action and goto tables, or some
111
+ equivalent thing.
112
+
113
+ You could, for example, calculate the FOLLOW set (The set of symbols
114
+ which can follow a valid right-hand side of a given rule). Then,
115
+ you just have to try those rules which have the current top-of-stack
116
+ in their FOLLOW set.
117
+
118
+ This would give a sort of an half-table-based LR(k) parser.
119
+
120
+ Anyway, thanks for your cool work, and for getting me interested in
121
+ parsers again ;-)
122
+
123
+ greetings, Florian Pflug
124
+
125
+
126
+ calc.reg:
127
+
128
+ require 'reg'
129
+
130
+ #warning: this code is untested
131
+ #currently, it will not work because it depends on
132
+ #features of reg which do not exist (backreferences
133
+ #and substitutions). in addition,
134
+ #it is likely to contain serious bugs, as it has
135
+ #not been thoroughly tested or assured in any way.
136
+ #nevertheless, it should give you a good idea of
137
+ #how this sort of thing works.
138
+
139
+
140
+ precedence={ #binding strength of each token; a larger number binds tighter (* and / above + and -)
141
+ :'('=>10, :p=>10,
142
+ :* =>9, :/ =>9,
143
+ :+ =>8, :- =>8,
144
+ :'='=>7,
145
+ :';'=>6
146
+ }
147
+ name=String.reg #presumably a reg matcher for any String token -- reg is not shown here
148
+ exp=name|PrintExp|OpExp|AssignExp|Number #definitions of the expression classes omitted for brevity
149
+ leftop=/^[*\/;+-]$/ #the left-associative operators; note that ';' is among them
150
+ rightop=/^=$/ #the only right-associative operator
151
+ op=leftop|rightop
152
+ def lowerop opname #matcher for a lookahead operator whose precedence is no higher than opname's
153
+ regproc{
154
+ leftop & proceq(Symbol) {|v| precedence[opname] >= precedence[v] } #NOTE(review): leftop and precedence are locals assigned outside this def, so they are not visible in here; also lowerop('=') passes a String while the table is keyed by Symbols -- confirm
155
+ }
156
+ end
157
+
158
+ #the reduction rules, combined as alternatives with |. last element is always lookahead: it sits outside the inner -[...] that .sub is applied to
159
+ Reduce=
160
+ -[ -[:p, '(', exp, ')'].sub {PrintExp.new BR[2]}, OB ] | # p(exp)
161
+ -[ -['(', exp, ')'] .sub {BR[1]}, OB ] | # (exp)
162
+ -[ -[exp, leftop, exp] .sub {OpExp.new *BR[0..2]}, regproc{lowerop(BR[1])} ] | # exp+exp
163
+ -[ exp, -[';'] .sub [], :EOI ] | #elide final trailing ;
164
+ -[ -[name, '=', exp] .sub {AssignExp.new BR[0],BR[2]}, lowerop('=') ] #name=exp
165
+
166
+ #keep applying Reduce to the stack until it no longer matches. last element of stack is always lookahead
167
+ def reduceloop(stack)
168
+ old_stack=stack
169
+ while stack.match +[OBS, Reduce] #the loop body is empty; all the work happens in the match
170
+ end
171
+ stack.equal? old_stack or raise 'error' #NOTE(review): stack is never reassigned, so this identity check always holds and the raise cannot fire -- confirm what was meant to be detected
172
+ end
173
+
174
+ #shift/reduce driver: returns the first stack element once the input is used up and the stack matches +[OB,:EOI]. last element of stack is always lookahead
175
+ def parse(input)
176
+ input<<:EOI #appends the end-of-input marker; note that the caller's array is modified here and by the shifts below
177
+ stack=[input.shift]
178
+ until input.empty? and +[OB,:EOI]===stack
179
+ stack.push input.shift #shift
180
+ reduceloop stack
181
+ end
182
+ return stack.first
183
+ end
184
+
185
+
186
+
187
+
188
+