peggy 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,86 @@
1
module Peggy
  # A node of the syntax tree. Children form a singly linked list: +first+ is
  # the first child and every child reaches its following sibling through
  # +next+. +from+ and +to+ delimit the matched text as a half-open range.
  class Node
    attr_accessor :name, :first, :next, :parent, :from, :to

    # name:: label of the node, normally the production symbol. Optional so a
    #        bare container node can be created.
    def initialize name = nil
      self.name = name
    end

    # Append +child+ as the last child of this node and adopt it.
    # Returns the appended child.
    def << child
      child.parent = self
      if first
        last.next = child
      else
        # Must go through the writer: a bare `first = child` would only
        # create a local variable and the child would never be linked.
        self.first = child
      end
      child
    end

    # Yield each direct child in insertion order. Without a block an
    # Enumerator over the children is returned.
    def each
      return enum_for(:each) unless block_given?
      child = first
      while child
        yield child
        child = child.next
      end
    end

    # Render this node and all its descendants, one node per line, indenting
    # two spaces per level. +tabs+ is the indentation of this node and is not
    # modified.
    def format tabs = ''
      result = "#{tabs}#{self}\n"
      deeper = "#{tabs}  "
      each { |child| result << child.format(deeper) }
      result
    end

    # The last direct child, or nil when the node has no children.
    def last
      node = first
      node = node.next while node && node.next
      node
    end

    # With +source+ given, the matched text; otherwise "name[from...to]".
    def to_s source = nil
      source ? source[from...to] : "#{name}[#{from}...#{to}]"
    end
  end

  # Tree view over the memoized results of a parse. The results are expected
  # to be a hash of start index => { production => end index, ... } where the
  # :found_order entry lists the productions that matched at that index in the
  # order they completed (innermost first).
  #
  # NOTE(review): the results also contain matches of branches the parser
  # later backtracked out of, so the tree is a best-effort reconstruction:
  # every recorded match that nests properly is shown.
  class AST

    attr_reader :root

    # results:: parse results hash; when nil only an empty root is created.
    # index::   position at which to start looking for the top-level match.
    def initialize results = nil, index = 0
      @root = Node.new :root
      build_one results, index, @root if results
    end

    # Build a tree from +parser+, which is either an object responding to
    # +parse_results+ or the results hash itself.
    def self.build parser, index = 0
      results = parser.respond_to?(:parse_results) ? parser.parse_results : parser
      new results, index
    end

    # Indented, one-node-per-line rendering of the whole tree.
    def to_s
      @root.format ''
    end

    private

    # Attach to +parent+ the productions matched at the first position at or
    # after +index+ that recorded a non-empty match. The search is bounded by
    # the largest recorded position so it always terminates.
    def build_one results, index, parent
      stop = results.keys.grep(Integer).max
      return unless stop
      index += 1 until index > stop || !chain_at(results, index, nil).empty?
      build_chain results, index, parent, nil unless index > stop
    end

    # Fill +node+ with the matches found between +index+ and the node's end.
    def build_rest results, node, index
      index = build_chain(results, index, node, node.to) while index < node.to
    end

    # Create the nested nodes for the matches starting at +index+ (outermost
    # first) below +parent+, then fill each of them with what follows its
    # inner node. Returns the index at which scanning should continue.
    def build_chain results, index, parent, limit
      chain = chain_at results, index, limit
      return index + 1 if chain.empty?
      nodes = chain.map do |name, stop|
        node = Node.new name
        node.from = index
        node.to = stop
        parent << node
        parent = node
      end
      resume = index + 1 # the innermost node may still contain later matches
      nodes.reverse_each do |node|
        build_rest results, node, resume
        resume = node.to
      end
      nodes.first.to
    end

    # [name, end index] pairs of the productions matched at +index+, outermost
    # first. Empty matches, matches reaching beyond +limit+ and matches that
    # do not nest inside the previous one are dropped.
    def chain_at results, index, limit
      # key? avoids triggering a default block that would create the row.
      row = results.key?(index) ? results[index] : nil
      order = row && row[:found_order]
      return [] unless order
      chain = []
      order.reverse_each do |name|
        stop = row[name]
        next unless stop.is_a?(Integer) && stop > index
        next if limit && stop > limit
        next if chain.last && stop > chain.last[1]
        chain << [name, stop]
      end
      chain
    end
  end
end # Peggy
@@ -0,0 +1,359 @@
1
+ require 'rubygems'
2
+ require 'parser'
3
+ # require File.join(File.dirname(__FILE__), 'parser')
4
+
5
+ module Peggy
6
+
7
+ # Base syntax element class.
8
class Element
  # Factory method: passes every argument straight to the constructor.
  def self.build(*args)
    new(*args)
  end

  # Test whether this element matches at +index+. Subclasses return the
  # index following the match, or NO_MATCH when there is none. The base
  # implementation always raises.
  def match(parser, index)
    raise RuntimeError, "Must override match"
  end

  # Tracing hook: hands +index+ back unchanged.
  def report(index)
    # puts "#{to_s} #{index}"
    index
  end
end
25
+
26
+ # An element that matches a sequence of elements. All must match for the sequence to match.
27
class Sequence < Element
  # Append a child element; the child list is created on first use.
  # Returns the list of children.
  def add(element)
    @list ||= []
    @list << element
  end

  # Operator form of add(element).
  alias_method :<<, :add

  # The child stored at +index+.
  def [](index)
    @list[index]
  end

  # Iterate over the children in the order they were added.
  def each(&blk)
    @list.each(&blk)
  end

  # Match every child in turn, each one starting where the previous one
  # stopped. Returns NO_MATCH as soon as one child fails, otherwise the end
  # index of the last child. Raises when no child was ever added.
  def match(parser, index)
    raise "no children added to sequence" unless @list
    cursor = index
    each do |element|
      cursor = element.match(parser, cursor)
      return NO_MATCH unless cursor
    end
    report(cursor)
  end
end
58
+
59
+ # An element which matches any one of its children. The children are tested in order. The first
60
+ # to match wins.
61
class Alternatives < Sequence
  # Try the children in order, all from the same +index+. The end index of
  # the first child that matches is returned; NO_MATCH when none does.
  # Raises when no child was ever added.
  def match(parser, index)
    raise "no children added to alternate" unless @list
    each do |candidate|
      stop = candidate.match(parser, index)
      next unless stop
      return report(stop)
    end
    report(NO_MATCH)
  end
end
73
+
74
+ # An element which tries its single child multiple times. It is greedy, meaning it will continue
75
+ # to match as long as possible, unless the range specifies a maximum number of matches.
76
class Multiple < Element
  # A big number
  MANY = 32767
  # The minimum and maximum number of tries
  attr_accessor :range
  # The single child
  attr_accessor :child

  # Init the range
  def initialize range
    @range = range
  end

  # synonym for child=(element)
  alias :'<<' :'child='

  # Matches the child repeatedly, greedily, up to the largest count the range
  # allows. If the number of matches is not inside the range NO_MATCH is
  # returned, otherwise the end index of the last successful match.
  # Raises when the child or the range has not been set.
  def match parser, index
    raise "multiple element child not set" unless child
    raise "multiple element range not set" unless range
    # For an exclusive range (a...b) the largest admissible count is b - 1;
    # stopping at b would over-consume and then fail the range test below.
    limit = range.exclude_end? ? range.last - 1 : range.last
    count = 0
    while count < limit
      found = child.match parser, index
      break unless found
      index = found
      count += 1
    end
    report(range === count ? index : NO_MATCH)
  end
end
108
+
109
+ # Matcher of 0 or more times.
110
class AnyNumber < Multiple
  # Fixes the repetition range to zero up to MANY matches.
  def initialize
    super(0..MANY)
  end
end
115
+
116
+ # Matcher of 1 or more times.
117
class AtLeastOne < Multiple
  # Fixes the repetition range to one up to MANY matches.
  def initialize
    super(1..MANY)
  end
end
122
+
123
+ # Matcher of 0 or 1 time.
124
class Optional < Multiple
  # Fixes the repetition range to zero or one match.
  def initialize
    super(0..1)
  end
end
129
+
130
+ # An element which tries its single child but does not advance the index if found.
131
+ # If not found, however, it returns NO_MATCH. Used for a positive semantic predicate.
132
class Positive < Element
  # The single child
  attr_accessor :child

  # Operator form of child=(element).
  alias_method :<<, :child=

  # Look ahead with the child without consuming input: the original +index+
  # is returned when the child matches, NO_MATCH when it does not.
  # Raises when the child has not been set.
  def match(parser, index)
    raise "positive element child not set" unless child
    child.match(parser, index) ? index : NO_MATCH
  end
end
147
+
148
+ # An element which tries its single child but does not advance the index if not found.
149
+ # If found, however, it returns NO_MATCH. Used for a negative semantic predicate.
150
class Negative < Positive
  # Inverse look ahead: NO_MATCH when the child matches, the original
  # +index+ when it does not. Raises when the child has not been set.
  def match(parser, index)
    raise "negative element child not set" unless child
    return NO_MATCH if child.match(parser, index)
    index
  end
end
157
+
158
+ # Match another production in the grammar.
159
class Reference < Element
  # The name of the production to lookup and match.
  attr_reader :name

  # Init the name. The name may be omitted and assigned later with name=.
  def initialize name = nil
    self.name = name
  end

  # Set the name of production to match. The value is stored as a symbol;
  # nil is kept as nil so an unnamed reference can be created (nil has no
  # to_sym, which made the default constructor argument unusable).
  def name= value
    @name = value.nil? ? nil : value.to_sym
  end

  # Match the entire production from the parser grammar. If it matches
  # the end index is returned. If not, NO_MATCH is returned.
  # Raises when the name has not been set.
  def match parser, index
    raise "reference name not set" unless name
    parser.match? name, index
  end

  # The production name as a String (empty when no name is set).
  def to_s
    name.to_s
  end
end
184
+
185
+ # Matcher of a grammar production. The one and only child defines the production.
186
class Production < Reference
  # The production definition.
  attr_accessor :child

  # Store the production name (through the superclass) and its definition.
  def initialize(name = nil, child = nil)
    super(name)
    @child = child
  end

  # Operator form of child=(element).
  alias_method :<<, :child=

  # Run the definition once at +index+. Returns the end index reported by
  # the child, or NO_MATCH. Raises when the name or the child is missing.
  def match(parser, index)
    raise "production name not set" unless name
    raise "production child not set" unless child
    outcome = @child.match(parser, index)
    report(outcome)
  end
end
207
+
208
+ # Matcher of a literal string or regular expression.
209
class Literal < Element
  # Value to match.
  attr_reader :value

  # Init the value. The value is stored exactly as given.
  def initialize value = nil
    @value = value
  end

  # Set the value to match. Regular expressions are anchored so they only
  # match at the beginning of the text they are applied to.
  def value= literal
    literal = correct_regexp literal if literal.is_a? Regexp
    @value = literal
  end

  # Match the literal value. If it matches the end index is returned.
  # If not, NO_MATCH is returned.
  def match parser, index
    report parser.literal?(value, index)
  end

  # Source-like representation of the value.
  def to_s
    value.inspect
  end

  private

  # Return +re+ unchanged when it already starts with \A, otherwise a new
  # regular expression with the same options wrapped as \A(...).
  def correct_regexp re
    source = re.source
    source[0..1] == '\\A' ? re : Regexp.new("\\A(#{source})", re.options)
  end
end
235
+
236
+ # Parser builder. The built in methods create syntax elements. Any other
237
+ # method called on this object create references to production, or actual
238
+ # productions, if called at the top level.
239
+ # Todo: Change to a class and separate from Parser.
240
class Builder < Parser
  # Productions to build
  attr_reader :productions
  # Current parent being built
  attr_reader :parent

  # A new builder starts in building mode; parse? switches it to parsing.
  def initialize
    @building = true
  end

  # Reference a production by its name index.
  def [] index
    productions[index]
  end

  # While building: inside a production an unknown method adds a Reference
  # to the production of that name; at the top level, with a block, it
  # defines a new Production whose content is described by the block.
  # While parsing: an unknown method is looked up among the productions and
  # matched at the index given as first argument.
  # Anything else falls through to the default method_missing.
  def method_missing name, *args
    if @building
      if @parent
        @parent << Reference.new(name)
      elsif block_given?
        @productions ||= {}
        prod = Production.new name
        @parent = prod
        begin
          yield
        ensure
          # Leave the builder usable even when the block raises.
          @parent = nil
        end
        @productions[name] = prod
      else
        super
      end
    else
      prod = @productions && @productions[name]
      super unless prod
      # puts "matching #{name} at #{args.first}"
      prod.match self, args.first
    end
  end

  # Only productions that can actually be matched are advertised; while
  # building every name is accepted by method_missing, so claiming to
  # respond to everything would confuse implicit conversions.
  def respond_to_missing? name, include_private = false
    (!@building && @productions && @productions.key?(name)) || super
  end

  # Add an Alternatives element to the parent.
  def one &blk
    build_piece Alternatives, blk
  end
  # Synonym for one().
  alias :alt :one

  # With one argument this is the parser's end-of-source test; otherwise it
  # is handled like any other production name.
  def eof *args
    return super(args.first) if args.length == 1
    method_missing :eof, *args
  end

  # Add a Sequence element to the parent.
  def each &blk
    build_piece Sequence, blk
  end
  # Synonym for each()
  alias :seq :each

  # Add a Literal element to the parent. Several values are added as
  # alternatives of one another.
  def lit *values
    if values.size == 1
      build_piece Literal, nil, values.first
    else
      one { values.each { |v| build_piece Literal, nil, v } }
    end
  end

  # Add an AnyNumber element to the parent.
  def many &blk
    build_piece AnyNumber, blk
  end

  # Add an Optional element to the parent.
  def opt &blk
    build_piece Optional, blk
  end

  # Add an AtLeastOne element to the parent.
  def some &blk
    build_piece AtLeastOne, blk
  end

  # Add a Negative (look ahead must fail) element to the parent.
  def neg &blk
    build_piece Negative, blk
  end

  # Add a Positive (look ahead must succeed) element to the parent.
  def pos &blk
    build_piece Positive, blk
  end

  # Leave building mode and parse +goal+ starting at +index+. The
  # superclass takes (goal, source, index), so the index has to be passed in
  # third position. A non-Integer second argument is forwarded as the
  # source text instead.
  def parse? goal, index = 0
    @building = nil
    if index.is_a?(Integer)
      super(goal, nil, index)
    else
      super(goal, index)
    end
  end

  private

  # Add an object of klass to the parent and run its block with the new
  # object as current parent. If value is specified it is passed to the
  # klass constructor. Raises when called outside of a production.
  def build_piece klass, blk = nil, value = nil
    # puts "building #{klass.name} with #{value.inspect}"
    raise "#{klass.name} must be declared inside a production" unless @parent
    elem = value ? klass.new(value) : klass.new
    @parent << elem
    if blk
      outer = @parent
      @parent = elem
      begin
        blk.call
      ensure
        @parent = outer
      end
      outer
    end
  end

end # Builder
358
+
359
+ end # Peggy
@@ -0,0 +1,203 @@
1
+ require 'pp'
2
+
3
+ # Peggy is a packrat parsing engine. Packrat parsers memoize every production so that
4
+ # parses can happen in linear time. No production needs to be processed more than once for
5
+ # a given position of the source. See http://pdos.csail.mit.edu/~baford/packrat/ for
6
+ # more details.
7
+ #
8
+ # Peggy also incorporates Parsing Expression Grammar (PEG) as proposed by Bryan Ford,
9
+ # as one of several input grammars. PEG is a formalized grammar specification needing
10
+ # no separate lexer/scanner step. See http://pdos.csail.mit.edu/~baford/packrat/popl04/
11
+ #
12
+ # As good as packrat parsers are, they have a few limitations. They cannot handle left
13
+ # recursion of a production, meaning a production cannot reference itself as the first
14
+ # element in a sequence. Also memoizing of production results means that memory consumption
15
+ # increases with the size of the source being parsed. This is not usually a concern, except
16
+ # when attempting to parse multi-megabyte source files, such as a huge XML database.
17
module Peggy

  # Returned when a production did not match
  NO_MATCH = false
  # Used to prevent infinite (left) recursions
  IN_USE = true

  # Packrat parser class. Note all methods have a trailing exclamation (!) or question
  # mark (?), or have long names with underscores (_). This is because productions are
  # methods and we need to avoid name collisions. To use this class you must subclass
  # Parser and provide your productions as methods. Your productions must call match?
  # or one of the convenience routines to perform parsing. Productions must
  # never call another production directly, or results will not get memoized and you
  # will slow down your parse considerably, and possibly risk getting into an infinite
  # recursion (until the stack blows its top). Note, as a convenience in writing
  # productions, you can call any match? function multiple times, passing each returned
  # index, such as in a sequence, without checking the results of each production.
  class Parser

    # Tells parser to print intermediate results if set.
    attr_accessor :debug_flag

    # The source to parse, can be set prior to calling parse?().
    attr_accessor :source_text

    # The results of the parse. A hash (keys of indexes) of hashes (keys of production
    # symbols and values of end indexes).
    attr_reader :parse_results

    # The productions to ignore.
    attr_accessor :ignore_productions

    # Return a range (or character) of the source_text.
    def [] range
      raise "source_text not set" if source_text.nil?
      source_text[range]
    end

    # Invokes the parser on the given production goal. You should provide the source
    # here or you can set source_text prior to calling. If index is provided the parser
    # will ignore characters previous to it. Returns the end index of the goal or
    # NO_MATCH.
    def parse? goal, source = nil, index = 0
      # Builder#parse? forwards (goal, index); an Integer can never be a
      # source, so treat it as the start index.
      if source.is_a?(Integer)
        index = source
        source = nil
      end
      # Must go through the writer: a bare assignment only creates a local.
      self.source_text = source unless source.nil?
      # Hash of automatic hashes
      @parse_results = Hash.new { |h1, k1| h1[k1] = {} }
      @index = nil
      index = match? goal, index
      pp parse_results if debug_flag
      index
    end

    # Queries the parse results for a hierarchy of production matches: every argument
    # names a production that must lie inside a match of the previous argument. An
    # array of index ranges (start...end) of the last production is returned, or an
    # empty array if none are found. This can only be called after parse_results have
    # been set by a parse.
    def query? *args
      raise "You must first call parse!" unless parse_results
      index_results! unless @index
      found_list = nil
      args.each do |arg|
        # fetch does not trigger the default block, so unknown names add nothing.
        spans = @index.fetch(arg.to_sym, [])
        found_list =
          if found_list.nil?
            spans.dup
          else
            spans.select do |span|
              found_list.any? { |outer| outer.begin <= span.begin && span.end <= outer.end }
            end
          end
      end
      found_list || []
    end

    # Try to match a production from the given index. Returns the end index if found
    # or start index if not found.
    def allow? goal, index
      return NO_MATCH if index == NO_MATCH # allow users to not check results of a sequence
      found = match? goal, index
      found == NO_MATCH ? index : found
    end

    # Try to match a production from the given index then backtrack. Returns index if
    # found or NO_MATCH if not.
    def check? goal, index
      return NO_MATCH if index == NO_MATCH # allow users to not check results of a sequence
      found = match? goal, index
      found == NO_MATCH ? NO_MATCH : index
    end

    # Try not to match a production from the given index then backtrack. Returns index
    # if not found or NO_MATCH if found.
    def dissallow? goal, index
      return NO_MATCH if index == NO_MATCH # allow users to not check results of a sequence
      found = match? goal, index
      found == NO_MATCH ? index : NO_MATCH
    end
    # Correctly spelled synonym of dissallow?().
    alias_method :disallow?, :dissallow?

    # Special production that only matches the end of source_text. Note, this function
    # does not end in (?) or (!) because it is meant to be used as a normal production.
    def eof index
      return NO_MATCH if index == NO_MATCH # allow users to not check results of a sequence
      index >= source_text.length ? index : NO_MATCH
    end

    # Match a production from the given index. Returns the end index if found or NO_MATCH
    # if not found. Results are memoized per index and production. Raises when the
    # production is re-entered at the same index (left recursion).
    def match? goal, index
      return NO_MATCH if index == NO_MATCH # allow users to not check results of a sequence
      index = ignore? index unless @ignoring
      goal = goal.to_sym
      position = parse_results[index]
      found = position.fetch(goal) do
        # Marks the production as being evaluated so a left recursion is detected.
        position[goal] = IN_USE
        result = send goal, index
        if result
          (position[:found_order] ||= []) << goal
          @index = nil # the query index no longer reflects the results
        end
        position[goal] = result
      end
      # Checked before the debug output, which cannot slice the source with IN_USE.
      raise "Parser cannot handle infinite (left) recursions. Please rewrite usage of '#{goal}'." if found == IN_USE
      puts "found #{goal} at #{index}...#{found} #{source_text[index...found].inspect}" if found && debug_flag
      found
    end

    # Match tokens that should be ignored. Used by match?(). Returns end index if found
    # or start index if not found. Subclasses should override this method if they wish
    # to ignore other text, such as comments.
    def ignore? index
      return NO_MATCH if index == NO_MATCH # allow users to not check results of a sequence
      return index if @ignoring || ignore_productions.nil?
      @ignoring = true
      begin
        ignore_productions.each do |prod|
          index = allow? prod, index
        end
      ensure
        # Never leave the parser stuck in ignoring mode when a production raises.
        @ignoring = nil
      end
      index
    end

    # Match a literal string or regular expression from the given index. Returns
    # the end index if found or NO_MATCH if not found. Raises for any other kind
    # of value.
    def literal? value, index
      return NO_MATCH if index == NO_MATCH # allow users to not check results of a sequence
      case value
      when String
        string? value, index
      when Regexp
        regexp? value, index
      else
        raise "Unknown literal: #{value.inspect}"
      end
    end

    # Match a string from the given index. Returns the end index if found
    # or NO_MATCH if not found.
    def string? value, index
      return NO_MATCH if index == NO_MATCH # allow users to not check results of a sequence
      value = value.to_s
      index = ignore? index unless @ignoring
      i2 = index + value.length
      # puts source_text[index...i2].inspect + ' ' + value.inspect
      source_text[index...i2] == value ? i2 : NO_MATCH
    end

    # Match a regular expression from the given index. Returns the end index
    # if found or NO_MATCH if not found.
    def regexp? value, index
      return NO_MATCH if index == NO_MATCH # allow users to not check results of a sequence
      value = correct_regexp! value
      index = ignore? index unless @ignoring
      rest = source_text[index..-1] # nil when index lies beyond the source
      found = rest && value.match(rest)
      # puts "#{value.inspect} ~= #{found[0].inspect}" if found
      found ? found.end(0) + index : NO_MATCH
    end

    # Make sure regular expressions match the beginning of the string, actually
    # the string from the given index. A new expression is returned unless the
    # given one already starts with \A.
    def correct_regexp! re
      source = re.source
      source[0..1] == '\\A' ? re : Regexp.new("\\A(#{source})", re.options)
    end

    protected

    # Build the lookup used by query?(): a hash of production symbol => array of
    # index ranges (start...end), ordered by start index and, for the same start,
    # outermost production first.
    def index_results!
      raise "You must first call parse!" unless parse_results
      @index = Hash.new { |h, k| h[k] = [] }
      parse_results.keys.sort.each do |index|
        prod_map = parse_results[index]
        order = prod_map[:found_order]
        next unless order # nothing matched at this index
        order.reverse_each do |prod|
          @index[prod] << (index...prod_map[prod])
        end
      end
      @index
    end
  end # Parser

end # Peggy