peggy 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,86 @@
1
+ module Peggy
2
# A node of the parse tree. Children are kept as a singly linked list:
# +first+ points at the first child and each child's +next+ points at its
# following sibling. +from+ and +to+ hold the start and end offsets of the
# text the node covers.
class Node
  attr_accessor :name, :first, :next, :parent, :from, :to

  # Create a node. The name is optional so that anonymous nodes can be built.
  def initialize name = nil
    self.name = name
  end

  # Append +child+ as the last child of this node and set its parent.
  # Returns the appended child.
  def << child
    child.parent = self
    if first
      last.next = child
    else
      # Must go through the accessor: a bare `first = child` would only
      # create a local variable and the child would be lost.
      self.first = child
    end
    child
  end

  # Yield each direct child in order. Without a block an Enumerator is returned.
  def each
    return enum_for(:each) unless block_given?
    child = first
    while child
      yield child
      child = child.next
    end
  end

  # Render this node and all of its descendants, one node per line. +tabs+ is
  # the indentation prefix of this node; each nesting level adds two spaces.
  # The +tabs+ argument is not modified.
  def format tabs
    result = "#{tabs}#{self}\n"
    deeper = "#{tabs}  "
    each { |child| result << child.format(deeper) }
    result
  end

  # The last direct child, or nil when this node has no children.
  def last
    node = first
    node = node.next while node && node.next
    node
  end

  # With +source+ given, return the slice of it covered by this node;
  # otherwise return a "name[from...to]" description.
  def to_s source=nil
    source ? source[from...to] : "#{name}[#{from}...#{to}]"
  end
end
47
+
48
# Builds a tree of Node objects from a parser's memoized results.
#
# +results+ is expected to be a hash keyed by source index whose values are
# hashes mapping production names to end indexes (or false), plus a
# :found_order entry listing the productions that matched at that index in the
# order they completed (innermost first).
#
# NOTE(review): the memo table also records productions that matched inside an
# alternative that was later abandoned, so the tree is a best-effort nesting of
# every recorded match, not necessarily the single successful derivation.
class AST

  # The root node; always present and named :root.
  attr_reader :root

  # Build a tree from +results+, starting at source +index+. Called without
  # arguments it creates an empty tree holding only the root node.
  def initialize results = nil, index = 0
    @root = Node.new :root
    return unless results
    stops = results.values.flat_map { |row| row.values.grep(Integer) }
    @root.from = index
    @root.to = ([index] + stops).max
    @positions = results.keys.grep(Integer).sort
    build_one results, index, @root
  end

  # Build a tree from the parse_results of +parser+.
  def self::build parser, index = 0
    new parser.parse_results, index
  end

  # Indented, multi-line rendering of the whole tree.
  def to_s
    @root.format ''
  end

  private

  # Fill +parent+ with nodes for the matches recorded between +index+ and the
  # end of +parent+. Productions that matched at the same position are nested,
  # outermost first; each nested node is then filled with whatever follows its
  # inner node.
  def build_one results, index, parent
    while index < parent.to
      start = @positions.find do |key|
        key >= index && key < parent.to && !matches_at(results, key, parent.to).empty?
      end
      break unless start
      top = parent
      chain = matches_at(results, start, parent.to).map do |name, stop|
        node = Node.new name
        node.from = start
        node.to = stop
        top << node
        top = node
      end
      # Siblings that follow each nested node inside its enclosing node.
      chain.drop(1).each { |node| build_rest results, node }
      # Anything at +start+ is already in the chain, so the innermost node's
      # own children can only begin after it.
      build_one results, start + 1, chain.last
      index = chain.first.to
    end
  end

  # Continue building inside the parent of +previous+, after +previous+ ends.
  def build_rest results, previous
    build_one results, previous.to, previous.parent
  end

  # The [name, stop] pairs recorded at +start+, outermost first, restricted to
  # matches that consume input, end no later than +limit+ and nest properly.
  def matches_at results, start, limit
    row = results.fetch(start, nil) || {}
    order = row[:found_order] || []
    order.reverse.each_with_object([]) do |name, kept|
      stop = row[name]
      next unless stop.is_a?(Integer) && stop > start && stop <= limit
      next unless kept.empty? || stop <= kept.last.last
      kept << [name, stop]
    end
  end
end
86
+ end # Peggy
@@ -0,0 +1,359 @@
1
+ require 'rubygems'
2
+ require 'parser'
3
+ # require File.join(File.dirname(__FILE__), 'parser')
4
+
5
+ module Peggy
6
+
7
# Base class of every syntax element.
class Element
  # Factory method: forwards all arguments to the constructor.
  def self.build(*args)
    new(*args)
  end

  # Test whether this element matches at +index+. Subclasses return the index
  # following the match, or NO_MATCH; the base implementation always raises.
  def match(parser, index)
    raise "Must override match"
  end

  # Hook through which match results pass; returns +index+ unchanged.
  def report(index)
    # puts "#{to_s} #{index}"
    index
  end
end
25
+
26
# An element that matches a sequence of elements. All must match for the sequence to match.
class Sequence < Element
  # Add a child element. The child list is created on first use.
  def add element
    @list = [] unless @list
    @list << element
  end

  # Synonym for add(element)
  alias :'<<' :add

  # Reference a child by index. Returns nil when no child has been added yet.
  def [] index
    @list && @list[index]
  end

  # Child iterator. Iterates over nothing when no child has been added yet.
  def each &blk
    (@list || []).each(&blk)
  end

  # Match each child in sequence. If any fail this returns NO_MATCH. If all succeed this
  # returns the end index of the last. Raises if no child was ever added.
  def match parser, index
    raise "no children added to sequence" unless @list
    each do |element|
      # Each child starts where the previous one ended.
      index = element.match parser, index
      return NO_MATCH unless index
    end
    report index
  end
end
58
+
59
# An ordered choice between child elements: the children are tried in the
# order they were added and the first one that matches decides the result.
class Alternatives < Sequence
  # Try every child at the same +index+. Reports the end index of the first
  # child that matches, or NO_MATCH when none does. Raises if no child was
  # ever added.
  def match parser, index
    raise "no children added to alternate" unless @list
    winner = NO_MATCH
    each do |candidate|
      attempt = candidate.match(parser, index)
      next unless attempt
      winner = attempt
      break
    end
    report winner
  end
end
73
+
74
# An element which tries its single child multiple times. It is greedy, meaning it will continue
# to match as long as possible, unless the range specifies a maximum number of matches.
class Multiple < Element
  # A big number
  MANY = 32767
  # The minimum and maximum number of tries
  attr_accessor :range
  # The single child
  attr_accessor :child

  # Init the range
  def initialize range
    @range = range
  end

  # synonym for child=(element)
  alias :'<<' :'child='

  # Matches the child multiple times. The range specifies the least and most number of matches.
  # If the number of matches is less than the minimum of the range then NO_MATCH is returned.
  # If equal or more than the minimum then the end index of the last match is returned.
  def match parser, index
    raise "multiple element child not set" unless child
    raise "multiple element range not set" unless range
    # Highest count that is still inside the range (honours `a...b`).
    limit = range.last
    limit -= 1 if range.exclude_end?
    count = 0
    while count < limit
      found = child.match parser, index
      break unless found
      count += 1
      if found == index
        # The child matched without consuming anything, so every further try
        # would give the same answer: count it as matched up to the limit
        # instead of looping there.
        count = limit
        break
      end
      index = found
    end
    report range === count ? index : NO_MATCH
  end
end
108
+
109
# Repetition that accepts zero or more matches of its child.
class AnyNumber < Multiple
  # Fixes the range at 0 up to MANY tries.
  def initialize
    super(0..MANY)
  end
end
115
+
116
# Repetition that requires one or more matches of its child.
class AtLeastOne < Multiple
  # Fixes the range at 1 up to MANY tries.
  def initialize
    super(1..MANY)
  end
end
122
+
123
# Repetition that accepts its child at most once.
class Optional < Multiple
  # Fixes the range at zero or one try.
  def initialize
    super(0..1)
  end
end
129
+
130
# Positive predicate: succeeds when its single child matches, but never
# advances the index. When the child does not match the result is NO_MATCH.
class Positive < Element
  # The single child
  attr_accessor :child

  # synonym for child=(element)
  alias :'<<' :'child='

  # Try the child once at +index+. Returns the unchanged +index+ when the
  # child matches and NO_MATCH otherwise. Raises if no child is set.
  def match parser, index
    raise "positive element child not set" unless child
    return NO_MATCH unless child.match(parser, index)
    index
  end
end
147
+
148
# Negative predicate: succeeds, without advancing the index, only when its
# single child does not match. When the child matches the result is NO_MATCH.
class Negative < Positive
  # Try the child once at +index+. Returns the unchanged +index+ when the
  # child fails and NO_MATCH when it matches. Raises if no child is set.
  def match parser, index
    raise "negative element child not set" unless child
    return index unless child.match(parser, index)
    NO_MATCH
  end
end
157
+
158
# Match another production in the grammar.
class Reference < Element
  # The name of the production to lookup and match.
  attr_reader :name

  # Init the name. The name may be omitted and assigned later with name=.
  def initialize name=nil
    self.name = name
  end

  # Set the name of production to match. The value is stored as a Symbol;
  # nil clears the name.
  def name= value
    @name = value.nil? ? nil : value.to_sym
  end

  # Match the entire production from the parser grammar. If it matches
  # the end index is returned. If not, NO_MATCH is returned.
  # Raises if the name has not been set.
  def match parser, index
    raise "reference name not set" unless name
    parser.match? name, index
  end

  # The production name as a String (empty when no name is set).
  def to_s
    name.to_s
  end
end
184
+
185
# A named grammar production. Its one and only child element is the
# definition of the production.
class Production < Reference
  # The production definition.
  attr_accessor :child

  # Store the production name and, optionally, its definition.
  def initialize(name=nil, child=nil)
    super(name)
    self.child = child
  end

  # Synonym of child=(element)
  alias :'<<' :'child='

  # Match the definition once at +index+ and report the outcome: the end
  # index when it matches, NO_MATCH when it does not. Raises if the name or
  # the child is missing.
  def match(parser, index)
    raise "production name not set" unless name
    raise "production child not set" unless child
    outcome = child.match(parser, index)
    report outcome
  end
end
207
+
208
# Matcher of a literal string or regular expression.
class Literal < Element
  # Value to match.
  attr_reader :value

  # Init the value. Goes through value= so that a value given to the
  # constructor is normalized exactly like one assigned later.
  def initialize value=nil
    self.value = value
  end

  # Set the value to match.
  def value= literal
    # Make sure regular expressions check at the beginning of the string
    literal = correct_regexp literal if literal.is_a? Regexp
    @value = literal
  end

  # Match the literal value. If it matches the end index is returned.
  # If not, NO_MATCH is returned.
  def match parser, index
    report parser.literal?(value, index)
  end

  # The inspected value, e.g. "\"if\"" or "/\\A(\\d+)/".
  def to_s
    value.inspect
  end

  private

  # Return +re+ itself when it already starts with \A, otherwise a new Regexp
  # with the same options whose source is wrapped as \A(...).
  def correct_regexp re
    source = re.source
    source[0..1] == '\\A' ? re : Regexp.new("\\A(#{source})", re.options)
  end
end
235
+
236
# Parser builder. The built in methods create syntax elements. Any other
# method called on this object creates a reference to a production, or an
# actual production if called at the top level with a block.
# Todo: Change to a class and separate from Parser.
class Builder < Parser
  # Productions to build
  attr_reader :productions
  # Current parent being built
  attr_reader :parent

  # A new builder starts in building mode; parse? switches it to matching mode.
  def initialize
    @building = true
  end

  # Reference a production by its name index. Returns nil when no production
  # has been defined yet.
  def [] index
    productions && productions[index]
  end

  # While building: create a production if at the top level (a block is
  # required), or add a reference to a production if one is being built.
  # While parsing: match the production called +name+ at the index given as
  # the first argument.
  def method_missing name, *args
    if @building
      if @parent
        ref = Reference.new name
        @parent << ref
      elsif block_given?
        @productions = {} unless @productions
        prod = Production.new name
        @parent = prod
        begin
          yield
        ensure
          # Leave the builder usable even if the block raised.
          @parent = nil
        end
        @productions[name] = prod
      else
        super
      end
    else
      prod = @productions && @productions[name]
      return super unless prod
      # puts "matching #{name} at #{args.first}"
      prod.match self, args.first
    end
  end

  # Once parsing has started, report the defined productions as methods this
  # object responds to.
  def respond_to_missing? name, include_private = false
    (!@building && @productions && @productions.key?(name)) || super
  end

  # Add an Alternatives element to the parent.
  def one &blk
    build_piece Alternatives, blk
  end
  # Synonym for one().
  alias :alt :one

  # With one argument this is the parser's end-of-source production; with any
  # other argument count it is treated like an undefined method, which while
  # building adds a reference to the eof production.
  def eof *args
    if args.length == 1
      super(args.first)
    else
      method_missing :eof, *args
    end
  end

  # Add a Sequence element to the parent.
  def each &blk
    build_piece Sequence, blk
  end
  # Synonym for each()
  alias :seq :each

  # Add a Literal element to the parent. Several values are added as
  # alternatives of each other.
  def lit *values
    if values.size == 1
      build_piece Literal, nil, values.first
    else
      one{
        values.each { |v| build_piece Literal, nil, v }
      }
    end
  end

  # Add an AnyNumber element to the parent.
  def many &blk
    build_piece AnyNumber, blk
  end

  # Add an Optional element to the parent.
  def opt &blk
    build_piece Optional, blk
  end

  # Add an AtLeastOne element to the parent.
  def some &blk
    build_piece AtLeastOne, blk
  end

  # Add a Negative element to the parent.
  def neg &blk
    build_piece Negative, blk
  end

  # Add a Positive element to the parent.
  def pos &blk
    build_piece Positive, blk
  end

  # Leave building mode and parse +goal+ starting at +index+. The source must
  # have been assigned to source_text beforehand.
  def parse? goal, index=0
    @building = nil
    # Pass the arguments explicitly: a bare `super` would hand +index+ to the
    # parent's second parameter, which is the source text.
    super goal, nil, index
  end

  private

  # Add an object of klass to the parent and call its block with the new
  # object as the current parent. If value is specified it is passed to the
  # klass constructor.
  def build_piece klass, blk=nil, value=nil
    # puts "building #{klass.name} with #{value.inspect}"
    raise "#{klass.name} must be used inside a production" unless @parent
    elem = value ? klass.new(value) : klass.new
    @parent << elem
    if blk
      parent = @parent
      @parent = elem
      begin
        blk.call
      ensure
        @parent = parent
      end
    end
  end

end # Builder
358
+
359
+ end # Peggy
@@ -0,0 +1,203 @@
1
+ require 'pp'
2
+
3
+ # Peggy is a packrat parsing engine. Packrat parsers memoize every production so that
4
+ # parses can happen in linear time. No production needs to be processed more than once for
5
+ # a given position of the source. See http://pdos.csail.mit.edu/~baford/packrat/ for
6
+ # more details.
7
+ #
8
+ # Peggy also incorporates Parsing Expression Grammar (PEG) as proposed by Bryan Ford,
9
+ # as one of several input grammars. PEG is a formalized grammar specification needing
10
+ # no separate lexer/scanner step. See http://pdos.csail.mit.edu/~baford/packrat/popl04/
11
+ #
12
+ # As good as packrat parsers are, they have a few limitations. They cannot handle left
13
+ # recursion of a production, meaning a production cannot reference itself as the first
14
+ # element in a sequence. Also, memoizing production results means that memory consumption
15
+ # increases with the size of the source being parsed. This is not usually a concern, except
16
+ # when attempting to parse multi-megabyte source files, such as a huge XML database.
17
+ module Peggy
18
+
19
+ # Returned when a production did not match
20
+ NO_MATCH = false
21
+ # Used to prevent infinite (left) recursions
22
+ IN_USE = true
23
+
24
# Packrat parser class. Note all methods have a trailing exclamation (!) or question
# mark (?), or have long names with underscores (_). This is because productions are
# methods and we need to avoid name collisions. To use this class you must subclass
# Parser and provide your productions as methods. Your productions must call match?
# or one of the protected convenience routines to perform parsing. Productions must
# never call another production directly, or results will not get memoized and you
# will slow down your parse considerably, and possibly risk getting into an infinite
# recursion (until the stack blows its top). Note, as a convenience in writing
# productions, you can call any match? function multiple times, passing each returned
# index, such as in a sequence, without checking the results of each production.
class Parser

  # Tells parser to print intermediate results if set.
  attr_accessor :debug_flag

  # The source to parse, can be set prior to calling parse?().
  attr_accessor :source_text

  # The results of the parse. A hash (keys of indexes) of hashes (keys of production
  # symbols and values of end indexes).
  attr_reader :parse_results

  # The productions to ignore.
  attr_accessor :ignore_productions

  # Return a range (or character) of the source_text.
  def [] range
    raise "source_text not set" if source_text.nil?
    source_text[range]
  end

  # Invokes the parser on the given production goal. You should provide the
  # source here or set source_text prior to calling. If index is provided the
  # parser will ignore characters previous to it. An Integer in the source
  # position is taken as the index, so the two-argument form (goal, index)
  # works too. Returns the end index of the goal, or NO_MATCH.
  def parse? goal, source = nil, index = 0
    if source.is_a?(Integer)
      index = source
      source = nil
    end
    # Use the writer: a bare `source_text = ...` would only set a local.
    self.source_text = source unless source.nil?
    # Hash of automatic hashes
    @parse_results = Hash.new {|h1, k1| h1[k1] = {}}
    @keys = nil
    @index = nil
    index = match? goal, index
    pp parse_results if debug_flag
    index
  end

  # Queries the parse results for a hierarchy of production matches. The first
  # argument selects every match of that production; each further argument
  # selects the matches of that production lying inside one of the ranges
  # selected so far. An array of index ranges (start...end) is returned, or an
  # empty array if none are found. This can only be called after parse_results
  # have been set by a parse.
  def query? *args
    raise "You must first call parse!" unless parse_results
    index_results! unless @index
    ranges = nil
    args.each do |arg|
      candidates = @index.fetch(arg.to_sym, [])
      ranges =
        if ranges.nil?
          candidates
        else
          candidates.select do |inner|
            ranges.any? { |outer| outer.begin <= inner.begin && inner.end <= outer.end }
          end
        end
    end
    ranges || []
  end

  # Try to match a production from the given index. Returns the end index if found
  # or start index if not found.
  def allow? goal, index
    return NO_MATCH if index == NO_MATCH # allow users to not check results of a sequence
    found = match? goal, index
    found == NO_MATCH ? index : found
  end

  # Try to match a production from the given index then backtrack. Returns index if
  # found or NO_MATCH if not.
  def check? goal, index
    return NO_MATCH if index == NO_MATCH # allow users to not check results of a sequence
    found = match? goal, index
    found == NO_MATCH ? NO_MATCH : index
  end

  # Try not to match a production from the given index then backtrack. Returns index
  # if not found or NO_MATCH if found.
  def dissallow? goal, index
    return NO_MATCH if index == NO_MATCH # allow users to not check results of a sequence
    found = match? goal, index
    found == NO_MATCH ? index : NO_MATCH
  end

  # Special production that only matches the end of source_text. Note, this function
  # does not end in (?) or (!) because it is meant to be used as a normal production.
  def eof index
    return NO_MATCH if index == NO_MATCH # allow users to not check results of a sequence
    index >= source_text.length ? index : NO_MATCH
  end

  # Match a production from the given index. Returns the end index if found or NO_MATCH
  # if not found. Results are memoized per index and production, and successful
  # productions are also appended to the :found_order list of that index.
  def match? goal, index
    return NO_MATCH if index == NO_MATCH # allow users to not check results of a sequence
    index = ignore? index unless @ignoring
    goal = goal.to_sym
    position = parse_results[index]
    found = position.fetch(goal) do
      position[goal] = IN_USE # used to prevent infinite recursion in case user attempts
                              # a left recursion
      if (result = send goal, index)
        (position[:found_order] ||= []) << goal
      end
      position[goal] = result
    end
    # Check for recursion before the debug output: IN_USE is not an index and
    # cannot be used to slice the source.
    raise "Parser cannot handle infinite (left) recursions. Please rewrite usage of '#{goal}'." if found == IN_USE
    puts "found #{goal} at #{index}...#{found} #{source_text[index...found].inspect}" if found && debug_flag
    found
  end

  # Match tokens that should be ignored. Used by match?(). Returns end index if found
  # or start index if not found. Subclasses should override this method if they wish
  # to ignore other text, such as comments.
  def ignore? index
    return NO_MATCH if index == NO_MATCH # allow users to not check results of a sequence
    return index if @ignoring || ignore_productions.nil?
    @ignoring = true
    begin
      ignore_productions.each do |prod|
        index = allow? prod, index
      end
    ensure
      # Always leave ignoring mode, even when a production raises.
      @ignoring = nil
    end
    index
  end

  # Match a literal string or regular expression from the given index. Returns
  # the end index if found or NO_MATCH if not found. Raises for any other kind
  # of value.
  def literal? value, index
    return NO_MATCH if index == NO_MATCH # allow users to not check results of a sequence
    case value
    when String
      string? value, index
    when Regexp
      regexp? value, index
    else
      raise "Unknown literal: #{value.inspect}"
    end
  end

  # Match a string from the given index. Returns the end index if found
  # or NO_MATCH if not found.
  def string? value, index
    return NO_MATCH if index == NO_MATCH # allow users to not check results of a sequence
    value = value.to_s
    index = ignore? index unless @ignoring
    i2 = index + value.length
    # puts source_text[index...i2].inspect + ' ' + value.inspect
    source_text[index...i2] == value ? i2 : NO_MATCH
  end

  # Match a regular expression from the given index. Returns the end index
  # if found or NO_MATCH if not found.
  def regexp? value, index
    return NO_MATCH if index == NO_MATCH # allow users to not check results of a sequence
    # Anchor each distinct expression once instead of on every call.
    @anchored_regexps ||= {}
    value = (@anchored_regexps[value] ||= correct_regexp!(value))
    index = ignore? index unless @ignoring
    rest = source_text[index..-1]
    return NO_MATCH if rest.nil? # index lies beyond the end of the source
    found = value.match rest
    # puts "#{value.inspect} ~= #{found[0].inspect}" if found
    found ? found.end(0) + index : NO_MATCH
  end

  # Make sure regular expressions match the beginning of the string, actually from
  # the string from the given index. An expression already starting with \A is
  # returned as is.
  def correct_regexp! re
    source = re.source
    source[0..1] == '\\A' ? re : Regexp.new("\\A(#{source})", re.options)
  end

protected

  # Build @index, a hash from production symbol to the ranges (start...end) at
  # which that production matched, ordered by start index.
  def index_results!
    raise "You must first call parse!" unless parse_results
    @keys = parse_results.keys.sort
    @index = Hash.new {|h, k| h[k] = []}
    @keys.each do |start|
      parse_results[start].each_pair do |prod, stop|
        # Skips failed matches (false) and the :found_order list.
        @index[prod] << (start...stop) if stop.is_a?(Integer)
      end
    end
    @index
  end
end # Parser
202
+
203
+ end # Peggy