rltk 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/AUTHORS +1 -0
- data/LICENSE +27 -0
- data/README +386 -0
- data/Rakefile +67 -0
- data/lib/rltk/ast.rb +264 -0
- data/lib/rltk/cfg.rb +491 -0
- data/lib/rltk/lexer.rb +298 -0
- data/lib/rltk/lexers/calculator.rb +41 -0
- data/lib/rltk/lexers/ebnf.rb +40 -0
- data/lib/rltk/parser.rb +1354 -0
- data/lib/rltk/parsers/infix_calc.rb +43 -0
- data/lib/rltk/parsers/postfix_calc.rb +34 -0
- data/lib/rltk/parsers/prefix_calc.rb +34 -0
- data/lib/rltk/token.rb +66 -0
- data/test/tc_ast.rb +85 -0
- data/test/tc_cfg.rb +149 -0
- data/test/tc_lexer.rb +217 -0
- data/test/tc_parser.rb +275 -0
- data/test/tc_token.rb +34 -0
- metadata +87 -0
data/lib/rltk/cfg.rb
ADDED
@@ -0,0 +1,491 @@
|
|
1
|
+
# Author: Chris Wailes <chris.wailes@gmail.com>
|
2
|
+
# Project: Ruby Language Toolkit
|
3
|
+
# Date: 2011/03/24
|
4
|
+
# Description: This file contains a class representing a context-free
|
5
|
+
# grammar.
|
6
|
+
|
7
|
+
# encoding: utf-8
|
8
|
+
|
9
|
+
############
|
10
|
+
# Requires #
|
11
|
+
############
|
12
|
+
|
13
|
+
# Ruby Language Toolkit
|
14
|
+
require 'rltk/lexers/ebnf'
|
15
|
+
|
16
|
+
#######################
|
17
|
+
# Classes and Modules #
|
18
|
+
#######################
|
19
|
+
|
20
|
+
module RLTK # :nodoc:
|
21
|
+
|
22
|
+
# An exception class that represents a problem with a context-free
# grammar's definition.
#
# It derives from StandardError (not Exception) so that an ordinary
# +rescue+ clause, which only catches StandardError and its descendants,
# is able to handle it.
class GrammarError < StandardError; end
|
25
|
+
|
26
|
+
# The CFG class is used to represent context-free grammars. It is used by
|
27
|
+
# the RLTK::Parser class to represent the parser's grammar, but can also be
|
28
|
+
# used to manipulate arbitrary CFGs.
|
29
|
+
class CFG
|
30
|
+
|
31
|
+
# The start symbol for the grammar.
|
32
|
+
attr_reader :start_symbol
|
33
|
+
|
34
|
+
# The current left-hand side symbol. This is used by the
|
35
|
+
# CFG.production method to wrap CFG.clause calls.
|
36
|
+
attr_accessor :curr_lhs
|
37
|
+
|
38
|
+
#################
|
39
|
+
# Class Methods #
|
40
|
+
#################
|
41
|
+
|
42
|
+
# Reports whether _sym_ names a terminal as far as the CFG class is
# concerned: its string form must be unchanged by upcasing. A +nil+ or
# +false+ argument is handed straight back.
def self.is_terminal?(sym)
  return sym unless sym

  name = sym.to_s
  name == name.upcase
end
|
47
|
+
|
48
|
+
# Reports whether _sym_ names a non-terminal as far as the CFG class is
# concerned: its string form must be unchanged by downcasing. A +nil+ or
# +false+ argument is handed straight back.
def self.is_nonterminal?(sym)
  return sym unless sym

  name = sym.to_s
  name == name.downcase
end
|
53
|
+
|
54
|
+
####################
|
55
|
+
# Instance Methods #
|
56
|
+
####################
|
57
|
+
|
58
|
+
# Builds an empty grammar. The optional block is stored as the EBNF
# callback; when no block is given a do-nothing proc is stored instead.
def initialize(&callback)
  # No production is being defined yet and no start symbol is known.
  @curr_lhs = @start_symbol = @wrapper_symbol = nil

  @callback = callback || Proc.new {}
  @lexer = Lexers::EBNF.new

  # Production bookkeeping.
  @production_counter = -1
  @productions_id = {}
  @productions_sym = Hash.new { |hash, lhs| hash[lhs] = [] }
  @production_buffer = []

  # Symbol tables; looking up an unknown symbol yields false. :EOS is
  # registered as a terminal from the outset.
  @terms = Hash.new(false)
  @terms[:EOS] = true
  @nonterms = Hash.new(false)

  # Caches for the first and follow sets.
  @firsts = {}
  @follows = Hash.new { |hash, sym| hash[sym] = [] }
end
|
79
|
+
|
80
|
+
# Registers _production_ in both lookup tables: by its ID and under its
# left-hand side symbol. Returns the list of productions that share the
# left-hand side.
def add_production(production)
  @productions_id[production.id] = production
  @productions_sym[production.lhs] << production
end
|
84
|
+
|
85
|
+
# Replaces the EBNF callback with the given block. Calling this without
# a block installs a do-nothing proc.
def callback(&callback)
  @callback = callback.nil? ? Proc.new {} : callback
end
|
89
|
+
|
90
|
+
# Defines one production whose left-hand side is the symbol of the
# enclosing CFG.production call; it MUST be called from inside such a
# block. _expression_ is lexed into tokens, and every symbol followed by
# an EBNF operator (?, * or +) is replaced by the symbol returned from
# get_question, get_star or get_plus respectively.
#
# Raises GrammarError when no left-hand side is currently set. Returns
# the new production.
def clause(expression)
  unless @curr_lhs
    raise GrammarError, 'CFG.clause called outside of CFG.production block.'
  end

  lhs = @curr_lhs.to_sym
  tokens = @lexer.lex(expression)

  # The first left-hand side ever seen becomes the start symbol unless
  # one has been chosen already.
  @start_symbol ||= lhs

  rhs = []
  tokens.each_with_index do |token, idx|
    kind = token.type

    # Operator tokens (and anything else) are consumed via lookahead.
    next unless kind == :TERM or kind == :NONTERM

    name = token.value

    # Record the symbol in the matching collection.
    (kind == :TERM ? @terms : @nonterms)[name] = true

    operator = (idx + 1 < tokens.length) ? tokens[idx + 1].type : nil

    rhs << case operator
           when :'?' then get_question(name)
           when :*   then get_star(name)
           when :+   then get_plus(name)
           else name
           end
  end

  # Build the production and remember it for CFG.production's result.
  production = Production.new(next_id, lhs, rhs)
  @production_buffer << production

  # The left-hand side is a non-terminal by definition.
  @nonterms[lhs] = true

  add_production(production)

  return production
end
|
152
|
+
|
153
|
+
# Returns the _first_ set for _sentence_. _Sentence_ may be either a
# single symbol or an array of symbols.
#
# A single symbol is delegated to first_set_prime. For an array, +nil+ is
# returned unless every element is a symbol of this grammar; otherwise
# the result is the union of the elements' first sets, stopping at the
# first element that cannot derive the empty string. :'ɛ' is included
# only when every element can derive the empty string (which is
# trivially the case for an empty array).
def first_set(sentence)
  return first_set_prime(sentence) if sentence.is_a?(Symbol)

  # Build the symbol list once instead of once per element.
  known = symbols
  return nil unless sentence.all? { |sym| known.include?(sym) }

  firsts = []
  all_have_empty = true

  sentence.each do |sym|
    sym_firsts = first_set(sym)
    firsts |= sym_firsts - [:'ɛ']

    # Later symbols only matter while everything so far can vanish.
    break unless (all_have_empty = sym_firsts.include?(:'ɛ'))
  end

  all_have_empty ? firsts + [:'ɛ'] : firsts
end
|
174
|
+
|
175
|
+
# Computes the _first_ set of the single symbol _sym0_; CFG.first_set
# wraps this to handle whole sentences. _seen_lh_sides_ lists symbols
# already being expanded so that recursive rules terminate. Results are
# cached in @firsts. Returns +nil+ for a symbol the grammar does not
# know.
def first_set_prime(sym0, seen_lh_sides = [])
  return nil unless symbols.include?(sym0)

  # Reuse the cached answer when there is one.
  @firsts[sym0] ||=
    if CFG::is_terminal?(sym0)
      # A terminal is the only member of its own first set.
      [sym0]
    else
      collected = []

      @productions_sym[sym0].each do |production|
        if production.rhs.empty?
          # An empty production contributes the empty string.
          collected << :'ɛ'
        else
          all_have_empty = true

          production.rhs.each do |sym1|
            sym1_firsts = []

            # Expand the symbol only if it is not already being
            # expanded further up the call chain.
            unless seen_lh_sides.include?(sym1)
              sym1_firsts = first_set_prime(sym1, seen_lh_sides << sym1)
              collected |= sym1_firsts - [:'ɛ']
            end

            break unless (all_have_empty = sym1_firsts.include?(:'ɛ'))
          end

          # Every symbol of this production can vanish, so the whole
          # production can derive the empty string.
          collected << :'ɛ' if all_have_empty
        end
      end

      collected.uniq
    end
end
|
224
|
+
|
225
|
+
# Returns the _follow_ set of _sym0_. _seen_lh_sides_ lists left-hand
# sides already being expanded, which keeps mutually recursive rules from
# recursing forever. Anything that is not a registered non-terminal has
# an empty follow set. Results are cached in @follows.
def follow_set(sym0, seen_lh_sides = [])
  # Serve the cached set when one exists.
  return @follows[sym0] if @follows.has_key?(sym0)
  return [] unless @nonterms[sym0]

  set0 = []

  # The start symbol is always followed by the end of the stream.
  set0 << :EOS if sym0 == @start_symbol

  @productions_id.each_value do |production|
    production.rhs.each_with_index do |sym1, i|
      # Only occurrences of sym0 on a right-hand side are of interest.
      next unless sym0 == sym1

      if i + 1 < production.rhs.length
        # Something follows sym0: take the first set of the remainder.
        set1 = first_set(production.rhs[(i + 1)..-1])
        set0 |= set1 - [:'ɛ']

        # If the remainder can vanish, whatever follows the left-hand
        # side follows sym0 as well.
        set0 |= follow_set(production.lhs) if set1.include?(:'ɛ')
      elsif sym0 != production.lhs and not seen_lh_sides.include?(production.lhs)
        # sym0 ends the production: inherit the left-hand side's set.
        set0 |= follow_set(production.lhs, seen_lh_sides << production.lhs)
      end
    end
  end

  if seen_lh_sides.empty? or not set0.empty?
    # Cache the result for later calls.
    @follows[sym0] |= set0
  else
    set0
  end
end
|
263
|
+
|
264
|
+
# Returns the helper non-terminal that stands in for "_symbol_ +",
# creating its productions the first time it is requested:
#
#   token_plus: token | token token_plus
#
# The EBNF callback is invoked once per generated production with the
# production, the operator (:+) and :first or :second.
def get_plus(symbol)
  new_symbol = "#{symbol.to_s.downcase}_plus".to_sym

  unless @productions_sym.has_key?(new_symbol)
    [[[symbol], :first], [[symbol, new_symbol], :second]].each do |rhs, position|
      generated = Production.new(next_id, new_symbol, rhs)
      add_production(generated)
      @callback.call(generated, :+, position)
    end

    # The helper symbol is itself a non-terminal.
    @nonterms[new_symbol] = true
  end

  new_symbol
end
|
287
|
+
|
288
|
+
# Returns the helper non-terminal that stands in for "_symbol_ ?",
# creating its productions the first time it is requested:
#
#   nonterm_question: | nonterm
#
# The EBNF callback is invoked once per generated production with the
# production, the operator (:'?') and :first or :second.
def get_question(symbol)
  new_symbol = "#{symbol.to_s.downcase}_question".to_sym

  unless @productions_sym.has_key?(new_symbol)
    # The empty alternative comes first, then the single-symbol one.
    [[[], :first], [[symbol], :second]].each do |rhs, position|
      generated = Production.new(next_id, new_symbol, rhs)
      add_production(generated)
      @callback.call(generated, :'?', position)
    end

    # The helper symbol is itself a non-terminal.
    @nonterms[new_symbol] = true
  end

  new_symbol
end
|
311
|
+
|
312
|
+
# Returns the helper non-terminal that stands in for "_symbol_ *",
# creating its productions the first time it is requested:
#
#   token_star: | token token_star
#
# The EBNF callback is invoked once per generated production with the
# production, the operator (:*) and :first or :second.
def get_star(symbol)
  new_symbol = "#{symbol.to_s.downcase}_star".to_sym

  unless @productions_sym.has_key?(new_symbol)
    # The empty alternative comes first, then the recursive one.
    [[[], :first], [[symbol, new_symbol], :second]].each do |rhs, position|
      generated = Production.new(next_id, new_symbol, rhs)
      add_production(generated)
      @callback.call(generated, :*, position)
    end

    # The helper symbol is itself a non-terminal.
    @nonterms[new_symbol] = true
  end

  new_symbol
end
|
335
|
+
|
336
|
+
# Advances the production counter by one and returns its new value,
# which is the ID to give the next production.
def next_id
  @production_counter = @production_counter + 1
end
|
340
|
+
|
341
|
+
# Returns an array of every non-terminal symbol recorded for the
# grammar so far.
def nonterms
  @nonterms.each_key.to_a
end
|
346
|
+
|
347
|
+
# Builds one or more productions with the left-hand side _symbol_.
# If _expression_ is specified it is taken as the right-hand side of a
# single production. If _expression_ is nil then _block_ is evaluated in
# the context of this object, and is expected to make one or more calls
# to CFG.clause.
#
# Returns a copy of the list of productions that were defined.
def production(symbol, expression = nil, &block)
  @production_buffer = []
  @curr_lhs = symbol

  begin
    if expression
      clause(expression)
    else
      instance_exec(&block)
    end
  ensure
    # Always leave "production mode", even when the clause or block
    # raises; otherwise a later stray CFG.clause call would silently be
    # attached to this stale left-hand side.
    @curr_lhs = nil
  end

  @production_buffer.clone
end
|
364
|
+
|
365
|
+
# Gives access to the grammar's productions. With _by_ set to :sym (the
# default) the hash keyed by left-hand side symbol is returned, each
# value being the list of productions for that symbol. With _by_ set to
# :id the hash keyed by production ID is returned. Any other value
# yields +nil+.
def productions(by = :sym)
  case by
  when :sym then @productions_sym
  when :id  then @productions_id
  end
end
|
378
|
+
|
379
|
+
# Makes _symbol_ the grammar's start symbol. Raises GrammarError when
# _symbol_ is not a non-terminal according to CFG.is_nonterminal?.
def start(symbol)
  raise GrammarError, 'Start symbol must be a non-terminal.' unless CFG::is_nonterminal?(symbol)

  @start_symbol = symbol
end
|
387
|
+
|
388
|
+
# Returns every symbol seen while defining the grammar: all terminals
# followed by all non-terminals.
def symbols
  [*terms, *nonterms]
end
|
392
|
+
|
393
|
+
# Returns an array of every terminal symbol recorded for the grammar so
# far.
def terms
  @terms.each_key.to_a
end
|
398
|
+
|
399
|
+
# The Production class represents a production in a context-free
# grammar: an ID, a left-hand side symbol and a list of right-hand side
# symbols.
class Production
  attr_reader :id
  attr_reader :lhs
  attr_reader :rhs

  # Instantiates a new Production object with the specified ID,
  # and left- and right-hand sides.
  def initialize(id, lhs, rhs)
    @id = id
    @lhs = lhs
    @rhs = rhs
  end

  # Compares one production to another. Returns true only if the
  # left- and right-hand sides match; the ID is not considered.
  # Objects that do not expose +lhs+ and +rhs+ (such as +nil+) are
  # simply unequal rather than causing a NoMethodError.
  def ==(other)
    other.respond_to?(:lhs) && other.respond_to?(:rhs) &&
      lhs == other.lhs && rhs == other.rhs
  end

  # Makes a new copy of the production. The right-hand side array is
  # cloned so the copy can be modified independently.
  def copy
    Production.new(@id, @lhs, @rhs.clone)
  end

  # Locates the last terminal in the right-hand side of a
  # production. Returns +nil+ when there is none.
  def last_terminal
    @rhs.reverse_each.find { |sym| CFG::is_terminal?(sym) }
  end

  # Returns a new Item based on this production, with the dot placed
  # before the first right-hand side symbol. The item shares this
  # production's right-hand side array.
  def to_item
    Item.new(0, @id, @lhs, @rhs)
  end

  # Returns a string representation of this production. The left-hand
  # side is left-justified in a field of _padding_ characters.
  def to_s(padding = 0)
    "#{format("%-#{padding}s", @lhs)} -> #{@rhs.map { |s| s.to_s }.join(' ')}"
  end
end

# The Item class represents a CFG production with a dot in it.
class Item < Production
  attr_reader :dot

  # Instantiates a new Item object with a dot located before the
  # symbol at index _dot_ of the right-hand side. The remaining
  # arguments (_args_) should be as specified by
  # Production.initialize.
  def initialize(dot, *args)
    super(*args)

    # The dot indicates the NEXT symbol to be read.
    @dot = dot
  end

  # Compares two items: the dot position and both sides must match.
  # Objects that do not expose +dot+, +lhs+ and +rhs+ (such as +nil+ or
  # a plain Production) are simply unequal rather than causing a
  # NoMethodError.
  def ==(other)
    other.respond_to?(:dot) && other.respond_to?(:lhs) && other.respond_to?(:rhs) &&
      dot == other.dot && lhs == other.lhs && rhs == other.rhs
  end

  # Moves the item's dot forward by one if the end of the right-hand
  # side hasn't already been reached. Returns the new dot position, or
  # +nil+ when the dot did not move.
  def advance
    @dot += 1 if @dot < @rhs.length
  end

  # Tests to see if the dot is at the end of the right-hand side.
  def at_end?
    @dot == @rhs.length
  end

  # Produces a new copy of this item with its own right-hand side array.
  def copy
    Item.new(@dot, @id, @lhs, @rhs.clone)
  end

  # Returns the symbol located after the dot (+nil+ at the end).
  def next_symbol
    @rhs[@dot]
  end

  # Returns a string representation of this item, with '·' marking the
  # dot position. The left-hand side is left-justified in a field of
  # _padding_ characters.
  def to_s(padding = 0)
    "#{format("%-#{padding}s", @lhs)} -> #{@rhs.map { |s| s.to_s }.insert(@dot, '·').join(' ')}"
  end
end
|
490
|
+
end
|
491
|
+
end
|