rltk 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/AUTHORS +1 -0
- data/LICENSE +27 -0
- data/README +386 -0
- data/Rakefile +67 -0
- data/lib/rltk/ast.rb +264 -0
- data/lib/rltk/cfg.rb +491 -0
- data/lib/rltk/lexer.rb +298 -0
- data/lib/rltk/lexers/calculator.rb +41 -0
- data/lib/rltk/lexers/ebnf.rb +40 -0
- data/lib/rltk/parser.rb +1354 -0
- data/lib/rltk/parsers/infix_calc.rb +43 -0
- data/lib/rltk/parsers/postfix_calc.rb +34 -0
- data/lib/rltk/parsers/prefix_calc.rb +34 -0
- data/lib/rltk/token.rb +66 -0
- data/test/tc_ast.rb +85 -0
- data/test/tc_cfg.rb +149 -0
- data/test/tc_lexer.rb +217 -0
- data/test/tc_parser.rb +275 -0
- data/test/tc_token.rb +34 -0
- metadata +87 -0
data/lib/rltk/cfg.rb
ADDED
@@ -0,0 +1,491 @@
|
|
1
|
+
# Author: Chris Wailes <chris.wailes@gmail.com>
|
2
|
+
# Project: Ruby Language Toolkit
|
3
|
+
# Date: 2011/03/24
|
4
|
+
# Description: This file contains a class representing a context-free
|
5
|
+
# grammar.
|
6
|
+
|
7
|
+
# encoding: utf-8
|
8
|
+
|
9
|
+
############
|
10
|
+
# Requires #
|
11
|
+
############
|
12
|
+
|
13
|
+
# Ruby Language Toolkit
|
14
|
+
require 'rltk/lexers/ebnf'
|
15
|
+
|
16
|
+
#######################
|
17
|
+
# Classes and Modules #
|
18
|
+
#######################
|
19
|
+
|
20
|
+
module RLTK # :nodoc:
|
21
|
+
|
22
|
+
# An exception class that represents a problem with a context-free
# grammar's definition.
#
# It derives from StandardError rather than Exception so that a bare
# +rescue+ (which only handles StandardError and its subclasses) catches
# it like any other application-level error.
class GrammarError < StandardError; end
|
25
|
+
|
26
|
+
# The CFG class is used to represent context-free grammars. It is used by
|
27
|
+
# the RLTK::Parser class to represent the parser's grammar, but can also be
|
28
|
+
# used to manipulate arbitrary CFGs.
|
29
|
+
class CFG
|
30
|
+
|
31
|
+
# The start symbol for the grammar.
|
32
|
+
attr_reader :start_symbol
|
33
|
+
|
34
|
+
# The current left-hand side symbol. This is used by the
|
35
|
+
# CFG.production method to wrap CFG.clause calls.
|
36
|
+
attr_accessor :curr_lhs
|
37
|
+
|
38
|
+
#################
|
39
|
+
# Class Methods #
|
40
|
+
#################
|
41
|
+
|
42
|
+
# Tests to see if a symbol is a terminal symbol, as used by the CFG
# class.  A symbol is considered a terminal when its string form is
# unchanged by upcasing.  A nil or false argument is handed back as-is.
def self.is_terminal?(sym)
  return sym unless sym

  name = sym.to_s
  name == name.upcase
end
|
47
|
+
|
48
|
+
# Tests to see if a symbol is a non-terminal symbol, as used by the
# CFG class.  A symbol is considered a non-terminal when its string form
# is unchanged by downcasing.  A nil or false argument is handed back
# as-is.
def self.is_nonterminal?(sym)
  return sym unless sym

  name = sym.to_s
  name == name.downcase
end
|
53
|
+
|
54
|
+
####################
|
55
|
+
# Instance Methods #
|
56
|
+
####################
|
57
|
+
|
58
|
+
# Instantiates a new, empty CFG object.  The optional _callback_ block is
# stored and later handed every production that is generated while
# eliminating an EBNF operator; when no block is given a do-nothing Proc
# is stored instead.
def initialize(&callback)
  # State used by the CFG.production / CFG.clause pair.
  @curr_lhs          = nil
  @callback          = callback || Proc.new {}
  @lexer             = Lexers::EBNF.new
  @production_counter = -1
  @start_symbol      = nil
  @wrapper_symbol    = nil

  # Productions indexed by ID and by left-hand side symbol, plus the
  # productions created by the most recent CFG.production call.
  @productions_id    = {}
  @productions_sym   = Hash.new { |hash, key| hash[key] = [] }
  @production_buffer = []

  # Known symbols.  Lookups of unknown symbols answer false, and the
  # end-of-stream terminal is always present.
  @terms        = Hash.new(false)
  @terms[:EOS]  = true
  @nonterms     = Hash.new(false)

  # Memoized first and follow sets.
  @firsts  = {}
  @follows = Hash.new { |hash, key| hash[key] = [] }
end
|
79
|
+
|
80
|
+
# Adds _production_ to the appropriate internal data structures: the
# ID-keyed hash and the list kept for its left-hand side symbol.  Returns
# that list.
def add_production(production)
  siblings = @productions_sym[production.lhs]
  @productions_id[production.id] = production
  siblings << production
end
|
84
|
+
|
85
|
+
# Sets the EBNF callback to _callback_.  Calling this without a block
# installs a do-nothing Proc.  Returns the Proc that was stored.
def callback(&callback)
  @callback = callback.nil? ? Proc.new {} : callback
end
|
89
|
+
|
90
|
+
# This function MUST be called inside a CFG.production block. It will
# make a new production with the left-hand side specified by the
# CFG.production call's argument. This is the function that is
# responsible for removing EBNF symbols from the grammar.
#
# _expression_ is lexed with the EBNF lexer.  Every TERM or NONTERM token
# is recorded as a known symbol; when such a token is immediately
# followed by a ?, * or + token it is replaced on the right-hand side by
# the helper non-terminal from get_question, get_star or get_plus.
#
# Returns the new Production.  Raises GrammarError when no left-hand side
# is currently set.
def clause(expression)
  unless @curr_lhs
    raise GrammarError, 'CFG.clause called outside of CFG.production block.'
  end

  lhs    = @curr_lhs.to_sym
  tokens = @lexer.lex(expression)

  # Set this as the start symbol if there isn't one already defined.
  @start_symbol ||= lhs

  # Remove EBNF tokens and replace them with new productions.
  rhs = []
  tokens.each_with_index do |token, index|
    kind = token.type
    next unless kind == :TERM || kind == :NONTERM

    value = token.value

    # Add this symbol to the correct collection.
    if kind == :TERM
      @terms[value] = true
    else
      @nonterms[value] = true
    end

    # Only look ahead when another token actually follows this one.
    operator = index + 1 < tokens.length ? tokens[index + 1].type : nil

    rhs <<
      case operator
      when :'?'
        get_question(value)
      when :*
        get_star(value)
      when :+
        get_plus(value)
      else
        value
      end
  end

  # Make the production.
  production = Production.new(next_id, lhs, rhs)
  @production_buffer << production

  # Make sure the production symbol is collected.
  @nonterms[lhs] = true

  # Add the new production to our collections.
  add_production(production)

  production
end
|
152
|
+
|
153
|
+
# Returns the _first_ set for _sentence_. _Sentence_ may be either a
# single symbol or an array of symbols.  For an array, nil is returned
# when any element is not a symbol known to this grammar; otherwise the
# result is the union of the first sets of the leading symbols, up to and
# including the first one that cannot derive the empty string, with the
# empty-string marker added only when every symbol can derive it.
def first_set(sentence)
  return first_set_prime(sentence) if sentence.is_a?(Symbol)

  known = symbols
  return nil unless sentence.all? { |sym| known.include?(sym) }

  firsts   = []
  nullable = true

  sentence.each do |sym|
    sym_firsts = first_set(sym)
    firsts |= sym_firsts - [:'ɛ']

    # Later symbols only matter while everything so far can vanish.
    nullable = sym_firsts.include?(:'ɛ')
    break unless nullable
  end

  nullable ? firsts + [:'ɛ'] : firsts
end
|
174
|
+
|
175
|
+
# This function is responsible for calculating the _first_ set of
# individual symbols. CFG.first_set is a wrapper around this function
# to provide support for calculating the _first_ set for sentences.
#
# _sym0_ is the symbol to examine; nil is returned when it is not a
# symbol of this grammar.  _seen_lh_sides_ lists the symbols already
# being expanded on the current recursion path and is used to stop
# recursive rules from looping forever.  Each recursive call gets its own
# copy of that list: sharing one array between sibling branches made
# symbols that had merely been visited elsewhere look like cycles, which
# produced (and memoized) empty or partial first sets.
#
# NOTE(review): a set computed while a recursive cycle is being cut is
# still memoized, so it may be incomplete for symbols inside the cycle.
def first_set_prime(sym0, seen_lh_sides = [])
  return nil unless symbols.include?(sym0)

  # Memoize the result for later.
  @firsts[sym0] ||=
    if CFG::is_terminal?(sym0)
      # If the symbol is a terminal, it is the only symbol in its
      # first set.
      [sym0]
    else
      set0 = []

      @productions_sym[sym0].each do |production|
        if production.rhs.empty?
          # If this is an empty production we should add the empty
          # string to the First set.
          set0 << :'ɛ'
        else
          all_have_empty = true

          production.rhs.each do |sym1|
            set1 = []

            # Grab the First set for the current symbol in this
            # production, unless it is already being expanded further
            # up this recursion path.
            unless seen_lh_sides.include?(sym1)
              set1 = first_set_prime(sym1, seen_lh_sides + [sym1])
              set0 |= set1 - [:'ɛ']
            end

            # Later symbols only contribute while every earlier one can
            # derive the empty string.
            all_have_empty = set1.include?(:'ɛ')
            break unless all_have_empty
          end

          # Add the empty production if this production is all
          # non-terminals that can be reduced to the empty string.
          set0 << :'ɛ' if all_have_empty
        end
      end

      set0.uniq
    end
end
|
224
|
+
|
225
|
+
# Returns the _follow_ set for a given symbol. The second argument is
# used to avoid infinite recursion when mutually recursive rules are
# encountered.
#
# An empty array is returned for anything that is not a known
# non-terminal.  For every occurrence of _sym0_ on a right-hand side the
# first set of what follows it is added; when nothing follows it, or
# everything that follows can derive the empty string, the follow set of
# the production's left-hand side is added as well.  Both of those cases
# go through the same guard (skip the symbol itself and anything listed
# in _seen_lh_sides_); previously the "everything after can be empty"
# case recursed unguarded and never terminated for rules such as
# <tt>a -> X a opt</tt> with an optional _opt_.
def follow_set(sym0, seen_lh_sides = [])
  # Use the memoized set if possible.
  return @follows[sym0] if @follows.has_key?(sym0)

  # Only non-terminals have a follow set.
  return [] unless @nonterms[sym0]

  set0 = []

  # Add EOS to the start symbol's follow set.
  set0 << :EOS if sym0 == @start_symbol

  @productions_id.values.each do |production|
    lhs = production.lhs

    production.rhs.each_with_index do |sym1, i|
      next unless sym0 == sym1

      if i + 1 < production.rhs.length
        # Something follows sym0 in this production.
        set1 = first_set(production.rhs[(i + 1)..-1])
        set0 |= set1 - [:'ɛ']

        # The left-hand side's follow set only matters when the rest
        # of the production can disappear entirely.
        next unless set1.include?(:'ɛ')
      end

      # Whatever may follow the left-hand side may also follow sym0.
      if sym0 != lhs and not seen_lh_sides.include?(lhs)
        set0 |= follow_set(lhs, seen_lh_sides << lhs)
      end
    end
  end

  if seen_lh_sides.empty? or not set0.empty?
    # Memoize the result for later.
    @follows[sym0] |= set0
  else
    set0
  end
end
|
263
|
+
|
264
|
+
# Builds productions used to eliminate the + EBNF operator.
#
# The replacement non-terminal is named by downcasing _symbol_ and
# appending "_plus".  The first time it is requested the productions
#
#   token_plus: token | token token_plus
#
# are added, and the EBNF callback is invoked once per production with
# the production, the operator (:+) and its position (:first or
# :second).  Returns the replacement symbol.
def get_plus(symbol)
  new_symbol = (symbol.to_s.downcase + '_plus').to_sym

  # Nothing to do when the helper productions already exist.
  return new_symbol if @productions_sym.has_key?(new_symbol)

  [[:first, [symbol]], [:second, [symbol, new_symbol]]].each do |position, rhs|
    production = Production.new(next_id, new_symbol, rhs)
    add_production(production)
    @callback.call(production, :+, position)
  end

  # Add the new symbol to the list of nonterminals.
  @nonterms[new_symbol] = true

  new_symbol
end
|
287
|
+
|
288
|
+
# Builds productions used to eliminate the ? EBNF operator.
#
# The replacement non-terminal is named by downcasing _symbol_ and
# appending "_question".  The first time it is requested the productions
#
#   nonterm_question: | nonterm
#
# are added (the empty one first), and the EBNF callback is invoked once
# per production with the production, the operator (:'?') and its
# position (:first or :second).  Returns the replacement symbol.
def get_question(symbol)
  new_symbol = (symbol.to_s.downcase + '_question').to_sym

  # Nothing to do when the helper productions already exist.
  return new_symbol if @productions_sym.has_key?(new_symbol)

  [[:first, []], [:second, [symbol]]].each do |position, rhs|
    production = Production.new(next_id, new_symbol, rhs)
    add_production(production)
    @callback.call(production, :'?', position)
  end

  # Add the new symbol to the list of nonterminals.
  @nonterms[new_symbol] = true

  new_symbol
end
|
311
|
+
|
312
|
+
# Builds productions used to eliminate the * EBNF operator.
#
# The replacement non-terminal is named by downcasing _symbol_ and
# appending "_star".  The first time it is requested the productions
#
#   token_star: | token token_star
#
# are added (the empty one first), and the EBNF callback is invoked once
# per production with the production, the operator (:*) and its position
# (:first or :second).  Returns the replacement symbol.
def get_star(symbol)
  new_symbol = (symbol.to_s.downcase + '_star').to_sym

  # Nothing to do when the helper productions already exist.
  return new_symbol if @productions_sym.has_key?(new_symbol)

  [[:first, []], [:second, [symbol, new_symbol]]].each do |position, rhs|
    production = Production.new(next_id, new_symbol, rhs)
    add_production(production)
    @callback.call(production, :*, position)
  end

  # Add the new symbol to the list of nonterminals.
  @nonterms[new_symbol] = true

  new_symbol
end
|
335
|
+
|
336
|
+
# Returns the ID for the next production to be defined.  Every call
# advances the internal production counter by one and returns the new
# value.
def next_id
  @production_counter = @production_counter + 1
end
|
340
|
+
|
341
|
+
# Returns all of the non-terminal symbols used in the grammar's
# definition, as a new array.
def nonterms
  @nonterms.each_key.to_a
end
|
346
|
+
|
347
|
+
# Builds a new production with the left-hand side value of _symbol_.
# If _expression_ is specified it is taken as the right-hand side of the
# production. If _expression_ is nil then _block_ is evaluated in the
# context of this object, and is expected to make one or more calls to
# CFG.clause.
#
# Returns a copy of the list of productions created during this call.
# The current left-hand side is cleared again even when the clause or
# the block raises, so a failed definition cannot leave the grammar in a
# state where later stray CFG.clause calls are silently accepted.
def production(symbol, expression = nil, &block)
  @production_buffer = []
  @curr_lhs = symbol

  begin
    if expression
      clause(expression)
    else
      instance_exec(&block)
    end
  ensure
    @curr_lhs = nil
  end

  @production_buffer.clone
end
|
364
|
+
|
365
|
+
# If _by_ is :sym, returns a hash of the grammar's productions, using
# the productions' left-hand side symbol as the key. If _by_ is :id
# the hash that maps each production's ID to the production is returned.
# Any other value yields nil.
def productions(by = :sym)
  return @productions_sym if by == :sym
  return @productions_id if by == :id

  nil
end
|
378
|
+
|
379
|
+
# Sets the start symbol for this grammar.  Raises GrammarError, leaving
# the current start symbol untouched, when _symbol_ is not a
# non-terminal.
def start(symbol)
  unless CFG::is_nonterminal?(symbol)
    raise GrammarError, 'Start symbol must be a non-terminal.'
  end

  @start_symbol = symbol
end
|
387
|
+
|
388
|
+
# Returns a list of symbols encountered in the grammar's definition:
# all terminals followed by all non-terminals.
def symbols
  [*terms, *nonterms]
end
|
392
|
+
|
393
|
+
# Returns a list of all terminal symbols encountered in the grammar's
# definition, as a new array.
def terms
  @terms.each_key.to_a
end
|
398
|
+
|
399
|
+
# Oddly enough, the Production class represents a production in a
# context-free grammar.
class Production
  # The production's ID, its left-hand side symbol and the array of
  # symbols making up its right-hand side.
  attr_reader :id, :lhs, :rhs

  # Instantiates a new Production object with the specified ID,
  # and left- and right-hand sides.
  def initialize(id, lhs, rhs)
    @id  = id
    @lhs = lhs
    @rhs = rhs
  end

  # Compares one production to another. Returns true only if the
  # left- and right-hand sides match; the ID is not compared.  Objects
  # that do not expose both sides (nil, for instance) are simply unequal
  # instead of causing a NoMethodError.
  def ==(other)
    other.respond_to?(:lhs) && other.respond_to?(:rhs) &&
      lhs == other.lhs && rhs == other.rhs
  end

  # Makes a new copy of the production.  The right-hand side array is
  # duplicated so the copy can be modified independently.
  def copy
    Production.new(@id, @lhs, @rhs.clone)
  end

  # Locates the last terminal in the right-hand side of a
  # production.  Returns nil when there is none.
  def last_terminal
    @rhs.reverse_each.find { |sym| CFG::is_terminal?(sym) }
  end

  # Returns a new Item based on this production, with its dot placed
  # before the first right-hand side symbol.
  def to_item
    Item.new(0, @id, @lhs, @rhs)
  end

  # Returns a string representation of this production.  The left-hand
  # side is left-justified in a field of _padding_ characters.
  def to_s(padding = 0)
    "#{format("%-#{padding}s", @lhs)} -> #{@rhs.map { |s| s.to_s }.join(' ')}"
  end
end
|
441
|
+
|
442
|
+
# The Item class represents a CFG production with dot in it.
class Item < Production
  # Index of the right-hand side symbol that the dot sits in front of.
  attr_reader :dot

  # Instantiates a new Item object with a dot located before the
  # symbol at index _dot_ of the right-hand side. The remaining
  # arguments (_args_) should be as specified by
  # Production.initialize.
  def initialize(dot, *args)
    super(*args)

    # The Dot indicates the NEXT symbol to be read.
    @dot = dot
  end

  # Compares two items.  They are equal when the dot position and both
  # sides match; the ID is not compared.  Objects that do not expose a
  # dot and both sides (nil or a plain Production, for instance) are
  # simply unequal instead of causing a NoMethodError.
  def ==(other)
    other.respond_to?(:dot) && other.respond_to?(:lhs) && other.respond_to?(:rhs) &&
      dot == other.dot && lhs == other.lhs && rhs == other.rhs
  end

  # Moves the items dot forward by one if the end of the right-hand
  # side hasn't already been reached.  Returns the new dot position, or
  # nil when the dot was already at the end.
  def advance
    @dot += 1 if @dot < @rhs.length
  end

  # Tests to see if the dot is at the end of the right-hand side.
  def at_end?
    @dot == @rhs.length
  end

  # Produces a new copy of this item.  The right-hand side array is
  # duplicated so the copy can be modified independently.
  def copy
    Item.new(@dot, @id, @lhs, @rhs.clone)
  end

  # Returns the symbol located after the dot, or nil when the dot is at
  # the end of the right-hand side.
  def next_symbol
    @rhs[@dot]
  end

  # Returns a string representation of this item, with the dot shown as
  # '·' at its position.  The left-hand side is left-justified in a field
  # of _padding_ characters.
  def to_s(padding = 0)
    "#{format("%-#{padding}s", @lhs)} -> #{@rhs.map { |s| s.to_s }.insert(@dot, '·').join(' ') }"
  end
end
|
490
|
+
end
|
491
|
+
end
|