whittle 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/.rspec +1 -0
- data/Gemfile +4 -0
- data/LICENSE +20 -0
- data/README.md +468 -0
- data/Rakefile +1 -0
- data/lib/whittle/error.rb +9 -0
- data/lib/whittle/errors/grammar_error.rb +9 -0
- data/lib/whittle/errors/parse_error.rb +35 -0
- data/lib/whittle/errors/unconsumed_input_error.rb +9 -0
- data/lib/whittle/parser.rb +343 -0
- data/lib/whittle/rule.rb +239 -0
- data/lib/whittle/rule_set.rb +118 -0
- data/lib/whittle/version.rb +3 -0
- data/lib/whittle.rb +8 -0
- data/spec/spec_helper.rb +4 -0
- data/spec/unit/parser/empty_rule_spec.rb +21 -0
- data/spec/unit/parser/empty_string_spec.rb +17 -0
- data/spec/unit/parser/error_reporting_spec.rb +55 -0
- data/spec/unit/parser/grouped_expr_spec.rb +27 -0
- data/spec/unit/parser/multiple_precedence_spec.rb +33 -0
- data/spec/unit/parser/noop_spec.rb +23 -0
- data/spec/unit/parser/pass_through_parser_spec.rb +17 -0
- data/spec/unit/parser/precedence_spec.rb +26 -0
- data/spec/unit/parser/self_referential_expr_spec.rb +26 -0
- data/spec/unit/parser/skipped_tokens_spec.rb +28 -0
- data/spec/unit/parser/sum_parser_spec.rb +23 -0
- data/spec/unit/parser/typecast_parser_spec.rb +17 -0
- data/whittle.gemspec +27 -0
- metadata +104 -0
@@ -0,0 +1,343 @@
|
|
1
|
+
# Whittle: A little LALR(1) parser in pure ruby, without a generator.
|
2
|
+
#
|
3
|
+
# Copyright (c) Chris Corbyn, 2011
|
4
|
+
|
5
|
+
module Whittle
|
6
|
+
# Parsers are created by subclassing the Parser class and defining a context-free grammar.
|
7
|
+
#
|
8
|
+
# Unlike other LALR(1) parsers, Whittle does not rely on code-generation, instead it
|
9
|
+
# synthesizes a parse table from the grammar at runtime, on the first parse.
|
10
|
+
#
|
11
|
+
# While Whittle's implementation works a little differently to yacc/bison and ruby parser
|
12
|
+
# generators like racc and citrus, the parseable grammars are the same. LALR(1) parsers are
|
13
|
+
# very powerful and it is generally said that the languages they cannot parse are difficult
|
14
|
+
# for humans to understand.
|
15
|
+
#
|
16
|
+
# You should refer to the README for a full description of how to use the parser,
|
17
|
+
# but a quick example follows.
|
18
|
+
#
|
19
|
+
# @example A simple Whittle Parser
|
20
|
+
#
|
21
|
+
# class Calculator < Whittle::Parser
|
22
|
+
# rule(:wsp) do |r|
|
23
|
+
# r[/\s+/] # skip whitespace
|
24
|
+
# end
|
25
|
+
#
|
26
|
+
# rule(:int) do |r|
|
27
|
+
# r[/[0-9]+/].as { |i| Integer(i) }
|
28
|
+
# end
|
29
|
+
#
|
30
|
+
# rule("+") % :left
|
31
|
+
# rule("-") % :left
|
32
|
+
# rule("/") % :left
|
33
|
+
# rule("*") % :left
|
34
|
+
#
|
35
|
+
# rule(:expr) do |r|
|
36
|
+
# r[:expr, "+", :expr].as { |left, _, right| left + right }
|
37
|
+
# r[:expr, "-", :expr].as { |left, _, right| left - right }
|
38
|
+
# r[:expr, "/", :expr].as { |left, _, right| left / right }
|
39
|
+
# r[:expr, "*", :expr].as { |left, _, right| left * right }
|
40
|
+
# r[:int].as(:value)
|
41
|
+
# end
|
42
|
+
#
|
43
|
+
# start(:expr)
|
44
|
+
# end
|
45
|
+
#
|
46
|
+
# calculator = Calculator.new
|
47
|
+
# calculator.parse("1 + (2 * 6) - 7")
|
48
|
+
# # => 6
|
49
|
+
class Parser
|
50
|
+
class << self
|
51
|
+
# Returns a Hash mapping rule names with their RuleSets.
|
52
|
+
#
|
53
|
+
# @return [Hash<String, RuleSet>]
|
54
|
+
# all rules defined by the parser
|
55
|
+
def rules
  # The registry is created lazily the first time it is asked for, and the
  # very same Hash is handed back on every later call.
  @rules = {} unless @rules
  @rules
end
|
58
|
+
|
59
|
+
# Declares a new rule.
|
60
|
+
#
|
61
|
+
# There are two ways to call this method. The most fundamental way is to pass a Symbol
|
62
|
+
# in the +name+ parameter, along with a block, in which you will add one or more possible
|
63
|
+
# rules.
|
64
|
+
#
|
65
|
+
# @example Specifying multiple rules with a block
|
66
|
+
#
|
67
|
+
# rule(:expr) do |r|
|
68
|
+
# r[:expr, "+", :expr].as { |a, _, b| a + b }
|
69
|
+
# r[:expr, "-", :expr].as { |a, _, b| a - b }
|
70
|
+
# r[:expr, "/", :expr].as { |a, _, b| a / b }
|
71
|
+
# r[:expr, "*", :expr].as { |a, _, b| a * b }
|
72
|
+
# r[:integer].as { |i| Integer(i) }
|
73
|
+
# end
|
74
|
+
#
|
75
|
+
# Each rule specified in this way defines one of many possibilities to describe the input.
|
76
|
+
# Rules may refer back to themselves, which means in the above, any integer is a valid
|
77
|
+
# expr:
|
78
|
+
#
|
79
|
+
# 42
|
80
|
+
#
|
81
|
+
# Therefore any sum of integers is also a valid expr:
|
82
|
+
#
|
83
|
+
# 42 + 24
|
84
|
+
#
|
85
|
+
# Therefore any multiplication of sums of integers is also a valid expr, and so on.
|
86
|
+
#
|
87
|
+
# 42 + 24 * 7 + 52
|
88
|
+
#
|
89
|
+
# A rule like the above is called a 'nonterminal', because upon recognizing any expr, it
|
90
|
+
# is possible for the rule to continue collecting input and becoming a larger expr.
|
91
|
+
#
|
92
|
+
# In subtle contrast, a rule like the following:
|
93
|
+
#
|
94
|
+
# rule("+") do |r|
|
95
|
+
# r["+"].as { |plus| plus }
|
96
|
+
# end
|
97
|
+
#
|
98
|
+
# Is called a 'terminal' token, since upon recognizing a "+", the parser cannot
|
99
|
+
# add further input to the "+" itself... it is the tip of a branch in the parse tree; the
|
100
|
+
# branch terminates here, and subsequently the rule is terminal.
|
101
|
+
#
|
102
|
+
# There is a shorthand way to write the above rule:
|
103
|
+
#
|
104
|
+
# rule("+")
|
105
|
+
#
|
106
|
+
# Not given a block, #rule treats the name parameter as a literal token.
|
107
|
+
#
|
108
|
+
# Note that nonterminal rules are composed of other nonterminal rules and/or terminal
|
109
|
+
# rules. Terminal rules contain one, and only one Regexp pattern or fixed string.
|
110
|
+
#
|
111
|
+
# @param [Symbol, String] name
|
112
|
+
# the name of the ruleset (note that one ruleset can contain multiple rules)
|
113
|
+
#
|
114
|
+
# @return [RuleSet, Rule]
|
115
|
+
# the newly created RuleSet if a block was given, otherwise a rule representing a
|
116
|
+
# terminal token for the input string +name+.
|
117
|
+
def rule(name)
  set = RuleSet.new(name)
  rules[name] = set

  # Without a block, +name+ doubles as the literal token to match: register a
  # single terminal rule whose value is the matched text and return that rule.
  return set[name].as(:value) unless block_given?

  # With a block, let the caller populate the set, then return the set itself.
  yield set
  set
end
|
126
|
+
|
127
|
+
# Declares the most general rule that can be used to describe an entire input.
|
128
|
+
#
|
129
|
+
# Called without any arguments, returns the current start rule.
|
130
|
+
#
|
131
|
+
# @param [Symbol] name
|
132
|
+
# the name of a rule defined in the parser (does not need to be defined beforehand)
|
133
|
+
#
|
134
|
+
# @return [Symbol]
|
135
|
+
# the new (or current) start rule
|
136
|
+
def start(name = nil)
  # A nil argument means "just read"; anything else replaces the start rule.
  name.nil? ? @start : (@start = name)
end
|
140
|
+
|
141
|
+
# Returns the numeric value for the initial state (the state ID associated with the start
|
142
|
+
# rule).
|
143
|
+
#
|
144
|
+
# In most LALR(1) parsers, this would be zero, but for implementation reasons, this will
|
145
|
+
# be an unpredictably large (or small) number.
|
146
|
+
#
|
147
|
+
# @return [Fixnum]
|
148
|
+
# the ID for the initial state in the parse table
|
149
|
+
def initial_state
  # Validates the start rule (and wraps it if it is a bare terminal) first.
  prepare_start_rule

  # State IDs are derived by hashing a [rules, offset] pair; the initial
  # state is the start rule's set at offset zero.
  start_rules = rules[start]
  [start_rules, 0].hash
end
|
153
|
+
|
154
|
+
# Returns the entire parse table used to interpret input into the parser.
|
155
|
+
#
|
156
|
+
# You should not need to call this method, though you may wish to inspect its contents
|
157
|
+
# during debugging.
|
158
|
+
#
|
159
|
+
# Note that the token +nil+ in the parse table represents "anything" and its action is
|
160
|
+
# always to reduce.
|
161
|
+
#
|
162
|
+
# Shift-reduce conflicts are resolved at runtime and therefore remain in the parse table.
|
163
|
+
#
|
164
|
+
# @return [Hash]
|
165
|
+
# a 2-dimensional Hash representing states with actions to perform for a given lookahead
|
166
|
+
def parse_table
  # Built once, then reused for every subsequent parse.
  return @parse_table if @parse_table

  prepare_start_rule

  start_set = rules[start]
  context   = {
    :state  => initial_state, # state the walk begins in
    :seen   => [],            # [state, ruleset] pairs already expanded
    :offset => 0,             # position within the rule's components
    :prec   => 0              # precedence carried so far
  }

  @parse_table = start_set.build_parse_table({}, self, context)
end
|
181
|
+
|
182
|
+
private
|
183
|
+
|
184
|
+
def prepare_start_rule
  declared = start

  unless rules.key?(declared)
    raise GrammarError, "Undefined start rule #{declared.inspect}"
  end

  # Nothing to do when the start rule's set does not report itself terminal.
  return unless rules[declared].terminal?

  # A bare terminal start rule is wrapped in a synthetic :* rule that passes
  # the terminal's value straight through; :* then becomes the start rule.
  rule(:*) { |wrapper| wrapper[declared].as { |prog| prog } }
  start(:*)
end
|
195
|
+
end
|
196
|
+
|
197
|
+
# Alias for class method Parser.rules
|
198
|
+
#
|
199
|
+
# @see Parser.rules
|
200
|
+
def rules
  # The grammar is stored on the class; instances simply delegate to it.
  klass = self.class
  klass.rules
end
|
203
|
+
|
204
|
+
# Accepts input in the form of a String and attempts to parse it according to the grammar.
|
205
|
+
#
|
206
|
+
# The input is scanned using a lexical analysis routine, defined by the #lex method. Each
|
207
|
+
# token detected by the routine is used to pick an action from the parse table. Each
|
208
|
+
# reduction initially builds a branch in an AST (abstract syntax tree), until all input has
|
209
|
+
# been read and the start rule has been recognized, at which point the AST is evaluated by
|
210
|
+
# invoking the callbacks defined in the grammar in a depth-first fashion.
|
211
|
+
#
|
212
|
+
# If the parser encounters a token it does not recognise, a parse error will be raised,
|
213
|
+
# specifying what was expected, what was received, and on which line the error occurred.
|
214
|
+
#
|
215
|
+
# A successful parse returns the result of evaluating the start rule, whatever that may be.
|
216
|
+
#
|
217
|
+
# @param [String] input
|
218
|
+
# a complete input string to parse according to the grammar
|
219
|
+
#
|
220
|
+
# @return [Object]
|
221
|
+
# whatever the grammar defines
|
222
|
+
def parse(input)
  table = self.class.parse_table
  stack = [self.class.initial_state] # state stack
  nodes = []                         # shifted tokens / reduced subtrees
  line  = 1

  lex(input) do |token|
    line      = token[:line]
    lookahead = token

    # :shifted is thrown once the current token has been consumed, which
    # ends this block so #lex can supply the next token.
    catch(:shifted) do
      loop do
        state       = table[stack.last]
        # Fall back to the nil entry when nothing is listed for this symbol.
        instruction = state[lookahead[:name]] || state[nil]

        unless instruction
          error(state, lookahead, :states => stack, :args => nodes)
          next
        end

        action = instruction[:action]

        if action == :shift
          # Consume the token: its raw value becomes its sole argument.
          lookahead[:args] = [lookahead.delete(:value)]
          stack << instruction[:state]
          nodes << lookahead
          throw :shifted
        elsif action == :reduce
          # Replace the rule's components with a single subtree node, which
          # then becomes the lookahead for the next iteration.
          rule      = instruction[:rule]
          arity     = rule.components.length
          lookahead = {
            :rule => rule,
            :name => rule.name,
            :line => line,
            :args => nodes.pop(arity)
          }
          stack.pop(arity)
          nodes << lookahead

          # Back in the initial state with the :$end token in hand: evaluate
          # the finished tree and return the result.
          return accept(nodes.pop) if stack.length == 1 && token[:name] == :$end
        elsif action == :goto
          # Enter the target state, then resume with the real token.
          lookahead = token
          stack << instruction[:state]
        end
      end
    end
  end
end
|
266
|
+
|
267
|
+
# Accepts a String as input and repeatedly yields terminal tokens found in the grammar.
|
268
|
+
#
|
269
|
+
# The last token yielded is always named :$end and has the value of +nil+.
|
270
|
+
#
|
271
|
+
# You may override this method to define a smarter implementation, should you need to.
|
272
|
+
#
|
273
|
+
# @param [String] input
|
274
|
+
# the complete input string to lex
|
275
|
+
def lex(input)
  line   = 1
  offset = 0
  finish = input.length

  while offset != finish
    token = next_token(input, offset, line)

    if token.nil?
      raise UnconsumedInputError,
        "Unmatched input #{input[offset..-1].inspect} on line #{line}"
    end

    offset += token[:value].length

    # Swap the two line numbers: the lexer continues from the line the token
    # carried, and the token is stamped with the lexer's previous line.
    started_on   = line
    line         = token[:line]
    token[:line] = started_on

    yield token unless token[:discarded]
  end

  # Always finish with the :$end marker.
  yield({ :name => :$end, :line => line, :value => nil })
end
|
293
|
+
|
294
|
+
# Invoked when the parser detects an error.
|
295
|
+
#
|
296
|
+
# The default implementation raises a ParseError specifying the allowed inputs
|
297
|
+
# and the received input, along with a line number.
|
298
|
+
#
|
299
|
+
# You may override this method with your own implementation, which, at least in theory,
|
300
|
+
# can recover from the error and allow the parse to continue, though this is an extremely
|
301
|
+
# advanced topic and requires a good understanding of how LALR(1) parsers operate.
|
302
|
+
#
|
303
|
+
# @param [Hash] state
|
304
|
+
# the possible actions for the current parser state
|
305
|
+
#
|
306
|
+
# @param [Hash] input
|
307
|
+
# the received token (or, unlikely, a nonterminal symbol)
|
308
|
+
#
|
309
|
+
# @param [Hash] stack
|
310
|
+
# the current parse context (arg stack + state stack)
|
311
|
+
def error(state, input, stack)
  # :goto entries are left out of the list of expected inputs.
  expected = state.reject { |_, instruction| instruction[:action] == :goto }.keys
  choices  = expected.map(&:inspect).join("; or ")
  received = input[:name]
  line     = input[:line]

  # The heredoc is written one fragment per line and then collapsed into a
  # single-line message.
  message = <<-ERROR.gsub(/\n\s+/, " ").strip
    Parse error:
    expected
    #{choices}
    but got
    #{received.inspect}
    on line
    #{line}
  ERROR

  raise ParseError.new(message, line, expected, received)
end
|
325
|
+
|
326
|
+
private
|
327
|
+
|
328
|
+
# Asks each entry of #rules, in iteration order, to scan +source+ at +offset+
# and returns the first non-empty token, tagged with the name it was found under.
#
# @param [String] source
#   the complete input string
#
# @param [Fixnum] offset
#   the index in +source+ at which to look for a token
#
# @param [Fixnum] line
#   the current line number, passed through to the scanning rules
#
# @return [Hash, nil]
#   the matched token with its :name set, or nil if nothing matched
def next_token(source, offset, line)
  rules.each do |name, rule|
    token = rule.scan(source, offset, line)
    next unless token

    # A zero-length match (e.g. /a*/ against "b") would never advance the
    # lexer's offset, so #lex would loop forever. Treat it as "no match" and
    # let the remaining rules have a go instead.
    next if token[:value].to_s.empty?

    token[:name] = name
    return token
  end

  nil
end
|
338
|
+
|
339
|
+
def accept(tree)
  # Depth-first evaluation: every Hash argument is itself a subtree and is
  # evaluated before this node's action runs; anything else is passed as-is.
  evaluated = tree[:args].map do |arg|
    case arg
    when Hash then accept(arg)
    else arg
    end
  end

  tree[:rule].action.call(*evaluated)
end
|
342
|
+
end
|
343
|
+
end
|
data/lib/whittle/rule.rb
ADDED
@@ -0,0 +1,239 @@
|
|
1
|
+
# Whittle: A little LALR(1) parser in pure ruby, without a generator.
|
2
|
+
#
|
3
|
+
# Copyright (c) Chris Corbyn, 2011
|
4
|
+
|
5
|
+
module Whittle
|
6
|
+
# Represents an individual Rule, forming part of an overall RuleSet.
|
7
|
+
class Rule
|
8
|
+
# Action that ignores whatever it is given and produces nil. Rule#scan flags
# a token as :discarded when the rule's action is this exact object.
NULL_ACTION = Proc.new { nil }

# Action that hands back its first argument unchanged.
DUMP_ACTION = Proc.new { |value| value }
|
10
|
+
|
11
|
+
attr_reader :name
|
12
|
+
attr_reader :action
|
13
|
+
attr_reader :components
|
14
|
+
attr_reader :assoc
|
15
|
+
attr_reader :prec
|
16
|
+
|
17
|
+
# Create a new Rule for the RuleSet named +name+.
|
18
|
+
#
|
19
|
+
# The components can either be names of other Rules, or for a terminal Rule,
|
20
|
+
# a single pattern to match in the input string.
|
21
|
+
#
|
22
|
+
# @param [String] name
|
23
|
+
# the name of the RuleSet to which this Rule belongs
|
24
|
+
#
|
25
|
+
# @param [Object...] components...
|
26
|
+
# a variable list of components that make up the Rule
|
27
|
+
def initialize(name, *components)
|
28
|
+
@components = components
|
29
|
+
@action = NULL_ACTION
|
30
|
+
@name = name
|
31
|
+
@terminal = components.length == 1 && !components.first.kind_of?(Symbol)
|
32
|
+
@assoc = :right
|
33
|
+
@prec = 0
|
34
|
+
|
35
|
+
@components.each do |c|
|
36
|
+
unless Regexp === c || String === c || Symbol === c
|
37
|
+
raise ArgumentError, "Unsupported rule component #{c.class}"
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
pattern = @components.first
|
42
|
+
|
43
|
+
if @terminal
|
44
|
+
@pattern = if pattern.kind_of?(Regexp)
|
45
|
+
Regexp.new("\\G#{pattern}")
|
46
|
+
else
|
47
|
+
Regexp.new("\\G#{Regexp.escape(pattern)}")
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
# Predicate check for whether or not the Rule represents a terminal symbol.
|
53
|
+
#
|
54
|
+
# A terminal symbol is effectively any rule that directly matches some
|
55
|
+
# pattern in the input string and references no other rules.
|
56
|
+
#
|
57
|
+
# @return [Boolean]
|
58
|
+
# true if this rule represents a terminal symbol
|
59
|
+
def terminal?
  # Decided once in the constructor from the shape of the components.
  @terminal
end
|
62
|
+
|
63
|
+
# Walks all possible branches from the given rule, building a parse table.
|
64
|
+
#
|
65
|
+
# The parse table is a list of instructions (transitions) that can be looked
|
66
|
+
# up, given the current parser state and the current lookahead token.
|
67
|
+
#
|
68
|
+
# @param [Hash<Fixnum,Hash>] table
|
69
|
+
# the table to construct for
|
70
|
+
#
|
71
|
+
# @param [Parser] parser
|
72
|
+
# the Parser containing all the Rules in the grammar
|
73
|
+
#
|
74
|
+
# @param [Hash] context
|
75
|
+
# a Hash used to track state as the grammar is analyzed
|
76
|
+
def build_parse_table(table, parser, context)
  offset      = context[:offset]
  state       = (table[context[:state]] ||= {})
  sym         = components[offset]
  rule        = parser.rules[sym]
  next_offset = offset + 1

  # Reuse the target state of an existing transition on this symbol, if there
  # is one; otherwise derive an ID from this rule and the next offset.
  existing    = state.key?(sym) ? state[sym][:state] : nil
  next_state  = existing || [self, next_offset].hash

  if sym.nil?
    # Past the last component: record a reduction under the nil key.
    state[sym] = {
      :action => :reduce,
      :rule   => self,
      :prec   => context[:prec]
    }
    return resolve_conflicts(state)
  end

  raise GrammarError, "Unreferenced rule #{sym.inspect}" if rule.nil?

  terminal  = rule.terminal?
  # A terminal contributes its own precedence; otherwise carry the current one.
  next_prec = terminal ? rule.prec : context[:prec]

  if terminal
    state[sym] = {
      :action => :shift,
      :state  => next_state,
      :prec   => next_prec,
      :assoc  => rule.assoc
    }
  else
    state[sym] = {
      :action => :goto,
      :state  => next_state
    }

    # Expand the referenced rule from its beginning within this same state.
    rule.build_parse_table(
      table,
      parser,
      :state  => context[:state],
      :seen   => context[:seen],
      :offset => 0,
      :prec   => 0
    )
  end

  # Continue with this rule's next component in the follow-on state.
  build_parse_table(
    table,
    parser,
    :state  => next_state,
    :seen   => context[:seen],
    :offset => next_offset,
    :prec   => next_prec
  )

  resolve_conflicts(state)
end
|
139
|
+
|
140
|
+
# Specify how this Rule should be reduced.
|
141
|
+
#
|
142
|
+
# Given a block, the Rule will be reduced by passing the result of reducing
|
143
|
+
# all inputs as arguments to the block.
|
144
|
+
#
|
145
|
+
# Given the Symbol :value, the matched input will be returned verbatim.
|
146
|
+
# Given the Symbol :nothing, nil will be returned; you can use this to
|
147
|
+
# skip whitespace and comments, for example.
|
148
|
+
#
|
149
|
+
# @param [Symbol] preset
|
150
|
+
# one of the preset actions, :value or :nothing; optional
|
151
|
+
#
|
152
|
+
# @return [Rule]
|
153
|
+
# returns self
|
154
|
+
def as(preset = nil, &block)
  # Pick the action first; @action is only assigned if no error is raised.
  @action =
    case preset
    when :value   then DUMP_ACTION
    when :nothing then NULL_ACTION
    when nil
      raise ArgumentError, "Rule#as expected a block, not none given" unless block_given?
      block
    else
      raise ArgumentError, "Invalid preset #{preset.inspect} to Rule#as"
    end

  self
end
|
167
|
+
|
168
|
+
# Set the associativity of this Rule.
|
169
|
+
#
|
170
|
+
# Accepts values of :left, :right (default) or :nonassoc.
|
171
|
+
#
|
172
|
+
# @param [Symbol] assoc
|
173
|
+
# one of :left, :right or :nonassoc
|
174
|
+
#
|
175
|
+
# @return [Rule]
|
176
|
+
# returns self
|
177
|
+
def %(assoc)
  # Reject anything outside the three supported associativities.
  unless [:left, :right, :nonassoc].include?(assoc)
    raise ArgumentError, "Invalid associativity #{assoc.inspect}"
  end

  @assoc = assoc
  self
end
|
183
|
+
|
184
|
+
# Set the precedence of this Rule, as an Integer.
|
185
|
+
#
|
186
|
+
# The higher the number, the higher the precedence.
|
187
|
+
#
|
188
|
+
# @param [Fixnum] prec
|
189
|
+
# the precedence (default is zero)
|
190
|
+
def ^(prec)
  # Anything that can be converted with #to_i is accepted.
  unless prec.respond_to?(:to_i)
    raise ArgumentError, "Invalid precedence level #{prec.inspect}"
  end

  @prec = prec.to_i
  self
end
|
196
|
+
|
197
|
+
# Invoked for terminal rules during lexing, ignored for nonterminal rules.
|
198
|
+
#
|
199
|
+
# @param [String] source
|
200
|
+
# the input String to scan
|
201
|
+
#
|
202
|
+
# @param [Fixnum] offset
|
203
|
+
# the current index in the search
|
204
|
+
#
|
205
|
+
# @param [Fixnum] line
|
206
|
+
# the line the lexer was up to when the previous token was matched
|
207
|
+
#
|
208
|
+
# @return [Hash]
|
209
|
+
# a Hash representing the token, containing :rule, :value, :line and
|
210
|
+
# :discarded, if the token is to be skipped.
|
211
|
+
#
|
212
|
+
# Returns nil if nothing is matched.
|
213
|
+
# Tries to match this rule's pattern in +source+ at exactly +offset+.
#
# Returns nil for nonterminal rules and when the pattern does not match.
# Otherwise returns a token Hash with:
#   :rule      - this rule
#   :value     - the matched text
#   :line      - +line+ plus the number of newlines in the matched text
#   :discarded - true when the rule's action is NULL_ACTION
def scan(source, offset, line)
  return nil unless @terminal

  match = source.match(@pattern, offset)
  return nil unless match

  text = match[0]

  {
    :rule      => self,
    :value     => text,
    # Count the newlines directly rather than padding the text and splitting
    # it into lines; the resulting number is the same.
    :line      => line + text.count("\n"),
    :discarded => @action.equal?(NULL_ACTION)
  }
end
|
226
|
+
|
227
|
+
private
|
228
|
+
|
229
|
+
def resolve_conflicts(instructions)
  reduction = instructions.values.find { |i| i[:action] == :reduce }
  return nil unless reduction

  # With a reduction present, drop every shift that loses to it: either the
  # reduction has strictly higher precedence, or the precedences tie and the
  # shifted token is left-associative.
  instructions.reject! do |_, instruction|
    next false unless instruction[:action] == :shift

    reduction[:prec] > instruction[:prec] ||
      (reduction[:prec] == instruction[:prec] && instruction[:assoc] == :left)
  end
end
|
238
|
+
end
|
239
|
+
end
|
@@ -0,0 +1,118 @@
|
|
1
|
+
# Whittle: A little LALR(1) parser in pure ruby, without a generator.
|
2
|
+
#
|
3
|
+
# Copyright (c) Chris Corbyn, 2011
|
4
|
+
|
5
|
+
module Whittle
|
6
|
+
# RuleSets are named collections of Rules.
|
7
|
+
#
|
8
|
+
# When you use the name of a rule in the grammar, you actually refer to the
|
9
|
+
# entire RuleSet and not an individual rule within it (unless of course, it
|
10
|
+
# only contains one Rule)
|
11
|
+
class RuleSet
|
12
|
+
include Enumerable
|
13
|
+
|
14
|
+
# Create a new RuleSet named +name+.
|
15
|
+
#
|
16
|
+
# @param [Symbol, String] name
|
17
|
+
# the name of the rule in the grammar
|
18
|
+
def initialize(name)
  # A set starts out empty; rules are appended through #[].
  @rules = []
  @name  = name
end
|
22
|
+
|
23
|
+
# Enumerate all Rules in the set.
|
24
|
+
def each(&visitor)
  # Iteration is delegated straight to the backing Array of rules.
  @rules.each(&visitor)
end
|
27
|
+
|
28
|
+
# Add a new Rule to the set.
|
29
|
+
#
|
30
|
+
# @param [Object...] components...
|
31
|
+
# a variable list of components (Symbols, Strings, or Regexps)
|
32
|
+
def [](*components)
  # Build the rule under this set's name, remember it, and hand it back.
  rule = Rule.new(@name, *components)
  @rules << rule
  rule
end
|
37
|
+
|
38
|
+
# Invoked during lexing, delegating to each rule in the set.
|
39
|
+
#
|
40
|
+
# @param [String] source
|
41
|
+
# the complete input string
|
42
|
+
#
|
43
|
+
# @param [Fixnum] offset
|
44
|
+
# the current index in the search
|
45
|
+
# @param [Fixnum] line
|
46
|
+
# the current line number
|
47
|
+
#
|
48
|
+
# @return [Hash]
|
49
|
+
# a Hash representing the found token, or nil
|
50
|
+
def scan(source, offset, line)
  # First rule to produce a token wins; later rules are not consulted.
  each do |rule|
    token = rule.scan(source, offset, line)
    return token if token
  end

  nil
end
|
59
|
+
|
60
|
+
# Recursively builds the parse table into +table+.
|
61
|
+
#
|
62
|
+
# @param [Hash<Fixnum,Hash>] table
|
63
|
+
# the parse table as constructed so far
|
64
|
+
#
|
65
|
+
# @param [Parser] parser
|
66
|
+
# the parser containing the grammar
|
67
|
+
#
|
68
|
+
# @param [Hash] context
|
69
|
+
# a Hash used to track state when building the parse table
|
70
|
+
#
|
71
|
+
# @return [Hash]
|
72
|
+
# the parse table
|
73
|
+
def build_parse_table(table, parser, context)
  visit = [context[:state], self]
  seen  = context[:seen]

  # Each (state, ruleset) pair is expanded only once, so a pair that is
  # reached again is skipped.
  return table if seen.include?(visit)

  seen << visit
  each { |rule| rule.build_parse_table(table, parser, context) }
  table
end
|
84
|
+
|
85
|
+
# Predicate test for whether or not this RuleSet references a single
|
86
|
+
# terminal Symbol.
|
87
|
+
#
|
88
|
+
# @return [Boolean]
|
89
|
+
# true if this rule is a terminal symbol
|
90
|
+
def terminal?
  # Only a set holding exactly one rule can be terminal, and then only if
  # that one rule is.
  return false unless @rules.length == 1

  @rules.first.terminal?
end
|
93
|
+
|
94
|
+
# Predicate test for whether or not this RuleSet references a nonterminal Symbol.
|
95
|
+
#
|
96
|
+
# @return [Boolean]
|
97
|
+
# true if this rule is a nonterminal symbol
|
98
|
+
def nonterminal?
  # Strict logical complement of #terminal?.
  terminal? ? false : true
end
|
101
|
+
|
102
|
+
# Convenience method to access the precedence of a RuleSet representing a terminal.
|
103
|
+
#
|
104
|
+
# @return [Fixnum]
|
105
|
+
# the precedence of the terminal Symbol, or zero for nonterminals.
|
106
|
+
def prec
  # Only a terminal set exposes its rule's precedence; otherwise zero.
  if terminal?
    @rules.first.prec
  else
    0
  end
end
|
109
|
+
|
110
|
+
# Convenience method to access the associativity of a RuleSet representing a terminal.
|
111
|
+
#
|
112
|
+
# @return [Symbol]
|
113
|
+
# the associativity of the terminal Symbol.
|
114
|
+
def assoc
  # Only a terminal set exposes its rule's associativity; otherwise :right.
  if terminal?
    @rules.first.assoc
  else
    :right
  end
end
|
117
|
+
end
|
118
|
+
end
|
data/lib/whittle.rb
ADDED