whittle 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,343 @@
1
+ # Whittle: A little LALR(1) parser in pure ruby, without a generator.
2
+ #
3
+ # Copyright (c) Chris Corbyn, 2011
4
+
5
+ module Whittle
6
+ # Parsers are created by subclassing the Parser class and defining a context-free grammar.
7
+ #
8
+ # Unlike other LALR(1) parsers, Whittle does not rely on code-generation, instead it
9
+ # synthesizes a parse table from the grammar at runtime, on the first parse.
10
+ #
11
+ # While Whittle's implementation works a little differently to yacc/bison and ruby parser
12
+ # generators like racc and citrus, the parseable grammars are the same. LALR(1) parsers are
13
+ # very powerful and it is generally said that the languages they cannot parse are difficult
14
+ # for humans to understand.
15
+ #
16
+ # You should refer to the README for a full description of how to use the parser,
17
+ # but a quick example follows.
18
+ #
19
+ # @example A simple Whittle Parser
20
+ #
21
+ # class Calculator < Whittle::Parser
22
+ # rule(:wsp) do |r|
23
+ # r[/\s+/] # skip whitespace
24
+ # end
25
+ #
26
+ # rule(:int) do |r|
27
+ # r[/[0-9]+/].as { |i| Integer(i) }
28
+ # end
29
+ #
30
+ # rule("+") % :left
31
+ # rule("-") % :left
32
+ # rule("/") % :left
33
+ # rule("*") % :left
34
+ #
35
+ # rule(:expr) do |r|
36
+ # r[:expr, "+", :expr].as { |left, _, right| left + right }
37
+ # r[:expr, "-", :expr].as { |left, _, right| left - right }
38
+ # r[:expr, "/", :expr].as { |left, _, right| left / right }
39
+ # r[:expr, "*", :expr].as { |left, _, right| left * right }
40
+ # r[:int].as(:value)
41
+ # end
42
+ #
43
+ # start(:expr)
44
+ # end
45
+ #
46
+ # calculator = Calculator.new
47
+ # calculator.parse("1 + (2 * 6) - 7")
48
+ # # => 6
49
+ class Parser
50
+ class << self
51
+ # Returns a Hash mapping rule names with their RuleSets.
52
+ #
53
+ # @return [Hash<String, RuleSet>]
54
+ # all rules defined by the parser
55
def rules
  # The registry lives in a class-level instance variable and is created
  # lazily the first time it is requested.
  @rules = {} unless @rules
  @rules
end
58
+
59
+ # Declares a new rule.
60
+ #
61
+ # There are two ways to call this method. The most fundamental way is to pass a Symbol
62
+ # in the +name+ parameter, along with a block, in which you will add one or more possible
63
+ # rules.
64
+ #
65
+ # @example Specifying multiple rules with a block
66
+ #
67
+ # rule(:expr) do |r|
68
+ # r[:expr, "+", :expr].as { |a, _, b| a + b }
69
+ # r[:expr, "-", :expr].as { |a, _, b| a - b }
70
+ # r[:expr, "/", :expr].as { |a, _, b| a / b }
71
+ # r[:expr, "*", :expr].as { |a, _, b| a * b }
72
+ # r[:integer].as { |i| Integer(i) }
73
+ # end
74
+ #
75
+ # Each rule specified in this way defines one of many possibilities to describe the input.
76
+ # Rules may refer back to themselves, which means in the above, any integer is a valid
77
+ # expr:
78
+ #
79
+ # 42
80
+ #
81
+ # Therefore any sum of integers is also a valid expr:
82
+ #
83
+ # 42 + 24
84
+ #
85
+ # Therefore any multiplication of sums of integers is also a valid expr, and so on.
86
+ #
87
+ # 42 + 24 * 7 + 52
88
+ #
89
+ # A rule like the above is called a 'nonterminal', because upon recognizing any expr, it
90
+ # is possible for the rule to continue collecting input and becoming a larger expr.
91
+ #
92
+ # In subtle contrast, a rule like the following:
93
+ #
94
+ # rule("+") do |r|
95
+ # r["+"].as { |plus| plus }
96
+ # end
97
+ #
98
+ # Is called a 'terminal' token, since upon recognizing a "+", the parser cannot
99
+ # add further input to the "+" itself... it is the tip of a branch in the parse tree; the
100
+ # branch terminates here, and subsequently the rule is terminal.
101
+ #
102
+ # There is a shorthand way to write the above rule:
103
+ #
104
+ # rule("+")
105
+ #
106
+ # Not given a block, #rule treats the name parameter as a literal token.
107
+ #
108
+ # Note that nonterminal rules are composed of other nonterminal rules and/or terminal
109
+ # rules. Terminal rules contain one, and only one Regexp pattern or fixed string.
110
+ #
111
+ # @param [Symbol, String] name
112
+ # the name of the ruleset (note that one ruleset can contain multiple rules)
113
+ #
114
+ # @return [RuleSet, Rule]
115
+ # the newly created RuleSet if a block was given, otherwise a rule representing a
116
+ # terminal token for the input string +name+.
117
def rule(name)
  # Registering under an existing name replaces the previous RuleSet.
  rule_set = RuleSet.new(name)
  rules[name] = rule_set

  # Without a block the name itself is the literal token to match, and the
  # matched text is passed through verbatim.
  return rule_set[name].as(:value) unless block_given?

  yield rule_set
  rule_set
end
126
+
127
+ # Declares the most general rule that can be used to describe an entire input.
128
+ #
129
+ # Called without any arguments, returns the current start rule.
130
+ #
131
+ # @param [Symbol] name
132
+ # the name of a rule defined in the parser (does not need to be defined beforehand)
133
+ #
134
+ # @return [Symbol]
135
+ # the new (or current) start rule
136
def start(name = nil)
  # Reader form: no argument (or an explicit nil) leaves the start rule as is.
  return @start if name.nil?

  # Writer form: remember the new start rule and hand it back.
  @start = name
end
140
+
141
+ # Returns the numeric value for the initial state (the state ID associated with the start
142
+ # rule).
143
+ #
144
+ # In most LALR(1) parsers, this would be zero, but for implementation reasons, this will
145
+ # be an unpredictably large (or small) number.
146
+ #
147
+ # @return [Fixnum]
148
+ # the ID for the initial state in the parse table
149
def initial_state
  # The start rule may be rewritten by prepare_start_rule, so it must run
  # before the start RuleSet is looked up.
  prepare_start_rule
  start_rule_set = rules[start]

  # State IDs are hashes of [rule-ish object, offset] pairs.
  [start_rule_set, 0].hash
end
153
+
154
+ # Returns the entire parse table used to interpret input into the parser.
155
+ #
156
+ # You should not need to call this method, though you may wish to inspect its contents
157
+ # during debugging.
158
+ #
159
+ # Note that the token +nil+ in the parse table represents "anything" and its action is
160
+ # always to reduce.
161
+ #
162
+ # Shift-reduce conflicts are resolved at runtime and therefore remain in the parse table.
163
+ #
164
+ # @return [Hash]
165
+ # a 2-dimensional Hash representing states with actions to perform for a given lookahead
166
def parse_table
  # The table is built once and then served from the memoized copy.
  return @parse_table if @parse_table

  prepare_start_rule
  start_rule_set = rules[start]

  # Seed context for the recursive walk over the grammar.
  context = {
    :state  => initial_state,
    :seen   => [],
    :offset => 0,
    :prec   => 0
  }

  @parse_table = start_rule_set.build_parse_table({}, self, context)
end
181
+
182
+ private
183
+
184
# Validates the start rule and, when it names a terminal, wraps it in a
# synthetic :* rule which then becomes the start rule.
#
# Raises GrammarError if the start rule has not been defined.
def prepare_start_rule
  raise GrammarError, "Undefined start rule #{start.inspect}" unless rules.key?(start)

  # Nothing to do unless the start rule is a terminal.
  return unless rules[start].terminal?

  # `start` is read while the block runs, i.e. before it is switched to :*.
  rule(:*) { |r| r[start].as { |prog| prog } }
  start(:*)
end
195
+ end
196
+
197
+ # Alias for class method Parser.rules
198
+ #
199
+ # @see Parser.rules
200
def rules
  # Instances share the grammar declared on their class.
  grammar = self.class
  grammar.rules
end
203
+
204
+ # Accepts input in the form of a String and attempts to parse it according to the grammar.
205
+ #
206
+ # The input is scanned using a lexical analysis routine, defined by the #lex method. Each
207
+ # token detected by the routine is used to pick an action from the parse table. Each
208
+ # reduction initially builds a branch in an AST (abstract syntax tree), until all input has
209
+ # been read and the start rule has been recognized, at which point the AST is evaluated by
210
+ # invoking the callbacks defined in the grammar in a depth-first fashion.
211
+ #
212
+ # If the parser encounters a token it does not recognise, a parse error will be raised,
213
+ # specifying what was expected, what was received, and on which line the error occurred.
214
+ #
215
+ # A successful parse returns the result of evaluating the start rule, whatever that may be.
216
+ #
217
+ # @param [String] input
218
+ # a complete input string to parse according to the grammar
219
+ #
220
+ # @return [Object]
221
+ # whatever the grammar defines
222
def parse(input)
  table  = self.class.parse_table
  states = [self.class.initial_state]
  args   = []
  line   = 1

  lex(input) do |token|
    line   = token[:line]
    symbol = token # current symbol: the lookahead token, or a freshly reduced nonterminal

    # Keep acting on this token until it has been shifted onto the stack.
    catch(:shifted) do
      loop do
        state = table[states.last]
        ins   = state[symbol[:name]] || state[nil] # nil is the catch-all entry

        unless ins
          error(state, symbol, :states => states, :args => args)
          next # an overridden #error may return; retry with the current state
        end

        case ins[:action]
        when :shift
          # Terminal: wrap its value as the sole argument and consume the token.
          symbol[:args] = [symbol.delete(:value)]
          states << ins[:state]
          args << symbol
          throw :shifted
        when :reduce
          # Collapse the rule's components into a single tree node.
          rule   = ins[:rule]
          size   = rule.components.length
          symbol = {
            :rule => rule,
            :name => rule.name,
            :line => line,
            :args => args.pop(size)
          }
          states.pop(size)
          args << symbol

          # Back at the initial state with no input left: evaluate the tree.
          return accept(args.pop) if states.length == 1 && token[:name] == :$end
        when :goto
          # The nonterminal has been recorded; resume with the pending token.
          symbol = token
          states << ins[:state]
        end
      end
    end
  end
end
266
+
267
+ # Accepts a String as input and repeatedly yields terminal tokens found in the grammar.
268
+ #
269
+ # The last token yielded is always named :$end and has the value of +nil+.
270
+ #
271
+ # You may override this method to define a smarter implementation, should you need to.
272
+ #
273
+ # @param [String] input
274
+ # the complete input string to lex
275
def lex(input)
  line   = 1
  offset = 0
  ending = input.length

  until offset == ending
    token = next_token(input, offset, line)

    # nil means no rule matched. An empty match would never advance the
    # offset and so would loop forever; treat both as unconsumed input.
    if token.nil? || token[:value].empty?
      raise UnconsumedInputError,
        "Unmatched input #{input[offset..-1].inspect} on line #{line}"
    end

    offset += token[:value].length

    # The scanner reports the line the token *ends* on. Keep that for the next
    # token and stamp this one with the line it started on.
    line, token[:line] = token[:line], line

    yield token unless token[:discarded]
  end

  # Always finish with the end-of-input marker.
  yield({ :name => :$end, :line => line, :value => nil })
end
293
+
294
+ # Invoked when the parser detects an error.
295
+ #
296
+ # The default implementation raises a ParseError specifying the allowed inputs
297
+ # and the received input, along with a line number.
298
+ #
299
+ # You may override this method with your own implementation, which, at least in theory,
300
+ # can recover from the error and allow the parse to continue, though this is an extremely
301
+ # advanced topic and requires a good understanding of how LALR(1) parsers operate.
302
+ #
303
+ # @param [Hash] state
304
+ # the possible actions for the current parser state
305
+ #
306
+ # @param [Hash] input
307
+ # the received token (or, unlikely, a nonterminal symbol)
308
+ #
309
+ # @param [Hash] stack
310
+ # the current parse context (arg stack + state stack)
311
def error(state, input, stack)
  # :goto entries are excluded from the list of expected input.
  expected = state.select { |_sym, ins| ins[:action] != :goto }.keys
  choices  = expected.map(&:inspect).join("; or ")

  # Assemble the message piecewise, then fold it onto a single line.
  fragments = [
    "Parse error:",
    "expected",
    choices,
    "but got",
    input[:name].inspect,
    "on line",
    input[:line].to_s
  ]
  message = fragments.join("\n  ").gsub(/\n\s+/, " ").strip

  raise ParseError.new(message, input[:line], expected, input[:name])
end
325
+
326
+ private
327
+
328
# Asks each RuleSet in turn, in definition order, for a token at +offset+.
#
# Empty matches are skipped: they cannot advance the lexer and would
# otherwise hide later rules that match real input.
#
# Returns the token Hash, tagged with the name of the RuleSet that produced
# it, or nil if no rule matches.
def next_token(source, offset, line)
  rules.each do |name, rule_set|
    token = rule_set.scan(source, offset, line)
    next unless token
    next if token[:value].to_s.empty?

    token[:name] = name
    return token
  end

  nil
end
338
+
339
# Evaluates a parse tree depth-first: nested Hash nodes are evaluated
# recursively, then the node's rule action is called with the results.
def accept(tree)
  action = tree[:rule].action

  evaluated = tree[:args].map do |arg|
    case arg
    when Hash then accept(arg)
    else arg
    end
  end

  action.call(*evaluated)
end
342
+ end
343
+ end
@@ -0,0 +1,239 @@
1
+ # Whittle: A little LALR(1) parser in pure ruby, without a generator.
2
+ #
3
+ # Copyright (c) Chris Corbyn, 2011
4
+
5
+ module Whittle
6
+ # Represents an individual Rule, forming part of an overall RuleSet.
7
+ class Rule
8
# Action that ignores its arguments and yields nil; rules still carrying it
# are reported as discarded by the scanner.
NULL_ACTION = proc {}
# Action that returns its first argument unchanged.
DUMP_ACTION = proc { |input| input }
10
+
11
# Read-only accessors: owning RuleSet name, reduction action, component list,
# associativity and precedence.
attr_reader :name, :action, :components, :assoc, :prec
16
+
17
+ # Create a new Rule for the RuleSet named +name+.
18
+ #
19
+ # The components can either be names of other Rules, or for a terminal Rule,
20
+ # a single pattern to match in the input string.
21
+ #
22
+ # @param [String] name
23
+ # the name of the RuleSet to which this Rule belongs
24
+ #
25
+ # @param [Object...] components...
26
+ # a variable list of components that make up the Rule
27
+ def initialize(name, *components)
28
+ @components = components
29
+ @action = NULL_ACTION
30
+ @name = name
31
+ @terminal = components.length == 1 && !components.first.kind_of?(Symbol)
32
+ @assoc = :right
33
+ @prec = 0
34
+
35
+ @components.each do |c|
36
+ unless Regexp === c || String === c || Symbol === c
37
+ raise ArgumentError, "Unsupported rule component #{c.class}"
38
+ end
39
+ end
40
+
41
+ pattern = @components.first
42
+
43
+ if @terminal
44
+ @pattern = if pattern.kind_of?(Regexp)
45
+ Regexp.new("\\G#{pattern}")
46
+ else
47
+ Regexp.new("\\G#{Regexp.escape(pattern)}")
48
+ end
49
+ end
50
+ end
51
+
52
+ # Predicate check for whether or not the Rule represents a terminal symbol.
53
+ #
54
+ # A terminal symbol is effectively any rule that directly matches some
55
+ # pattern in the input string and references no other rules.
56
+ #
57
+ # @return [Boolean]
58
+ # true if this rule represents a terminal symbol
59
def terminal?
  # Decided once, at construction time, from the shape of the components.
  @terminal
end
62
+
63
+ # Walks all possible branches from the given rule, building a parse table.
64
+ #
65
+ # The parse table is a list of instructions (transitions) that can be looked
66
+ # up, given the current parser state and the current lookahead token.
67
+ #
68
+ # @param [Hash<Fixnum,Hash>] table
69
+ # the table to construct for
70
+ #
71
+ # @param [Parser] parser
72
+ # the Parser containing all the Rules in the grammar
73
+ #
74
+ # @param [Hash] context
75
+ # a Hash used to track state as the grammar is analyzed
76
def build_parse_table(table, parser, context)
  state      = table[context[:state]] ||= {}
  sym        = components[context[:offset]] # nil once past the last component
  rule       = parser.rules[sym]
  new_offset = context[:offset] + 1

  # Reuse the target state if this symbol already has a transition recorded,
  # otherwise derive a fresh state ID from this rule and the next offset.
  recorded  = state.key?(sym) ? state[sym][:state] : nil
  new_state = recorded || [self, new_offset].hash

  # End of the rule: whatever comes next, reduce.
  if sym.nil?
    state[sym] = {
      :action => :reduce,
      :rule   => self,
      :prec   => context[:prec]
    }
    return resolve_conflicts(state)
  end

  raise GrammarError, "Unreferenced rule #{sym.inspect}" if rule.nil?

  is_terminal = rule.terminal?
  # Terminals supply their own precedence; otherwise the current one carries on.
  new_prec    = is_terminal ? rule.prec : context[:prec]

  if is_terminal
    state[sym] = {
      :action => :shift,
      :state  => new_state,
      :prec   => new_prec,
      :assoc  => rule.assoc
    }
  else
    state[sym] = {
      :action => :goto,
      :state  => new_state
    }

    # Expand the nonterminal's own rules into the *current* state.
    rule.build_parse_table(
      table,
      parser,
      {
        :state  => context[:state],
        :seen   => context[:seen],
        :offset => 0,
        :prec   => 0
      }
    )
  end

  # Continue with the next component of this rule in the target state.
  build_parse_table(
    table,
    parser,
    {
      :state  => new_state,
      :seen   => context[:seen],
      :offset => new_offset,
      :prec   => new_prec
    }
  )

  resolve_conflicts(state)
end
139
+
140
+ # Specify how this Rule should be reduced.
141
+ #
142
+ # Given a block, the Rule will be reduced by passing the result of reducing
143
+ # all inputs as arguments to the block.
144
+ #
145
+ # Given the Symbol :value, the matched input will be returned verbatim.
146
+ # Given the Symbol :nothing, nil will be returned; you can use this to
147
+ # skip whitespace and comments, for example.
148
+ #
149
+ # @param [Symbol] preset
150
+ # one of the preset actions, :value or :nothing; optional
151
+ #
152
+ # @return [Rule]
153
+ # returns self
154
def as(preset = nil, &block)
  # Resolve the action first, so a failed call leaves the old action intact.
  @action =
    case preset
    when :value   then DUMP_ACTION
    when :nothing then NULL_ACTION
    when nil
      raise ArgumentError, "Rule#as expected a block, but none given" unless block
      block
    else
      raise ArgumentError, "Invalid preset #{preset.inspect} to Rule#as"
    end

  self # allow chaining, e.g. r[...].as(...) % :left
end
167
+
168
+ # Set the associativity of this Rule.
169
+ #
170
+ # Accepts values of :left, :right (default) or :nonassoc.
171
+ #
172
+ # @param [Symbol] assoc
173
+ # one of :left, :right or :nonassoc
174
+ #
175
+ # @return [Rule]
176
+ # returns self
177
def %(assoc)
  unless [:left, :right, :nonassoc].include?(assoc)
    raise ArgumentError, "Invalid associativity #{assoc.inspect}"
  end

  @assoc = assoc
  self # chainable
end
183
+
184
+ # Set the precedence of this Rule, as an Integer.
185
+ #
186
+ # The higher the number, the higher the precedence.
187
+ #
188
+ # @param [Fixnum] prec
189
+ # the precedence (default is zero)
190
def ^(prec)
  # Anything convertible with #to_i is accepted and stored as an Integer.
  unless prec.respond_to?(:to_i)
    raise ArgumentError, "Invalid precedence level #{prec.inspect}"
  end

  @prec = prec.to_i
  self # chainable
end
196
+
197
+ # Invoked for terminal rules during lexing, ignored for nonterminal rules.
198
+ #
199
+ # @param [String] source
200
+ # the input String to scan
201
+ #
202
+ # @param [Fixnum] offset
203
+ # the current index in the search
204
+ #
205
+ # @param [Fixnum] line
206
+ # the line the lexer was up to when the previous token was matched
207
+ #
208
+ # @return [Hash]
209
+ # a Hash representing the token, containing :rule, :value, :line and
210
+ # :discarded, if the token is to be skipped.
211
+ #
212
+ # Returns nil if nothing is matched.
213
def scan(source, offset, line)
  # Only terminal rules have a pattern to match against the input.
  return nil unless @terminal

  match = source.match(@pattern, offset)
  return nil unless match

  text = match[0]

  {
    :rule      => self,
    :value     => text,
    # Line the lexer is on after this token: one more per newline consumed.
    :line      => line + text.count("\n"),
    # Rules still carrying NULL_ACTION are flagged as discarded.
    :discarded => @action.equal?(NULL_ACTION)
  }
end
226
+
227
+ private
228
+
229
# Resolves shift/reduce conflicts within a single state, in place.
#
# When the state contains a reduce, any shift loses to it if the reduce has
# higher precedence, or equal precedence and the shift is left-associative.
def resolve_conflicts(instructions)
  reduction = instructions.values.find { |ins| ins[:action] == :reduce }
  return unless reduction

  instructions.reject! do |_sym, ins|
    next false unless ins[:action] == :shift

    reduction[:prec] > ins[:prec] ||
      (reduction[:prec] == ins[:prec] && ins[:assoc] == :left)
  end
end
238
+ end
239
+ end
@@ -0,0 +1,118 @@
1
+ # Whittle: A little LALR(1) parser in pure ruby, without a generator.
2
+ #
3
+ # Copyright (c) Chris Corbyn, 2011
4
+
5
+ module Whittle
6
+ # RuleSets are named collections of Rules.
7
+ #
8
+ # When you use the name of a rule in the grammar, you actually refer to the
9
+ # entire RuleSet and not an individual rule within it (unless of course, it
10
+ # only contains one Rule)
11
+ class RuleSet
12
+ include Enumerable
13
+
14
+ # Create a new RuleSet named +name+.
15
+ #
16
+ # @param [Symbol, String] name
17
+ # the name of the rule in the grammar
18
def initialize(name)
  # Start with no rules; they are appended through #[].
  @rules = []
  @name  = name
end
22
+
23
+ # Enumerate all Rules in the set.
24
def each(&block)
  # Delegates straight to the backing Array.
  @rules.each(&block)
end
27
+
28
+ # Add a new Rule to the set.
29
+ #
30
+ # @param [Object...] components...
31
+ # a variable list of components (Symbols, Strings, or Regexps)
32
def [](*components)
  # Build the Rule under this set's name, register it, and return it so the
  # caller can chain further configuration onto it.
  rule = Rule.new(@name, *components)
  @rules << rule
  rule
end
37
+
38
+ # Invoked during lexing, delegating to each rule in the set.
39
+ #
40
+ # @param [String] source
41
+ # the complete input string
42
+ #
43
+ # @param [Fixnum] offset
44
+ # the current index in the search
45
+ # @param [Fixnum] line
46
+ # the current line number
47
+ #
48
+ # @return [Hash]
49
+ # a Hash representing the found token, or nil
50
def scan(source, offset, line)
  # First rule to produce a token wins; rules are tried in definition order.
  each do |rule|
    token = rule.scan(source, offset, line)
    return token if token
  end

  nil
end
59
+
60
+ # Recursively builds the parse table into +table+.
61
+ #
62
+ # @param [Hash<Fixnum,Hash>] table
63
+ # the parse table as constructed so far
64
+ #
65
+ # @param [Parser] parser
66
+ # the parser containing the grammar
67
+ #
68
+ # @param [Hash] context
69
+ # a Hash used to track state when building the parse table
70
+ #
71
+ # @return [Hash]
72
+ # the parse table
73
def build_parse_table(table, parser, context)
  # Each (state, RuleSet) pair is expanded at most once; this stops
  # left-recursive grammars from recursing forever.
  visit = [context[:state], self]
  return table if context[:seen].include?(visit)

  context[:seen] << visit

  each { |rule| rule.build_parse_table(table, parser, context) }
  table
end
84
+
85
+ # Predicate test for whether or not this RuleSet references a single
86
+ # terminal Symbol.
87
+ #
88
+ # @return [Boolean]
89
+ # true if this rule is a terminal symbol
90
def terminal?
  # Only a set holding exactly one rule can be a terminal.
  return false unless @rules.length == 1

  @rules.first.terminal?
end
93
+
94
+ # Predicate test for whether or not this RuleSet references a nonterminal Symbol.
95
+ #
96
+ # @return [Boolean]
97
+ # true if this rule is a nonterminal symbol
98
def nonterminal?
  # Exact complement of #terminal?.
  terminal? ? false : true
end
101
+
102
+ # Convenience method to access the precedence of a RuleSet representing a terminal.
103
+ #
104
+ # @return [Fixnum]
105
+ # the precedence of the terminal Symbol, or zero for nonterminals.
106
def prec
  # Nonterminals carry no precedence of their own.
  return 0 unless terminal?

  @rules.first.prec
end
109
+
110
+ # Convenience method to access the associativity of a RuleSet representing a terminal.
111
+ #
112
+ # @return [Symbol]
113
+ # the associativity of the terminal Symbol.
114
def assoc
  # Nonterminals fall back to the default associativity, :right.
  return :right unless terminal?

  @rules.first.assoc
end
117
+ end
118
+ end
@@ -0,0 +1,3 @@
1
module Whittle
  # Gem version string; frozen so it cannot be mutated at runtime.
  VERSION = "0.0.1".freeze
end
data/lib/whittle.rb ADDED
@@ -0,0 +1,8 @@
1
+ require "whittle/version"
2
+ require "whittle/error"
3
+ require "whittle/errors/unconsumed_input_error"
4
+ require "whittle/errors/parse_error"
5
+ require "whittle/errors/grammar_error"
6
+ require "whittle/rule"
7
+ require "whittle/rule_set"
8
+ require "whittle/parser"