whittle 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,343 @@
1
+ # Whittle: A little LALR(1) parser in pure ruby, without a generator.
2
+ #
3
+ # Copyright (c) Chris Corbyn, 2011
4
+
5
+ module Whittle
6
+ # Parsers are created by subclassing the Parser class and defining a context-free grammar.
7
+ #
8
+ # Unlike other LALR(1) parsers, Whittle does not rely on code-generation, instead it
9
+ # synthesizes a parse table from the grammar at runtime, on the first parse.
10
+ #
11
+ # While Whittle's implementation works a little differently to yacc/bison and ruby parser
12
+ # generators like racc and citrus, the parseable grammars are the same. LALR(1) parsers are
13
+ # very powerful and it is generally said that the languages they cannot parse are difficult
14
+ # for humans to understand.
15
+ #
16
+ # You should refer to the README for a full description of how to use the parser,
17
+ # but a quick example follows.
18
+ #
19
+ # @example A simple Whittle Parser
20
+ #
21
+ # class Calculator < Whittle::Parser
22
+ # rule(:wsp) do |r|
23
+ # r[/\s+/] # skip whitespace
24
+ # end
25
+ #
26
+ # rule(:int) do |r|
27
+ # r[/[0-9]+/].as { |i| Integer(i) }
28
+ # end
29
+ #
30
+ # rule("+") % :left
31
+ # rule("-") % :left
32
+ # rule("/") % :left
33
+ # rule("*") % :left
34
+ #
35
+ # rule(:expr) do |r|
36
+ # r[:expr, "+", :expr].as { |left, _, right| left + right }
37
+ # r[:expr, "-", :expr].as { |left, _, right| left - right }
38
+ # r[:expr, "/", :expr].as { |left, _, right| left / right }
39
+ # r[:expr, "*", :expr].as { |left, _, right| left * right }
40
+ # r[:int].as(:value)
41
+ # end
42
+ #
43
+ # start(:expr)
44
+ # end
45
+ #
46
+ # calculator = Calculator.new
47
+ # calculator.parse("1 + (2 * 6) - 7")
48
+ # # => 6
49
+ class Parser
50
+ class << self
51
+ # Returns a Hash mapping rule names to their RuleSets.
52
+ #
53
+ # @return [Hash<String, RuleSet>]
54
+ # all rules defined by the parser
55
def rules
  # Create the registry lazily, the first time anything asks for it.
  @rules = {} unless @rules
  @rules
end
58
+
59
+ # Declares a new rule.
60
+ #
61
+ # There are two ways to call this method. The most fundamental way is to pass a Symbol
62
+ # in the +name+ parameter, along with a block, in which you will add one or more possible
63
+ # rules.
64
+ #
65
+ # @example Specifying multiple rules with a block
66
+ #
67
+ # rule(:expr) do |r|
68
+ # r[:expr, "+", :expr].as { |a, _, b| a + b }
69
+ # r[:expr, "-", :expr].as { |a, _, b| a - b }
70
+ # r[:expr, "/", :expr].as { |a, _, b| a / b }
71
+ # r[:expr, "*", :expr].as { |a, _, b| a * b }
72
+ # r[:integer].as { |i| Integer(i) }
73
+ # end
74
+ #
75
+ # Each rule specified in this way defines one of many possibilities to describe the input.
76
+ # Rules may refer back to themselves, which means in the above, any integer is a valid
77
+ # expr:
78
+ #
79
+ # 42
80
+ #
81
+ # Therefore any sum of integers is also a valid expr:
82
+ #
83
+ # 42 + 24
84
+ #
85
+ # Therefore any multiplication of sums of integers is also a valid expr, and so on.
86
+ #
87
+ # 42 + 24 * 7 + 52
88
+ #
89
+ # A rule like the above is called a 'nonterminal', because upon recognizing any expr, it
90
+ # is possible for the rule to continue collecting input and becoming a larger expr.
91
+ #
92
+ # In subtle contrast, a rule like the following:
93
+ #
94
+ # rule("+") do |r|
95
+ # r["+"].as { |plus| plus }
96
+ # end
97
+ #
98
+ # Is called a 'terminal' token, since upon recognizing a "+", the parser cannot
99
+ # add further input to the "+" itself... it is the tip of a branch in the parse tree; the
100
+ # branch terminates here, and subsequently the rule is terminal.
101
+ #
102
+ # There is a shorthand way to write the above rule:
103
+ #
104
+ # rule("+")
105
+ #
106
+ # Not given a block, #rule treats the name parameter as a literal token.
107
+ #
108
+ # Note that nonterminal rules are composed of other nonterminal rules and/or terminal
109
+ # rules. Terminal rules contain one, and only one Regexp pattern or fixed string.
110
+ #
111
+ # @param [Symbol, String] name
112
+ # the name of the ruleset (note that one ruleset can contain multiple rules)
113
+ #
114
+ # @return [RuleSet, Rule]
115
+ # the newly created RuleSet if a block was given, otherwise a rule representing a
116
+ # terminal token for the input string +name+.
117
def rule(name)
  registry = rules
  rule_set = RuleSet.new(name)
  registry[name] = rule_set

  # Without a block the name itself is the literal token to match, and the
  # matched text is handed back verbatim.
  return rule_set[name].as(:value) unless block_given?

  yield rule_set
  rule_set
end
126
+
127
+ # Declares the most general rule that can be used to describe an entire input.
128
+ #
129
+ # Called without any arguments, returns the current start rule.
130
+ #
131
+ # @param [Symbol] name
132
+ # the name of a rule defined in the parser (does not need to be defined beforehand)
133
+ #
134
+ # @return [Symbol]
135
+ # the new (or current) start rule
136
def start(name = nil)
  # Called without an argument this is a plain reader.
  return @start if name.nil?

  @start = name
end
140
+
141
+ # Returns the numeric value for the initial state (the state ID associated with the start
142
+ # rule).
143
+ #
144
+ # In most LALR(1) parsers, this would be zero, but for implementation reasons, this will
145
+ # be an unpredictably large (or small) number.
146
+ #
147
+ # @return [Fixnum]
148
+ # the ID for the initial state in the parse table
149
def initial_state
  prepare_start_rule

  # The state ID is the hash of the start RuleSet paired with offset zero.
  start_rule_set = rules[start]
  [start_rule_set, 0].hash
end
153
+
154
+ # Returns the entire parse table used to interpret input into the parser.
155
+ #
156
+ # You should not need to call this method, though you may wish to inspect its contents
157
+ # during debugging.
158
+ #
159
+ # Note that the token +nil+ in the parse table represents "anything" and its action is
160
+ # always to reduce.
161
+ #
162
+ # Shift-reduce conflicts are resolved at runtime and therefore remain in the parse table.
163
+ #
164
+ # @return [Hash]
165
+ # a 2-dimensional Hash representing states with actions to perform for a given lookahead
166
def parse_table
  return @parse_table if @parse_table

  prepare_start_rule

  # Look up the start RuleSet before computing the initial state, so the
  # evaluation order stays: prepare, fetch start rule, build context.
  start_rule_set = rules[start]
  seed_context   = {
    :state  => initial_state,
    :seen   => [],
    :offset => 0,
    :prec   => 0
  }

  @parse_table = start_rule_set.build_parse_table({}, self, seed_context)
end
181
+
182
+ private
183
+
184
def prepare_start_rule
  unless rules.key?(start)
    raise GrammarError, "Undefined start rule #{start.inspect}"
  end

  return unless rules[start].terminal?

  # A terminal start symbol is wrapped in a synthetic rule named :*, which
  # then becomes the start rule and simply passes the value through.
  rule(:*) do |r|
    r[start].as { |prog| prog }
  end

  start(:*)
end
195
+ end
196
+
197
+ # Alias for class method Parser.rules
198
+ #
199
+ # @see Parser.rules
200
def rules
  # Rules are stored on the class; instances delegate to it.
  parser_class = self.class
  parser_class.rules
end
203
+
204
+ # Accepts input in the form of a String and attempts to parse it according to the grammar.
205
+ #
206
+ # The input is scanned using a lexical analysis routine, defined by the #lex method. Each
207
+ # token detected by the routine is used to pick an action from the parse table. Each
208
+ # reduction initially builds a branch in an AST (abstract syntax tree), until all input has
209
+ # been read and the start rule has been recognized, at which point the AST is evaluated by
210
+ # invoking the callbacks defined in the grammar in a depth-first fashion.
211
+ #
212
+ # If the parser encounters a token it does not recognise, a parse error will be raised,
213
+ # specifying what was expected, what was received, and on which line the error occurred.
214
+ #
215
+ # A successful parse returns the result of evaluating the start rule, whatever that may be.
216
+ #
217
+ # @param [String] input
218
+ # a complete input string to parse according to the grammar
219
+ #
220
+ # @return [Object]
221
+ # whatever the grammar defines
222
def parse(input)
  table  = self.class.parse_table
  states = [self.class.initial_state]
  args   = []
  line   = 1

  lex(input) do |token|
    line   = token[:line]
    # The symbol being acted on: the lookahead token, or (right after a
    # reduce) the nonterminal that was just recognised.
    symbol = token

    catch(:shifted) do
      loop do
        state       = table[states.last]
        instruction = state[symbol[:name]] || state[nil]

        # No entry for this symbol and no catch-all reduce: report it. The
        # loop retries afterwards in case an overridden #error returns.
        unless instruction
          error(state, symbol, :states => states, :args => args)
          next
        end

        case instruction[:action]
        when :shift
          # Consume the token and wait for the next one from the lexer.
          symbol[:args] = [symbol.delete(:value)]
          states.push(instruction[:state])
          args.push(symbol)
          throw :shifted
        when :reduce
          reduced = instruction[:rule]
          arity   = reduced.components.length
          symbol  = {
            :rule => reduced,
            :name => reduced.name,
            :line => line,
            :args => args.pop(arity)
          }
          states.pop(arity)
          args.push(symbol)

          # Back at the initial state with no input left: evaluate the tree.
          return accept(args.pop) if states.length == 1 && token[:name] == :$end
        when :goto
          # Nonterminal handled; resume with the pending lookahead token.
          symbol = token
          states.push(instruction[:state])
        end
      end
    end
  end
end
266
+
267
+ # Accepts a String as input and repeatedly yields terminal tokens found in the grammar.
268
+ #
269
+ # The last token yielded is always named :$end and has the value of +nil+.
270
+ #
271
+ # You may override this method to define a smarter implementation, should you need to.
272
+ #
273
+ # @param [String] input
274
+ # the complete input string to lex
275
def lex(input)
  line   = 1
  offset = 0
  ending = input.length

  while offset != ending
    token = next_token(input, offset, line)

    if token.nil?
      raise UnconsumedInputError,
            "Unmatched input #{input[offset..-1].inspect} on line #{line}"
    end

    offset += token[:value].length

    # The token arrives carrying the line reached at its end; swap so that
    # the token records the line it started on and the lexer moves forward.
    reached_line = token[:line]
    token[:line] = line
    line         = reached_line

    yield token unless token[:discarded]
  end

  yield({ :name => :$end, :line => line, :value => nil })
end
293
+
294
+ # Invoked when the parser detects an error.
295
+ #
296
+ # The default implementation raises a ParseError specifying the allowed inputs
297
+ # and the received input, along with a line number.
298
+ #
299
+ # You may override this method with your own implementation, which, at least in theory,
300
+ # can recover from the error and allow the parse to continue, though this is an extremely
301
+ # advanced topic and requires a good understanding of how LALR(1) parsers operate.
302
+ #
303
+ # @param [Hash] state
304
+ # the possible actions for the current parser state
305
+ #
306
+ # @param [Hash] input
307
+ # the received token (or, unlikely, a nonterminal symbol)
308
+ #
309
+ # @param [Hash] stack
310
+ # the current parse context (arg stack + state stack)
311
def error(state, input, stack)
  # goto entries are keyed by nonterminals, so they are left out of the list
  # of acceptable inputs.
  expected = state.reject { |_, instruction| instruction[:action] == :goto }.keys
  choices  = expected.map { |token_name| token_name.inspect }.join("; or ")

  # The heredoc is flattened onto a single line: every newline plus the
  # indentation after it collapses into one space.
  message = <<-ERROR.gsub(/\n\s+/, " ").strip
    Parse error:
      expected
        #{choices}
      but got
        #{input[:name].inspect}
      on line
        #{input[:line]}
  ERROR

  raise ParseError.new(message, input[:line], expected, input[:name])
end
325
+
326
+ private
327
+
328
def next_token(source, offset, line)
  # Try each ruleset in definition order; the first one producing a usable
  # token wins and the token is tagged with that ruleset's name.
  rules.each do |name, rule|
    token = rule.scan(source, offset, line)

    # A zero-width match (e.g. from /\s*/) consumes no input, so #lex could
    # never advance past it and would loop forever. Treat it as "no match"
    # and let the remaining rules have a go.
    next if token.nil? || token[:value].to_s.empty?

    token[:name] = name
    return token
  end

  # Nothing in the grammar matches at this offset.
  nil
end
338
+
339
def accept(tree)
  action = tree[:rule].action

  # Hash arguments are subtrees that still need evaluating; anything else is
  # passed to the action as it is.
  evaluated = tree[:args].map do |arg|
    if Hash === arg
      accept(arg)
    else
      arg
    end
  end

  action.call(*evaluated)
end
342
+ end
343
+ end
@@ -0,0 +1,239 @@
1
+ # Whittle: A little LALR(1) parser in pure ruby, without a generator.
2
+ #
3
+ # Copyright (c) Chris Corbyn, 2011
4
+
5
+ module Whittle
6
+ # Represents an individual Rule, forming part of an overall RuleSet.
7
+ class Rule
8
# Action that ignores its arguments and evaluates to nil.
NULL_ACTION = proc {}
# Action that returns its first argument unchanged.
DUMP_ACTION = proc { |value| value }
10
+
11
+ attr_reader :name
12
+ attr_reader :action
13
+ attr_reader :components
14
+ attr_reader :assoc
15
+ attr_reader :prec
16
+
17
+ # Create a new Rule for the RuleSet named +name+.
18
+ #
19
+ # The components can either be names of other Rules, or for a terminal Rule,
20
+ # a single pattern to match in the input string.
21
+ #
22
+ # @param [String] name
23
+ # the name of the RuleSet to which this Rule belongs
24
+ #
25
+ # @param [Object...] components...
26
+ # a variable list of components that make up the Rule
27
def initialize(name, *components)
  @components = components
  @action     = NULL_ACTION
  @name       = name
  # A rule is terminal when it has exactly one component and that component
  # is a pattern (String or Regexp), not a reference to another rule.
  @terminal   = components.length == 1 && !components.first.kind_of?(Symbol)
  @assoc      = :right
  @prec       = 0

  @components.each do |component|
    next if [Regexp, String, Symbol].any? { |type| type === component }

    raise ArgumentError, "Unsupported rule component #{component.class}"
  end

  return unless @terminal

  # \G pins the match to the offset later passed to String#match in #scan.
  pattern  = @components.first
  fragment = pattern.kind_of?(Regexp) ? pattern : Regexp.escape(pattern)
  @pattern = Regexp.new("\\G#{fragment}")
end
51
+
52
+ # Predicate check for whether or not the Rule represents a terminal symbol.
53
+ #
54
+ # A terminal symbol is effectively any rule that directly matches some
55
+ # pattern in the input string and references no other rules.
56
+ #
57
+ # @return [Boolean]
58
+ # true if this rule represents a terminal symbol
59
def terminal?
  # The flag is computed once, when the rule is constructed.
  return @terminal
end
62
+
63
+ # Walks all possible branches from the given rule, building a parse table.
64
+ #
65
+ # The parse table is a list of instructions (transitions) that can be looked
66
+ # up, given the current parser state and the current lookahead token.
67
+ #
68
+ # @param [Hash<Fixnum,Hash>] table
69
+ # the table to construct for
70
+ #
71
+ # @param [Parser] parser
72
+ # the Parser containing all the Rules in the grammar
73
+ #
74
+ # @param [Hash] context
75
+ # a Hash used to track state as the grammar is analyzed
76
def build_parse_table(table, parser, context)
  state       = table[context[:state]] ||= {}
  symbol      = components[context[:offset]]
  rule_set    = parser.rules[symbol]
  next_offset = context[:offset] + 1
  # Reuse the target state when this state already has a transition on the
  # same symbol; otherwise derive a fresh ID from this rule and the offset.
  next_state  = (state.key?(symbol) && state[symbol][:state]) ||
                [self, next_offset].hash

  # Past the last component: the rule is complete, so reduce on any input.
  if symbol.nil?
    state[symbol] = {
      :action => :reduce,
      :rule   => self,
      :prec   => context[:prec]
    }

    return resolve_conflicts(state)
  end

  raise GrammarError, "Unreferenced rule #{symbol.inspect}" if rule_set.nil?

  if rule_set.terminal?
    # Terminals are shifted and carry their own precedence forward.
    precedence    = rule_set.prec
    state[symbol] = {
      :action => :shift,
      :state  => next_state,
      :prec   => precedence,
      :assoc  => rule_set.assoc
    }
  else
    # Nonterminals get a goto, and their own rules are expanded into the
    # current state starting from their first component.
    precedence    = context[:prec]
    state[symbol] = {
      :action => :goto,
      :state  => next_state
    }

    rule_set.build_parse_table(
      table,
      parser,
      {
        :state  => context[:state],
        :seen   => context[:seen],
        :offset => 0,
        :prec   => 0
      }
    )
  end

  # Continue with the next component of this rule in the following state.
  build_parse_table(
    table,
    parser,
    {
      :state  => next_state,
      :seen   => context[:seen],
      :offset => next_offset,
      :prec   => precedence
    }
  )

  resolve_conflicts(state)
end
139
+
140
+ # Specify how this Rule should be reduced.
141
+ #
142
+ # Given a block, the Rule will be reduced by passing the result of reducing
143
+ # all inputs as arguments to the block.
144
+ #
145
+ # Given the Symbol :value, the matched input will be returned verbatim.
146
+ # Given the Symbol :nothing, nil will be returned; you can use this to
147
+ # skip whitespace and comments, for example.
148
+ #
149
+ # @param [Symbol] preset
150
+ # one of the preset actions, :value or :nothing; optional
151
+ #
152
+ # @return [Rule]
153
+ # returns self
154
def as(preset = nil, &block)
  case preset
  when :value   then @action = DUMP_ACTION
  when :nothing then @action = NULL_ACTION
  when nil
    # No preset given, so the reduction must be supplied as a block.
    # (Message fixed: it used to read "expected a block, not none given".)
    raise ArgumentError, "Rule#as expected a block, but none given" unless block_given?

    @action = block
  else
    raise ArgumentError, "Invalid preset #{preset.inspect} to Rule#as"
  end

  # Return the rule itself so calls can be chained.
  self
end
167
+
168
+ # Set the associativity of this Rule.
169
+ #
170
+ # Accepts values of :left, :right (default) or :nonassoc.
171
+ #
172
+ # @param [Symbol] assoc
173
+ # one of :left, :right or :nonassoc
174
+ #
175
+ # @return [Rule]
176
+ # returns self
177
def %(assoc)
  unless [:left, :right, :nonassoc].include?(assoc)
    raise ArgumentError, "Invalid associativity #{assoc.inspect}"
  end

  @assoc = assoc
  self
end
183
+
184
+ # Set the precedence of this Rule, as an Integer.
185
+ #
186
+ # The higher the number, the higher the precedence.
187
+ #
188
+ # @param [Fixnum] prec
189
+ # the precedence (default is zero)
190
def ^(prec)
  unless prec.respond_to?(:to_i)
    raise ArgumentError, "Invalid precedence level #{prec.inspect}"
  end

  # Anything integer-like is accepted and normalised to an Integer.
  @prec = prec.to_i
  self
end
196
+
197
+ # Invoked for terminal rules during lexing, ignored for nonterminal rules.
198
+ #
199
+ # @param [String] source
200
+ # the input String to scan
201
+ #
202
+ # @param [Fixnum] offset
203
+ # the current index in the search
204
+ #
205
+ # @param [Fixnum] line
206
+ # the line the lexer was up to when the previous token was matched
207
+ #
208
+ # @return [Hash]
209
+ # a Hash representing the token, containing :rule, :value, :line and
210
+ # :discarded, if the token is to be skipped.
211
+ #
212
+ # Returns nil if nothing is matched.
213
def scan(source, offset, line)
  # Only terminal rules carry a pattern to match against the input.
  return nil unless @terminal

  match = source.match(@pattern, offset)
  return nil unless match

  text = match[0]

  {
    :rule      => self,
    :value     => text,
    # Line reached once the token has been consumed. Counting "\n" directly
    # replaces the padded-copy-and-split approach flagged by the old FIXME and
    # no longer depends on the global record separator.
    :line      => line + text.count("\n"),
    # A rule still holding the null action is flagged so the lexer can skip it.
    :discarded => @action.equal?(NULL_ACTION)
  }
end
226
+
227
+ private
228
+
229
def resolve_conflicts(instructions)
  reduction = instructions.values.detect { |instruction| instruction[:action] == :reduce }
  return unless reduction

  # Drop every shift that loses to the reduce: the reduce wins on strictly
  # higher precedence, or on equal precedence when the shifted token is
  # left-associative.
  instructions.reject! do |_, instruction|
    next false unless instruction[:action] == :shift

    reduction[:prec] > instruction[:prec] ||
      (reduction[:prec] == instruction[:prec] && instruction[:assoc] == :left)
  end
end
238
+ end
239
+ end
@@ -0,0 +1,118 @@
1
+ # Whittle: A little LALR(1) parser in pure ruby, without a generator.
2
+ #
3
+ # Copyright (c) Chris Corbyn, 2011
4
+
5
+ module Whittle
6
+ # RuleSets are named collections of Rules.
7
+ #
8
+ # When you use the name of a rule in the grammar, you actually refer to the
9
+ # entire RuleSet and not an individual rule within it (unless of course, it
10
+ # only contains one Rule)
11
+ class RuleSet
12
+ include Enumerable
13
+
14
+ # Create a new RuleSet named +name+.
15
+ #
16
+ # @param [Symbol, String] name
17
+ # the name of the rule in the grammar
18
def initialize(name)
  # A new set starts with no rules; they are added through #[].
  @name, @rules = name, []
end
22
+
23
+ # Enumerate all Rules in the set.
24
def each(&block)
  # Iteration is delegated to the backing Array.
  members = @rules
  members.each(&block)
end
27
+
28
+ # Add a new Rule to the set.
29
+ #
30
+ # @param [Object...] components...
31
+ # a variable list of components (Symbols, Strings, or Regexps)
32
def [](*components)
  # Build the rule under this set's name, remember it, and return it so the
  # caller can keep configuring it.
  rule = Rule.new(@name, *components)
  @rules << rule
  rule
end
37
+
38
+ # Invoked during lexing, delegating to each rule in the set.
39
+ #
40
+ # @param [String] source
41
+ # the complete input string
42
+ #
43
+ # @param [Fixnum] offset
44
+ # the current index in the search
45
+ # @param [Fixnum] line
46
+ # the current line number
47
+ #
48
+ # @return [Hash]
49
+ # a Hash representing the found token, or nil
50
def scan(source, offset, line)
  # The first rule that yields a token wins.
  each do |rule|
    token = rule.scan(source, offset, line)
    return token if token
  end

  nil
end
59
+
60
+ # Recursively builds the parse table into +table+.
61
+ #
62
+ # @param [Hash<Fixnum,Hash>] table
63
+ # the parse table as constructed so far
64
+ #
65
+ # @param [Parser] parser
66
+ # the parser containing the grammar
67
+ #
68
+ # @param [Hash] context
69
+ # a Hash used to track state when building the parse table
70
+ #
71
+ # @return [Hash]
72
+ # the parse table
73
def build_parse_table(table, parser, context)
  # A (state, rule set) pair is expanded only once; this is what stops the
  # walk from recursing forever on self-referencing rules.
  visit = [context[:state], self]
  return table if context[:seen].include?(visit)

  context[:seen] << visit

  each { |rule| rule.build_parse_table(table, parser, context) }
  table
end
84
+
85
+ # Predicate test for whether or not this RuleSet references a single
86
+ # terminal Symbol.
87
+ #
88
+ # @return [Boolean]
89
+ # true if this rule is a terminal symbol
90
def terminal?
  # Only a set holding exactly one rule can stand for a terminal.
  return false unless @rules.length == 1

  @rules.first.terminal?
end
93
+
94
+ # Predicate test for whether or not this RuleSet references a nonterminal Symbol.
95
+ #
96
+ # @return [Boolean]
97
+ # true if this rule is a nonterminal symbol
98
def nonterminal?
  # Exact opposite of #terminal?.
  terminal? ? false : true
end
101
+
102
+ # Convenience method to access the precedence of a RuleSet representing a terminal.
103
+ #
104
+ # @return [Fixnum]
105
+ # the precedence of the terminal Symbol, or zero for nonterminals.
106
def prec
  # Precedence only applies to terminals; everything else gets zero.
  if terminal?
    @rules.first.prec
  else
    0
  end
end
109
+
110
+ # Convenience method to access the associativity of a RuleSet representing a terminal.
111
+ #
112
+ # @return [Symbol]
113
+ # the associativity of the terminal Symbol.
114
def assoc
  # Nonterminals fall back to :right.
  if terminal?
    @rules.first.assoc
  else
    :right
  end
end
117
+ end
118
+ end
@@ -0,0 +1,3 @@
1
module Whittle
  # Version of the gem; frozen so the shared constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
data/lib/whittle.rb ADDED
@@ -0,0 +1,8 @@
1
+ require "whittle/version"
2
+ require "whittle/error"
3
+ require "whittle/errors/unconsumed_input_error"
4
+ require "whittle/errors/parse_error"
5
+ require "whittle/errors/grammar_error"
6
+ require "whittle/rule"
7
+ require "whittle/rule_set"
8
+ require "whittle/parser"