antlr3 1.2.3

Files changed (85)
  1. data/ANTLR-LICENSE.txt +26 -0
  2. data/History.txt +66 -0
  3. data/README.txt +139 -0
  4. data/bin/antlr4ruby +33 -0
  5. data/java/RubyTarget.java +524 -0
  6. data/java/antlr-full-3.2.1.jar +0 -0
  7. data/lib/antlr3.rb +176 -0
  8. data/lib/antlr3/constants.rb +88 -0
  9. data/lib/antlr3/debug.rb +701 -0
  10. data/lib/antlr3/debug/event-hub.rb +210 -0
  11. data/lib/antlr3/debug/record-event-listener.rb +25 -0
  12. data/lib/antlr3/debug/rule-tracer.rb +55 -0
  13. data/lib/antlr3/debug/socket.rb +360 -0
  14. data/lib/antlr3/debug/trace-event-listener.rb +92 -0
  15. data/lib/antlr3/dfa.rb +247 -0
  16. data/lib/antlr3/dot.rb +174 -0
  17. data/lib/antlr3/error.rb +657 -0
  18. data/lib/antlr3/main.rb +561 -0
  19. data/lib/antlr3/modes/ast-builder.rb +41 -0
  20. data/lib/antlr3/modes/filter.rb +56 -0
  21. data/lib/antlr3/profile.rb +322 -0
  22. data/lib/antlr3/recognizers.rb +1280 -0
  23. data/lib/antlr3/streams.rb +985 -0
  24. data/lib/antlr3/streams/interactive.rb +91 -0
  25. data/lib/antlr3/streams/rewrite.rb +412 -0
  26. data/lib/antlr3/test/call-stack.rb +57 -0
  27. data/lib/antlr3/test/config.rb +23 -0
  28. data/lib/antlr3/test/core-extensions.rb +269 -0
  29. data/lib/antlr3/test/diff.rb +165 -0
  30. data/lib/antlr3/test/functional.rb +207 -0
  31. data/lib/antlr3/test/grammar.rb +371 -0
  32. data/lib/antlr3/token.rb +592 -0
  33. data/lib/antlr3/tree.rb +1415 -0
  34. data/lib/antlr3/tree/debug.rb +163 -0
  35. data/lib/antlr3/tree/visitor.rb +84 -0
  36. data/lib/antlr3/tree/wizard.rb +481 -0
  37. data/lib/antlr3/util.rb +149 -0
  38. data/lib/antlr3/version.rb +27 -0
  39. data/samples/ANTLRv3Grammar.g +621 -0
  40. data/samples/Cpp.g +749 -0
  41. data/templates/AST.stg +335 -0
  42. data/templates/ASTDbg.stg +40 -0
  43. data/templates/ASTParser.stg +153 -0
  44. data/templates/ASTTreeParser.stg +272 -0
  45. data/templates/Dbg.stg +192 -0
  46. data/templates/Ruby.stg +1514 -0
  47. data/test/functional/ast-output/auto-ast.rb +797 -0
  48. data/test/functional/ast-output/construction.rb +555 -0
  49. data/test/functional/ast-output/hetero-nodes.rb +753 -0
  50. data/test/functional/ast-output/rewrites.rb +1327 -0
  51. data/test/functional/ast-output/tree-rewrite.rb +1662 -0
  52. data/test/functional/debugging/debug-mode.rb +689 -0
  53. data/test/functional/debugging/profile-mode.rb +165 -0
  54. data/test/functional/debugging/rule-tracing.rb +74 -0
  55. data/test/functional/delegation/import.rb +379 -0
  56. data/test/functional/lexer/basic.rb +559 -0
  57. data/test/functional/lexer/filter-mode.rb +245 -0
  58. data/test/functional/lexer/nuances.rb +47 -0
  59. data/test/functional/lexer/properties.rb +104 -0
  60. data/test/functional/lexer/syn-pred.rb +32 -0
  61. data/test/functional/lexer/xml.rb +206 -0
  62. data/test/functional/main/main-scripts.rb +245 -0
  63. data/test/functional/parser/actions.rb +224 -0
  64. data/test/functional/parser/backtracking.rb +244 -0
  65. data/test/functional/parser/basic.rb +282 -0
  66. data/test/functional/parser/calc.rb +98 -0
  67. data/test/functional/parser/ll-star.rb +143 -0
  68. data/test/functional/parser/nuances.rb +165 -0
  69. data/test/functional/parser/predicates.rb +103 -0
  70. data/test/functional/parser/properties.rb +242 -0
  71. data/test/functional/parser/rule-methods.rb +132 -0
  72. data/test/functional/parser/scopes.rb +274 -0
  73. data/test/functional/token-rewrite/basic.rb +318 -0
  74. data/test/functional/token-rewrite/via-parser.rb +100 -0
  75. data/test/functional/tree-parser/basic.rb +750 -0
  76. data/test/unit/sample-input/file-stream-1 +2 -0
  77. data/test/unit/sample-input/teststreams.input2 +2 -0
  78. data/test/unit/test-dfa.rb +52 -0
  79. data/test/unit/test-exceptions.rb +44 -0
  80. data/test/unit/test-recognizers.rb +55 -0
  81. data/test/unit/test-scheme.rb +62 -0
  82. data/test/unit/test-streams.rb +459 -0
  83. data/test/unit/test-tree-wizard.rb +535 -0
  84. data/test/unit/test-trees.rb +854 -0
  85. metadata +205 -0

data/lib/antlr3/modes/ast-builder.rb
@@ -0,0 +1,41 @@
+ #!/usr/bin/ruby
+ # encoding: utf-8
+ require 'antlr3'
+ require 'antlr3/tree'
+
+ module ANTLR3
+   module ASTBuilder
+     extend ClassMacros
+
+     def self.included(klass)
+       def klass.return_scope_members
+         super.push(:tree)
+       end
+     end
+
+     def initialize( input, options = {} )
+       @adaptor = options[:adaptor] ||= begin
+         (input.adaptor rescue nil) or
+           AST::CommonTreeAdaptor.new( token_class )
+       end
+       super( input, options )
+     end
+
+     shared_attribute( :adaptor )
+
+     private
+
+     def subtree_stream(desc, element = nil)
+       AST::RewriteRuleSubtreeStream.new(@adaptor, desc, element)
+     end
+
+     def token_stream(desc, element = nil)
+       AST::RewriteRuleTokenStream.new(@adaptor, desc, element)
+     end
+
+     def node_stream(desc, element = nil)
+       AST::RewriteRuleNodeStream.new(@adaptor, desc, element)
+     end
+
+   end
+ end
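
The mixin above is not used directly: ANTLR's Ruby code generator includes
ASTBuilder into recognizers built from tree-producing grammars. A hedged
sketch of the net effect (ExprParser and tokens are invented names, and the
adaptor reader is assumed to be what shared_attribute defines):

  class ExprParser < ANTLR3::Parser
    include ANTLR3::ASTBuilder    # rule return scopes now carry a :tree member
  end

  ExprParser.return_scope_members    # => [:start, :stop, :tree]
  parser = ExprParser.new( tokens )  # tokens: an ANTLR3::CommonTokenStream
  parser.adaptor                     # tree adaptor used to build AST nodes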

data/lib/antlr3/modes/filter.rb
@@ -0,0 +1,56 @@
+ #!/usr/bin/ruby
+ # encoding: utf-8
+
+ require 'antlr3'
+
+ module ANTLR3
+ =begin rdoc ANTLR3::FilterMode
+
+ If a lexer grammar specifies the <tt>filter = true</tt> option, the generated
+ Lexer code will include this module. It modifies the standard
+ <tt>next_token</tt> to catch RecognitionErrors and skip ahead in the input until
+ the token! method can match a token without raising a RecognitionError.
+
+ See http://www.antlr.org/wiki/display/ANTLR3/Lexical+filters for more info on
+ lexer filter mode.
+
+ =end
+   module FilterMode
+     def next_token
+       @input.peek == ANTLR3::EOF and return ANTLR3::EOF_TOKEN
+       @state.token = nil
+       @state.channel = ANTLR3::DEFAULT_CHANNEL
+       @state.token_start_position = @input.index
+       @state.token_start_column = @input.column
+       @state.token_start_line = @input.line
+       @state.text = nil
+       # backtracking == 1 means we won't throw the (slow) recognition exception
+       @state.backtracking = 1
+       m = @input.mark
+       token!
+       @input.release(m)
+       emit
+       return @state.token
+     rescue ANTLR3::BacktrackingFailed
+       # token! backtracks with synpred at backtracking==2
+       # and we set the synpredgate to allow actions at level 1.
+       @input.rewind(m)
+       @input.consume # advance one char and try again
+       retry
+     rescue ANTLR3::Error::RecognitionError => re
+       # shouldn't happen in backtracking mode, but...
+       report_error(re)
+       recover(re)
+     ensure
+       @state.backtracking = 0
+     end
+
+     def memoize(rule, start_index, success)
+       super(rule, start_index, success) if @state.backtracking > 1
+     end
+
+     def already_parsed_rule?(rule)
+       @state.backtracking > 1 ? super(rule) : false
+     end
+   end
+ end
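
For context, filter mode is switched on with the filter = true option in a
lexer grammar; the generated lexer then mixes in FilterMode. A hedged sketch
(FuzzyLexer is an invented class; iterating with each assumes the Enumerable
support that the TokenSource mixin gives every lexer):

  # FuzzyLexer: hypothetical lexer generated from a grammar declaring
  #   options { filter = true; }
  lexer = FuzzyLexer.new( "def f; end @#$% def g; end" )
  lexer.each do |token|              # unmatchable input is skipped, not an error
    puts "%s: %p" % [ token.name, token.text ]
  end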

data/lib/antlr3/profile.rb
@@ -0,0 +1,322 @@
+ #!/usr/bin/ruby
+ # encoding: utf-8
+
+ module ANTLR3
+   module Profile
+ =begin rdoc ANTLR3::Profile::ParserEvents
+
+ ANTLR3::Profile::ParserEvents expands basic debugging events for use by
+ recognition code generated by ANTLR when called with the <tt>-profile</tt>
+ switch.
+
+ =end
+     module ParserEvents
+       include ANTLR3::Debug::ParserEvents
+
+       def initialize(stream, options = {})
+         options[:debug_listener] ||= Profiler.new( self )
+         super( stream, options )
+       end
+
+       def already_parsed_rule?(rule)
+         @debug_listener.examine_rule_memoization(rule)
+         super
+       end
+
+       def profile
+         @debug_listener.profile
+       end
+
+       def memoize(rule, start_index, success)
+         @debug_listener.memoize(rule, start_index, success)
+         super
+       end
+     end
+
+     class DataSet < ::Array
+       include ::Math
+       def total
+         inject(:+)
+       end
+       def average
+         length > 0 ? (total.to_f / length) : 0
+       end
+       def variance
+         length.zero? and return(0.0)
+         mean = average
+         inject(0.0) { |t, i| t + (i - mean)**2 } / (length - 1)
+       end
+       def standard_deviation
+         sqrt(variance)
+       end
+     end
+
+
+
+
+
+     unless const_defined?(:Profile)
+       Profile = Struct.new(
+         :grammar_file, :parser_class, :top_rule,
+         :rule_invocations, :guessing_rule_invocations, :rule_invocation_depth,
+         :fixed_looks, :cyclic_looks, :syntactic_predicate_looks,
+         :memoization_cache_entries, :memoization_cache_hits,
+         :memoization_cache_misses, :tokens, :hidden_tokens,
+         :characters_matched, :hidden_characters_matched, :semantic_predicates,
+         :syntactic_predicates, :reported_errors
+       )
+     end
+
+     class Profile
+       def initialize
+         init_values = Array.new(self.class.members.length, 0)
+         super(*init_values)
+         self.top_rule = self.parser_class = self.grammar_file = nil
+         self.fixed_looks = DataSet.new
+         self.cyclic_looks = DataSet.new
+         self.syntactic_predicate_looks = DataSet.new
+       end
+
+       def fixed_decisions
+         fixed_looks.length
+       end
+
+       def cyclic_decisions
+         cyclic_looks.length
+       end
+
+       def backtracking_decisions
+         syntactic_predicate_looks.length
+       end
+
+       def generate_report
+         report = '+' << '-' * 78 << "+\n"
+         report << '| ' << "ANTLR Rule Profile".center(76) << " |\n"
+         report << '+' << '-' * 78 << "+\n"
+         report << "| Generated at #{Time.now}".ljust(78) << " |\n"
+         report << "| Profiled #{parser_class.name}##{top_rule}".ljust(78) << " |\n"
+         report << "| Rule source generated from grammar file #{grammar_file}".ljust(78) << " |\n"
+         report << '+' << '-' * 78 << "+\n"
+
+         report << '| ' << "Rule Invocations".center(76) << " |\n"
+         report << '+' << '-' * 68 << '+' << '-' * 9 << "+\n"
+         report << "| %-66s | %7i |\n" % ["Total Invocations", rule_invocations]
+         report << "| %-66s | %7i |\n" % ["``Guessing'' Invocations", guessing_rule_invocations]
+         report << "| %-66s | %7i |\n" % ["Deepest Level of Invocation", rule_invocation_depth]
+         report << '+' << '-' * 68 << '+' << '-' * 9 << "+\n"
+
+         report << '| ' << "Execution Events".center(76) << " |\n"
+         report << '+' << '-' * 68 << '+' << '-' * 9 << "+\n"
+         report << "| %-66s | %7i |\n" % ["Semantic Predicates Evaluated", semantic_predicates]
+         report << "| %-66s | %7i |\n" % ["Syntactic Predicates Evaluated", syntactic_predicates]
+         report << "| %-66s | %7i |\n" % ["Errors Reported", reported_errors]
+         report << '+' << '-' * 68 << '+' << '-' * 9 << "+\n"
+
+         report << '| ' << "Token and Character Data".center(76) << " |\n"
+         report << '+' << '-' * 68 << '+' << '-' * 9 << "+\n"
+         report << "| %-66s | %7i |\n" % ["Tokens Consumed", tokens]
+         report << "| %-66s | %7i |\n" % ["Hidden Tokens Consumed", hidden_tokens]
+         report << "| %-66s | %7i |\n" % ["Characters Matched", characters_matched]
+         report << "| %-66s | %7i |\n" % ["Hidden Characters Matched", hidden_characters_matched]
+         report << '+' << '-' * 68 << '+' << '-' * 9 << "+\n"
+
+         report << '| ' << "Memoization".center(76) << " |\n"
+         report << '+' << '-' * 68 << '+' << '-' * 9 << "+\n"
+         report << "| %-66s | %7i |\n" % ["Cache Entries", memoization_cache_entries]
+         report << "| %-66s | %7i |\n" % ["Cache Hits", memoization_cache_hits]
+         report << "| %-66s | %7i |\n" % ["Cache Misses", memoization_cache_misses]
+         report << '+' << '-' * 68 << '+' << '-' * 9 << "+\n"
+
+         [
+           ['Fixed Lookahead (k)', fixed_looks],
+           ['Arbitrary Lookahead (k)', cyclic_looks],
+           ['Backtracking (Syntactic Predicate)', syntactic_predicate_looks]
+         ].each do |name, set|
+           mean, stdev = '%4.2f' % set.average, '%4.2f' % set.standard_deviation
+           report << '| ' << "#{name} Decisions".center(76) << " |\n"
+           report << '+' << '-' * 68 << '+' << '-' * 9 << "+\n"
+           report << "| %-66s | %7i |\n" % ["Count", set.length]
+           report << "| %-66s | %7i |\n" % ["Minimum k", set.min]
+           report << "| %-66s | %7i |\n" % ["Maximum k", set.max]
+           report << "| %-66s | %7s |\n" % ["Average k", mean]
+           report << "| %-66s | %7s |\n" % ["Standard Deviation of k", stdev]
+           report << '+' << '-' * 68 << '+' << '-' * 9 << "+\n"
+         end
+         return(report)
+       end
+     end
+
+ =begin rdoc ANTLR3::Profile::Profiler
+
+ When ANTLR is run with the <tt>-profile</tt> switch, it generates recognition
+ code that records statistics about the decision logic exercised while parsing
+ any given input. This information can be used to help refactor a slow grammar.
+ Profiler is an event-listener that performs all of the profiling accounting and
+ builds a simple report to present the various statistics.
+
+ =end
+     class Profiler
+       include ANTLR3::Debug::EventListener
+
+       PROTOCOL_VERSION = 2
+
+       attr_accessor :parser
+       attr_reader :rule_level
+       attr_reader :decision_level
+
+       # tracks the maximum look value for the current decision
+       # (maxLookaheadInCurrentDecision in java Profiler)
+       attr_reader :decision_look
+
+       # the last token consumed
+       # (lastTokenConsumed in java Profiler)
+       attr_reader :last_token
+       attr_reader :look_stack
+       attr_reader :profile
+
+       attr_accessor :output
+
+       def initialize(parser = nil, output = nil)
+         @parser = parser
+         @profile = nil
+         @rule_level = 0
+         @decision_level = 0
+         @decision_look = 0
+         @last_token = nil
+         @look_stack = []
+         @output = output
+       end
+
+       def commence
+         @profile = Profile.new
+         @rule_level = 0
+         @decision_level = 0
+         @decision_look = 0
+         @last_token = nil
+         @look_stack = []
+       end
+
+       def enter_rule(grammar_file_name, rule_name)
+         if @rule_level.zero?
+           commence
+           @profile.grammar_file = grammar_file_name
+           @profile.parser_class = @parser.class
+           @profile.top_rule = rule_name
+         end
+         @rule_level += 1
+         @profile.rule_invocations += 1
+         @profile.rule_invocation_depth < @rule_level and
+           @profile.rule_invocation_depth = @rule_level
+       end
+
+       def exit_rule(grammar_file_name, rule_name)
+         @rule_level -= 1
+       end
+
+       def examine_rule_memoization(rule)
+         stop_index = parser.rule_memoization(rule, @parser.input.index)
+         if stop_index == BaseRecognizer::MEMO_RULE_UNKNOWN
+           @profile.memoization_cache_misses += 1
+           @profile.guessing_rule_invocations += 1
+         else
+           @profile.memoization_cache_hits += 1
+         end
+       end
+
+       def memoize(rule, start_index, success)
+         @profile.memoization_cache_entries += 1
+       end
+
+
+       def enter_decision(decision_number)
+         @decision_level += 1
+         starting_look_index = @parser.token_stream.index
+         @look_stack << starting_look_index
+       end
+
+       def exit_decision(decision_number)
+         @look_stack.pop
+         @decision_level -= 1
+         if @parser.cyclic_decision? then
+           @profile.cyclic_looks << @decision_look
+         else @profile.fixed_looks << @decision_look
+         end
+
+         @parser.cyclic_decision = false
+         @decision_look = 0
+       end
+
+       def consume_token(token)
+         @last_token = token
+       end
+
+       def in_decision?
+         return(@decision_level > 0)
+       end
+
+       def consume_hidden_token(token)
+         @last_token = token
+       end
+
+       def look(i, token)
+         in_decision? or return
+         starting_index = look_stack.last
+         input = @parser.token_stream
+         this_ref_index = input.index
+         num_hidden = input.tokens(starting_index, this_ref_index).count { |t| t.hidden? }
+         depth = i + this_ref_index - starting_index - num_hidden
+         if depth > @decision_look
+           @decision_look = depth
+         end
+       end
+
+       def end_backtrack(level, successful)
+         @profile.syntactic_predicate_looks << @decision_look
+       end
+
+       def recognition_exception(error)
+         @profile.reported_errors += 1
+       end
+
+       def semantic_predicate(result, predicate)
+         in_decision? and @profile.semantic_predicates += 1
+       end
+
+       def terminate
+         input = @parser.token_stream
+         hidden_tokens = input.select { |token| token.hidden? }
+         @profile.hidden_tokens = hidden_tokens.length
+         @profile.tokens = input.tokens.length
+         @profile.hidden_characters_matched = hidden_tokens.inject(0) do |count, token|
+           count + token.text.length rescue count
+         end
+         @profile.characters_matched = (@last_token || input.tokens.last).stop + 1
+         write_report
+       end
+
+
+       def write_report
+         @output << @profile.generate_report unless @output.nil?
+       rescue NoMethodError => error
+         if error.name.to_s == '<<'
+           warn(<<-END.strip! % [__FILE__, __LINE__, @output])
+             [%s @ %s]: failed to write report to %p as it does not respond to :<<
+           END
+         else raise
+         end
+       rescue IOError => error
+         $stderr.puts( Util.tidy(<<-END) % [__FILE__, __LINE__, @output, error.class, error.message])
+         | [%s @ %s]: failed to write profile report to %p due to an IO Error:
+         | %s: %s
+         END
+         $stderr.puts(error.backtrace.map { |call| " - #{call}" }.join("\n"))
+       end
+
+       def report
+         @profile.generate_report
+       end
+
+       alias to_s report
+     end
+   end
+ end
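
Using the profiler requires a recognizer generated with profiling enabled;
the sketch below is hypothetical apart from the profile accessor and
generate_report defined above (ExprParser and its program rule are invented,
and -profile is ANTLR's standard profiling switch):

  # $ antlr4ruby -profile Expr.g
  # ExprParser then includes ANTLR3::Profile::ParserEvents, which installs
  # a Profiler as the debug listener
  parser = ExprParser.new( "1 + 2 * 3" )
  parser.program                          # run the top-level rule
  puts parser.profile.generate_report     # print the boxed statistics tables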

data/lib/antlr3/recognizers.rb
@@ -0,0 +1,1280 @@
+ #!/usr/bin/ruby
+ # encoding: utf-8
+
+ =begin LICENSE
+
+ [The "BSD licence"]
+ Copyright (c) 2009 Kyle Yetter
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+  1. Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+  2. Redistributions in binary form must reproduce the above copyright
+     notice, this list of conditions and the following disclaimer in the
+     documentation and/or other materials provided with the distribution.
+  3. The name of the author may not be used to endorse or promote products
+     derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ =end
+
+ module ANTLR3
+   unless const_defined?(:RecognizerSharedState)
+
+     RecognizerSharedState = Struct.new(
+       :following,
+       :error_recovery,
+       :last_error_index,
+       :backtracking,
+       :rule_memory,
+       :syntax_errors,
+       :token,
+       :token_start_position,
+       :token_start_line,
+       :token_start_column,
+       :channel,
+       :type,
+       :text
+     )
+
+ =begin rdoc ANTLR3::RecognizerSharedState
+
+ A big Struct-based class containing most of the data that makes up a
+ recognizer's state. These attributes are externalized from the recognizer itself
+ so that recognizer delegation (which occurs when you import other grammars into
+ your grammar) can function; multiple recognizers can share a common state.
+
+ == Structure Attributes
+
+ following::
+   a stack that tracks follow sets for error recovery
+ error_recovery::
+   a flag indicating whether or not the recognizer is in error recovery mode
+ last_error_index::
+   the index in the input stream of the last error
+ backtracking::
+   tracks the backtracking depth
+ rule_memory::
+   if a grammar is compiled with the memoization option, this will be
+   set to a hash mapping previously parsed rules to cached indices
+ syntax_errors::
+   tracks the number of syntax errors seen so far
+ token::
+   holds newly constructed tokens for lexer rules
+ token_start_position::
+   the input stream index at which the token starts
+ token_start_line::
+   the input stream line number at which the token starts
+ token_start_column::
+   the input stream column at which the token starts
+ channel::
+   the channel value of the target token
+ type::
+   the type value of the target token
+ text::
+   the text of the target token
+
+ =end
+     class RecognizerSharedState
+       def initialize
+         super([], false, -1, 0, nil, 0, nil, -1)
+         # ^-- same as this --v
+         # self.following = []
+         # self.error_recovery = false
+         # self.last_error_index = -1
+         # self.backtracking = 0
+         # self.syntax_errors = 0
+         # self.token_start_position = -1
+       end
+
+
+       # restores all of the state variables to their respective
+       # initial default values
+       def reset!
+         self.following.clear
+         self.error_recovery = false
+         self.last_error_index = -1
+         self.backtracking = 0
+         self.rule_memory and rule_memory.clear
+         self.syntax_errors = 0
+         self.token = nil
+         self.token_start_position = -1
+         self.token_start_line = nil
+         self.token_start_column = nil
+         self.channel = nil
+         self.type = nil
+         self.text = nil
+       end
+
+     end
+   end
+
+
+ =begin rdoc ANTLR3::BaseRecognizer
+
+ = BaseRecognizer
+
+ As the base class of all ANTLR-generated recognizers, BaseRecognizer provides
+ much of the shared functionality and structure used in the recognition process.
+ For all practical purposes, the class and its immediate subclasses Lexer,
+ Parser, and TreeParser are abstract classes. They can be instantiated, but
+ they're pretty useless on their own. Instead, to make useful code, you write an
+ ANTLR grammar and ANTLR will generate classes which inherit from one of the
+ recognizer base classes, providing the implementation of the grammar rules
+ itself. The generated code relies on this group of classes to implement the
+ necessary recognition tasks. BaseRecognizer defines methods related to:
+
+ * token and character matching
+ * prediction and recognition strategy
+ * recovering from errors
+ * reporting errors
+ * memoization
+ * simple rule tracing and debugging
+
+ =end
+   class BaseRecognizer
+     include Constants
+     include Error
+     include TokenFactory
+     extend ClassMacros
+
+     MEMO_RULE_FAILED = -2
+     MEMO_RULE_UNKNOWN = -1
+     DEFAULT_TOKEN_CHANNEL = DEFAULT_CHANNEL
+     HIDDEN = HIDDEN_CHANNEL
+
+     @rules = {}
+
+     # inherited class methods and hooks
+     class << self
+
+       attr_reader :grammar_file_name,
+                   :antlr_version,
+                   :antlr_version_string,
+                   :grammar_home
+
+       attr_accessor :token_scheme, :default_rule
+
+       # generated recognizer code uses this method to stamp
+       # the code with the name of the grammar file and
+       # the current version of ANTLR being used to generate
+       # the code
+       def generated_using(grammar_file, version_string)
+         @grammar_file_name = grammar_file.freeze
+         @antlr_version_string = version_string.freeze
+         if @antlr_version_string =~ /^(\d+)\.(\d+)(?:\.(\d+)(?:b(\d+))?)?(.*)$/
+           @antlr_version = [$1, $2, $3, $4].map! { |str| str.to_i }
+           timestamp = $5.strip
+           #@antlr_release_time = $5.empty? ? nil : Time.parse($5)
+         else
+           raise "bad version string: %p" % version_string
+         end
+       end
+
+       # this method is used to generate return-value structures for
+       # rules with multiple return values. To avoid generating
+       # a special class for every rule in AST parsers and such
+       # (where most rules have the same default set of return values),
+       # each recognizer gets a default return value structure
+       # assigned to the constant +Return+. Rules which don't
+       # require additional custom members will have a rule-return
+       # name constant that just points to the generic return
+       # value.
+       def define_return_scope(*members)
+         if members.empty? then generic_return_scope
+         else
+           members += return_scope_members
+           Struct.new(*members)
+         end
+       end
+
+       # used as a hook to add additional default members
+       # to default return value structures
+       # For example, all AST-building parsers override
+       # this method to add an extra +:tree+ field to
+       # all rule return structures.
+       def return_scope_members
+         [:start, :stop]
+       end
+
+       # sets up and returns the generic rule return
+       # scope for a recognizer
+       def generic_return_scope
+         @generic_return_scope ||= begin
+           struct = Struct.new(*return_scope_members)
+           const_set(:Return, struct)
+         end
+       end
+
+       def imported_grammars
+         @imported_grammars ||= Set.new
+       end
+
+       def master_grammars
+         @master_grammars ||= []
+       end
+
+       def master
+         master_grammars.last
+       end
+
+       def masters( *grammar_names )
+         for grammar in grammar_names
+           unless master_grammars.include?( grammar )
+             master_grammars << grammar
+             attr_reader( Util.snake_case( grammar ) )
+           end
+         end
+       end
+       private :masters
+
+       def imports( *grammar_names )
+         for grammar in grammar_names
+           imported_grammars.add?(grammar.to_sym) and
+             attr_reader( Util.snake_case( grammar ) )
+         end
+         return imported_grammars
+       end
+       private :imports
+
+       def rules
+         self::RULE_METHODS.dup rescue []
+       end
+
+       def default_rule
+         @default_rule ||= rules.first
+       end
+
+       def debug?
+         return false
+       end
+
+       def token_class
+         @token_class ||= begin
+           self::Token rescue
+           superclass.token_class rescue
+           ANTLR3::CommonToken
+         end
+       end
+       private :generated_using
+     end
+
+     @grammar_file_name = nil
+     @antlr_version = ANTLR3::ANTLR_VERSION
+     @antlr_version_string = ANTLR3::ANTLR_VERSION_STRING
+
+     def grammar_file_name
+       self.class.grammar_file_name
+     end
+
+     def antlr_version
+       self.class.antlr_version
+     end
+
+     def antlr_version_string
+       self.class.antlr_version_string
+     end
+
+     attr_accessor :input
+     attr_reader :state
+
+     def each_delegate
+       block_given? or return enum_for( __method__ )
+       for grammar in self.class.imported_grammars
+         del = __send__( Util.snake_case( grammar ) ) and
+           yield( del )
+       end
+     end
+
+     # Create a new recognizer. The constructor simply ensures that
+     # all recognizers are initialized with a shared state object.
+     # See the main recognizer subclasses for more specific
+     # information about creating recognizer objects like
+     # lexers and parsers.
+     def initialize(options = {})
+       @state = options[:state] || RecognizerSharedState.new
+       @error_output = options.fetch(:error_output, $stderr)
+       defined?(@input) or @input = nil
+       initialize_dfas
+     end
+
+     # Resets the recognizer's state data to initial values.
+     # As a result, all error tracking and error recovery
+     # data accumulated in the current state will be cleared.
+     # It will also attempt to reset the input stream
+     # via input.reset, but it ignores any errors received
+     # from doing so. Thus the input stream is not guaranteed
+     # to be rewound to its initial position
+     def reset
+       @state and @state.reset!
+       @input and @input.reset rescue nil
+     end
+
+     # Attempt to match the current input symbol to the token type
+     # specified by +type+. If the symbol matches the type,
+     # consume the current symbol and return its value. If
+     # the symbol doesn't match, attempt to use the follow-set
+     # data provided by +follow+ to recover from the mismatched
+     # token.
+     def match(type, follow)
+       matched_symbol = current_input_symbol
+       if @input.peek == type
+         @input.consume
+         @state.error_recovery = false
+         return matched_symbol
+       end
+       raise(BacktrackingFailed) if @state.backtracking > 0
+       matched_symbol = recover_from_mismatched_token(type, follow)
+       return matched_symbol
+     end
+
+     # match anything -- i.e. wildcard match. Simply consume
+     # the current symbol from the input stream.
+     def match_any
+       @state.error_recovery = false
+       @input.consume
+     end
+
+     ##############################################################################################
+     ###################################### Error Reporting #######################################
+     ##############################################################################################
+     ##############################################################################################
+
+     # When a recognition error occurs, this method is the main
+     # hook for carrying out the error reporting process. The
+     # default implementation calls +display_recognition_error+
+     # to display the error info on $stderr.
+     def report_error(e = $!)
+       @state.error_recovery and return
+       @state.error_recovery = true
+       display_recognition_error(e)
+     end
+
+     # error reporting hook for presenting the information
+     # The default implementation builds appropriate error
+     # message text using +error_header+ and +error_message+,
+     # and calls +emit_error_message+ to write the error
+     # message out to some source
+     def display_recognition_error(e = $!)
+       header = error_header(e)
+       message = error_message(e)
+       emit_error_message("#{header} #{message}")
+     end
+
+     # used to construct an appropriate error message
+     # based on the specific type of error and the
+     # error's attributes
+     def error_message(e = $!)
+       case e
+       when Error::UnwantedToken
+         token_name = token_name(e.expecting)
+         "extraneous input #{token_error_display(e.unexpected_token)} expecting #{token_name}"
+       when Error::MissingToken
+         token_name = token_name(e.expecting)
+         "missing #{token_name} at #{token_error_display(e.symbol)}"
+       when Error::MismatchedToken
+         token_name = token_name(e.expecting)
+         "mismatched input #{token_error_display(e.symbol)} expecting #{token_name}"
+       when Error::MismatchedTreeNode
+         token_name = token_name(e.expecting)
+         "mismatched tree node: #{e.symbol} expecting #{token_name}"
+       when Error::NoViableAlternative
+         "no viable alternative at input " << token_error_display(e.symbol)
+       when Error::MismatchedSet
+         "mismatched input %s expecting set %s" %
+           [token_error_display(e.symbol), e.expecting.inspect]
+       when Error::MismatchedNotSet
+         "mismatched input %s expecting set %s" %
+           [token_error_display(e.symbol), e.expecting.inspect]
+       when Error::FailedPredicate
+         "rule %s failed predicate: { %s }?" % [e.rule_name, e.predicate_text]
+       else e.message
+       end
+     end
+
+     # used to add a tag to the error message that indicates
+     # the location of the input stream when the error
+     # occurred
+     def error_header(e = $!)
+       e.location
+     end
+
+     # formats a token object appropriately for inspection
+     # within an error message
+     def token_error_display(token)
+       unless text = token.text
+         if token.type == EOF then text = '<EOF>'
+         elsif name = token_name(token.type) rescue false
+           text = "<#{name}>"
+         elsif token.respond_to?(:name) then text = "<#{token.name}>"
+         else text = "<#{token.type}>"
+         end
+       end
+       return text.inspect
+     end
+
+     # Write the error report data out to some source. By default,
+     # the error message is written to $stderr
+     def emit_error_message(message)
+       @error_output.puts(message) if @error_output
+     end
+
+     ##############################################################################################
+     ###################################### Error Recovery ########################################
+     ##############################################################################################
+     def recover(error = $!)
+       @state.last_error_index == @input.index and @input.consume
+       @state.last_error_index = @input.index
+
+       follow_set = compute_error_recovery_set
+
+       resync { consume_until(follow_set) }
+     end
+
+     def resync
+       begin_resync
+       value = yield
+       end_resync
+       return(value)
+     end
+
+     # overridable hook method that is executed at the start of the
+     # resyncing procedure in recover
+     #
+     # by default, it does nothing
+     def begin_resync
+       # do nothing
+     end
+
+     # overridable hook method that is executed after the resyncing procedure has completed
+     #
+     # by default, it does nothing
+     def end_resync
+       # do nothing
+     end
+
+     # (The following explanation has been lifted directly from the
+     # source code documentation of the ANTLR Java runtime library)
+     #
+     # Compute the error recovery set for the current rule. During
+     # rule invocation, the parser pushes the set of tokens that can
+     # follow that rule reference on the stack; this amounts to
+     # computing FIRST of what follows the rule reference in the
+     # enclosing rule. This local follow set only includes tokens
+     # from within the rule; i.e., the FIRST computation done by
+     # ANTLR stops at the end of a rule.
+     #
+     # EXAMPLE
+     #
+     # When you find a "no viable alt exception", the input is not
+     # consistent with any of the alternatives for rule r. The best
+     # thing to do is to consume tokens until you see something that
+     # can legally follow a call to r *or* any rule that called r.
+     # You don't want the exact set of viable next tokens because the
+     # input might just be missing a token--you might consume the
+     # rest of the input looking for one of the missing tokens.
+     #
+     # Consider grammar:
+     #
+     #   a : '[' b ']'
+     #     | '(' b ')'
+     #     ;
+     #   b : c '^' INT ;
+     #   c : ID
+     #     | INT
+     #     ;
+     #
+     # At each rule invocation, the set of tokens that could follow
+     # that rule is pushed on a stack. Here are the various "local"
+     # follow sets:
+     #
+     #   FOLLOW(b1_in_a) = FIRST(']') = ']'
+     #   FOLLOW(b2_in_a) = FIRST(')') = ')'
+     #   FOLLOW(c_in_b)  = FIRST('^') = '^'
+     #
+     # Upon erroneous input "[]", the call chain is
+     #
+     #   a -> b -> c
+     #
+     # and, hence, the follow context stack is:
+     #
+     #   depth   local follow set   after call to rule
+     #     0     <EOF>              a (from main())
+     #     1     ']'                b
+     #     2     '^'                c
+     #
+     # Notice that <tt>')'</tt> is not included, because b would have to have
+     # been called from a different context in rule a for ')' to be
+     # included.
+     #
+     # For error recovery, we cannot consider FOLLOW(c)
+     # (context-sensitive or otherwise). We need the combined set of
+     # all context-sensitive FOLLOW sets--the set of all tokens that
+     # could follow any reference in the call chain. We need to
+     # resync to one of those tokens. Note that FOLLOW(c)='^' and if
+     # we resync'd to that token, we'd consume until EOF. We need to
+     # sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
+     # In this case, for input "[]", LA(1) is in this set so we would
+     # not consume anything and after printing an error rule c would
+     # return normally. It would not find the required '^' though.
+     # At this point, it gets a mismatched token error and throws an
+     # exception (since LA(1) is not in the viable following token
+     # set). The rule exception handler tries to recover, but finds
+     # the same recovery set and doesn't consume anything. Rule b
+     # exits normally returning to rule a. Now it finds the ']' (and
+     # with the successful match exits errorRecovery mode).
+     #
+     # So, you can see that the parser walks up the call chain looking
+     # for the token that was a member of the recovery set.
+     #
+     # Errors are not generated in errorRecovery mode.
+     #
+     # ANTLR's error recovery mechanism is based upon original ideas:
+     #
+     # "Algorithms + Data Structures = Programs" by Niklaus Wirth
+     #
+     # and
+     #
+     # "A note on error recovery in recursive descent parsers":
+     # http://portal.acm.org/citation.cfm?id=947902.947905
+     #
+     # Later, Josef Grosch had some good ideas:
+     #
+     # "Efficient and Comfortable Error Recovery in Recursive Descent
+     # Parsers":
+     # ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip
+     #
+     # Like Grosch I implemented local FOLLOW sets that are combined
+     # at run-time upon error to avoid overhead during parsing.
+     def compute_error_recovery_set
+       combine_follows(false)
+     end
+
+     def recover_from_mismatched_token(type, follow)
+       if mismatch_is_unwanted_token?(type)
+         err = UnwantedToken(type)
+
+         begin_resync
+         @input.consume
+         end_resync
+
+         report_error(err)
+
+         matched_symbol = current_input_symbol
+         @input.consume
+         return matched_symbol
+       end
+
+       if mismatch_is_missing_token?(follow)
+         inserted = missing_symbol(err, type, follow)
+         err = MissingToken(type, inserted)
+
+         report_error(err)
+         return inserted
+       end
+
+       err = MismatchedToken(type)
+       raise err
+     end
+
+     def recover_from_mismatched_set(e, follow)
+       if mismatch_is_missing_token?(follow)
+         report_error(e)
+         return missing_symbol(e, INVALID_TOKEN_TYPE, follow)
+       end
+       raise e
+     end
+
+     # Conjure up a missing token during error recovery.
+     #
+     # The recognizer attempts to recover from single missing
+     # symbols. But, actions might refer to that missing symbol.
+     # For example, x=ID {f($x);}. The action clearly assumes
+     # that there has been an identifier matched previously and that
+     # $x points at that token. If that token is missing, but
+     # the next token in the stream is what we want we assume that
+     # this token is missing and we keep going. Because we
+     # have to return some token to replace the missing token,
+     # we have to conjure one up. This method gives the user control
+     # over the tokens returned for missing tokens. Mostly,
+     # you will want to create something special for identifier
+     # tokens. For literals such as '{' and ',', the default
+     # action in the parser or tree parser works. It simply creates
+     # a CommonToken of the appropriate type. The text will be the token.
+     # If you change what tokens must be created by the lexer,
+     # override this method to create the appropriate tokens.
+     def missing_symbol(error, expected_token_type, follow)
+       return nil
+     end
+
+     def recover_from_mismatched_element(e, follow)
+       follow.nil? and return false
+       if follow.include?(EOR_TOKEN_TYPE)
+         viable_tokens = compute_context_sensitive_rule_follow()
+         follow = (follow | viable_tokens) - Set.new([EOR_TOKEN_TYPE])
+       end
+       if follow.include?(@input.peek)
+         report_error(e)
+         return true
+       end
+       return false
+     end
+
+     def mismatch_is_unwanted_token?(type)
+       @input.peek(2) == type
+     end
+
+     def mismatch_is_missing_token?(follow)
+       follow.nil? and return false
+       if follow.include?(EOR_TOKEN_TYPE)
+         viable_tokens = compute_context_sensitive_rule_follow
+         follow = follow | viable_tokens
+
+         follow.delete(EOR_TOKEN_TYPE) unless @state.following.empty?
+       end
+       if follow.include?(@input.peek) or follow.include?(EOR_TOKEN_TYPE)
+         return true
+       end
+       return false
+     end
+
+     # returns the number of syntax errors tracked in the recognizer's
+     # shared state.
+     #
+     # to customize what happens upon token mismatch (so, for example,
+     # tree parsers can behave differently), override the
+     # recover_from_mismatched_* methods above in your recognizer --
+     # for instance, to bail out after the first error, just raise the
+     # exception instead of calling the recovery method.
+     def number_of_syntax_errors
+       @state.syntax_errors
+     end
+
+     # Compute the context-sensitive FOLLOW set for current rule.
+     # This is the set of token types that can follow a specific rule
+     # reference given a specific call chain. You get the set of
+     # viable tokens that can possibly come next (look depth 1)
+     # given the current call chain. Contrast this with the
+     # definition of plain FOLLOW for rule r:
+     #
+     #   FOLLOW(r) = { x | S =>* alpha r beta in G and x in FIRST(beta) }
+     #
+     # where x in T* and alpha, beta in V*; T is set of terminals and
+     # V is the set of terminals and nonterminals. In other words,
+     # FOLLOW(r) is the set of all tokens that can possibly follow
+     # references to r in *any* sentential form (context). At
+     # runtime, however, we know precisely which context applies as
+     # we have the call chain. We may compute the exact (rather
+     # than covering superset) set of following tokens.
+     #
+     # For example, consider grammar:
+     #
+     #   stat : ID '=' expr ';'      // FOLLOW(stat)=={EOF}
+     #        | "return" expr '.'
+     #        ;
+     #   expr : atom ('+' atom)* ;   // FOLLOW(expr)=={';','.',')'}
+     #   atom : INT                  // FOLLOW(atom)=={'+',')',';','.'}
+     #        | '(' expr ')'
+     #        ;
+     #
+     # The FOLLOW sets are all inclusive whereas context-sensitive
+     # FOLLOW sets are precisely what could follow a rule reference.
+     # For input "i=(3);", here is the derivation:
+     #
+     #   stat => ID '=' expr ';'
+     #        => ID '=' atom ('+' atom)* ';'
+     #        => ID '=' '(' expr ')' ('+' atom)* ';'
+     #        => ID '=' '(' atom ')' ('+' atom)* ';'
+     #        => ID '=' '(' INT ')' ('+' atom)* ';'
+     #        => ID '=' '(' INT ')' ';'
+     #
+     # At the "3" token, you'd have a call chain of
+     #
+     #   stat -> expr -> atom -> expr -> atom
+     #
+     # What can follow that specific nested ref to atom? Exactly ')'
+     # as you can see by looking at the derivation of this specific
+     # input. Contrast this with the FOLLOW(atom)={'+',')',';','.'}.
+     #
+     # You want the exact viable token set when recovering from a
+     # token mismatch. Upon token mismatch, if LA(1) is member of
+     # the viable next token set, then you know there is most likely
+     # a missing token in the input stream. "Insert" one by just not
+     # throwing an exception.
+     def compute_context_sensitive_rule_follow
+       combine_follows(true)
+     end
+
+     def combine_follows(exact)
+       follow_set = Set.new
+       @state.following.each_with_index.reverse_each do |local_follow_set, index|
+         follow_set |= local_follow_set
+         if exact
+           if local_follow_set.include?(EOR_TOKEN_TYPE)
+             follow_set.delete(EOR_TOKEN_TYPE) if index > 0
+           else
+             break
+           end
+         end
+       end
+       return follow_set
+     end
+
+     # Match needs to return the current input symbol, which gets put
+     # into the label for the associated token ref; e.g., x=ID. Token
+     # and tree parsers need to return different objects. Rather than test
+     # for input stream type or change the IntStream interface, I use
+     # a simple method to ask the recognizer to tell me what the current
+     # input symbol is.
+     #
+     # This is ignored for lexers.
+
+     def current_input_symbol
+       return nil
+     end
+
+     # Consume tokens until one matches the given token or token set
+     #
+     # token_types can be a single token type or a set of token types
+     def consume_until(token_types)
+       token_types.is_a?(Set) or token_types = Set.new(token_types.to_a)
+       type = @input.peek
+       until type == EOF or token_types.include?(type)
+         @input.consume
+         type = @input.peek
+       end
+       return(type)
+     end
+
+     def backtracking_level
+       @state.backtracking
+     end
+
+     def backtracking_level=(n)
+       @state.backtracking = n
+     end
+
+     def backtrack
+       @state.backtracking += 1
+       start = @input.mark
+       success =
+         begin yield
+         rescue BacktrackingFailed then false
+         else true
+         end
+       return success
+     ensure
+       @input.rewind(start)
+       @state.backtracking -= 1
+     end
+
+     def syntactic_predicate?(name)
+       backtrack { send(name) }
+     end
+
+     alias backtracking backtracking_level
+     alias backtracking= backtracking_level=
+
+     def rule_memoization(rule, start_index)
+       @state.rule_memory[rule] ||= Hash.new(MEMO_RULE_UNKNOWN)
+       @state.rule_memory[rule][start_index]
+     end
+
+     def already_parsed_rule?(rule)
+       stop_index = rule_memoization(rule, @input.index)
+       case stop_index
+       when MEMO_RULE_UNKNOWN then return false
+       when MEMO_RULE_FAILED then return true
+       else
+         @input.seek(stop_index + 1)
+       end
+       return true
+     end
+
+     def memoize(rule, start_index, success)
+       stop_index = success ? (@input.index - 1) : MEMO_RULE_FAILED
+       memo = @state.rule_memory[rule] and memo[start_index] = stop_index
+     end
+
+     def trace_in(rule_name, rule_index, input_symbol)
+       @error_output.printf("--> enter %s on %s", rule_name, input_symbol)
+       @state.backtracking > 0 and @error_output.printf(
+         " (in backtracking mode: depth = %s)", @state.backtracking
+       )
+       @error_output.print("\n")
+     end
+
+     def trace_out(rule_name, rule_index, input_symbol)
+       @error_output.printf("<-- exit %s on %s", rule_name, input_symbol)
+       @state.backtracking > 0 and @error_output.printf(
+         " (in backtracking mode: depth = %s)", @state.backtracking
+       )
+       @error_output.print("\n")
+     end
+
+     private
+
+     def initialize_dfas
+       # do nothing
+     end
+   end
+
+ =begin rdoc ANTLR3::Lexer
+
+ = Lexer
+
+ Lexer is the default superclass of all lexers generated by ANTLR. The class
+ tailors the core functionality provided by BaseRecognizer to the task of
+ matching patterns in the text input and breaking the input into tokens.
+
+ == About Lexers
+
+ A lexer's job is to take input text and break it up into _tokens_ -- objects
+ that encapsulate a piece of text, a type label (such as ID or INTEGER), and the
+ position of the text with respect to the input. Thus, a lexer is essentially a
+ complicated iterator that steps through an input stream and produces a sequence
+ of tokens. Sometimes lexers are enough to carry out a goal on their own, as in
+ tasks like source code highlighting and simple code analysis. Usually, however,
+ the lexer converts text into tokens for use by a parser, which recognizes larger
+ structures within the text.
+
+ ANTLR parsers have a variety of entry points specified by parser rules, each of
+ which defines the structure of a specific type of sentence in a grammar. Lexers,
+ however, are primarily intended to have a single entry point: the lexer looks at
+ the characters starting at the current input position, decides whether the chunk
+ of text matches one of a number of possible token type definitions, wraps the
+ chunk into a token with information on its type and location, and advances the
+ input stream to the next place.
+
+ == ANTLR Lexers and the Lexer API
+
+ ANTLR-generated lexers will subclass this class, unless specified otherwise
+ within a grammar file. The generated class will provide an implementation of
+ each lexer rule as a method of the same name. The subclass will also provide an
+ implementation for the abstract method #m_tokens, the purpose of which is to
+ multiplex the token type definitions and predict what rule definition to execute
+ to fetch a token. The primary method in the lexer API, #next_token, uses
+ #m_tokens to fetch the next token and drive the iteration.
+
+ If the lexer is preparing tokens for use by an ANTLR generated parser, the lexer
+ will generally be used to build a TokenStream object. The following code example
+ demonstrates the typical setup for using ANTLR parsers and lexers in Ruby.
+
+   # in HypotheticalLexer.rb
+   module Hypothetical
+     class Lexer < ANTLR3::Lexer
+       # ...
+       # ANTLR generated code
+       # ...
+     end
+   end
+
+   # in HypotheticalParser.rb
+   module Hypothetical
+     class Parser < ANTLR3::Parser
+       # ...
+       # more ANTLR generated code
+       # ...
+     end
+   end
+
+   # to take hypothetical source code and prepare it for parsing,
+   # there is generally a four-step construction process
+
+   source = "some hypothetical source code"
+   input = ANTLR3::StringStream.new(source, :file => 'blah-de-blah.hyp')
+   lexer = Hypothetical::Lexer.new(input)
+   tokens = ANTLR3::CommonTokenStream.new(lexer)
+   parser = Hypothetical::Parser.new(tokens)
+
+   # if you're using the standard streams, ANTLR3::StringStream and
+   # ANTLR3::CommonTokenStream, you can write the same process
+   # shown above more succinctly:
+
+   lexer = Hypothetical::Lexer.new("some hypothetical source code", :file => 'blah-de-blah.hyp')
+   parser = Hypothetical::Parser.new(lexer)
+
+ =end
+   class Lexer < BaseRecognizer
+     include TokenSource
+     @token_class = CommonToken
+
+     def self.default_rule
+       @default_rule ||= :token!
+     end
+
+     def self.main(argv = ARGV, options = {})
+       if argv.is_a?(::Hash) then argv, options = ARGV, argv end
+       main = ANTLR3::Main::LexerMain.new(self, options)
+       block_given? ? yield(main) : main.execute(argv)
+     end
+
+     def self.associated_parser
+       @grammar_home and @grammar_home::Parser
+     rescue NameError
+       grammar_name = @grammar_home.name.split("::").last
+       begin
+         require "#{grammar_name}Parser"
+       rescue LoadError => e
+         return nil
+       end
+       return @grammar_home::Parser rescue nil
+     end
+
+     # (redefinition: this memoized version supersedes the one above)
+     def self.associated_parser
+       @associated_parser ||= begin
+         @grammar_home and @grammar_home::Parser
+       rescue NameError
+         grammar_name = @grammar_home.name.split("::").last
+         begin
+           require "#{grammar_name}Parser"
+           @grammar_home::Parser
+         rescue LoadError, NameError
+         end
+       end
+     end
+
+     def initialize(input, options = {})
+       super(options)
+       @input =
+         case input
+         when ::String then StringStream.new(input, options)
+         when ::IO then FileStream.new(input, options)
+         else input
+         end
+     end
+
+     def next_token
+       loop do
+         @state.token = nil
+         @state.channel = DEFAULT_CHANNEL
+         @state.token_start_position = @input.index
+         @state.token_start_column = @input.column
+         @state.token_start_line = @input.line
+         @state.text = nil
+         @input.peek == EOF and return EOF_TOKEN
+         begin
+           token!
+
+           case token = @state.token
+           when nil then return(emit())
+           when SKIP_TOKEN then next
+           else
+             return token
+           end
+         rescue NoViableAlternative => re
+           report_error(re)
+           recover(re)
+         rescue Error::RecognitionError => re
+           report_error(re)
+         end
+       end
+     end
+
+     def skip
+       @state.token = SKIP_TOKEN
+     end
+
+     abstract :token!
+
+     def exhaust
+       self.to_a
+     end
+
+     def char_stream=(input)
+       @input = nil
+       reset()
+       @input = input
+     end
+
+     def source_name
+       @input.source_name
+     end
+
+     def emit(token = nil)
+       token ||= create_token
+       @state.token = token
+       return token
+     end
+
+     def match(expected)
+       case expected
+       when String
+         expected.each_byte do |char|
+           unless @input.peek == char
+             @state.backtracking > 0 and raise BacktrackingFailed
+             error = MismatchedToken(char)
+             recover(error)
+             raise error
+           end
+           @input.consume()
+         end
+       else # single integer character
+         unless @input.peek == expected
+           @state.backtracking > 0 and raise BacktrackingFailed
+           error = MismatchedToken(expected)
+           recover(error)
+           raise error
+         end
+         @input.consume
+       end
+       return true
+     end
+
+     def match_any
+       @input.consume
+     end
+
+     def match_range(min, max)
+       char = @input.peek
+       if char.between?(min, max) then @input.consume
+       else
+         @state.backtracking > 0 and raise BacktrackingFailed
+         error = MismatchedRange(min.chr, max.chr)
+         recover(error)
+         raise(error)
+       end
+       return true
+     end
+
+     def line
+       @input.line
+     end
+
+     def column
+       @input.column
+     end
+
+     def character_index
+       @input.index
+     end
+
+     def text
+       @state.text and return @state.text
+       @input.substring(@state.token_start_position, character_index - 1)
+     end
+
+     def text=(text)
+       @state.text = text
+     end
+
+     def report_error(e)
+       display_recognition_error(e)
+     end
+
+     def error_message(e)
+       char = character_error_display(e.symbol) rescue nil
+       case e
+       when Error::MismatchedToken
+         expecting = character_error_display(e.expecting)
+         "mismatched character #{char}; expecting #{expecting}"
+       when Error::NoViableAlternative
+         "no viable alternative at character #{char}"
+       when Error::EarlyExit
+         "required (...)+ loop did not match anything at character #{char}"
+       when Error::MismatchedNotSet
+         "mismatched character %s; expecting set %p" % [char, e.expecting]
+       when Error::MismatchedSet
+         "mismatched character %s; expecting set %p" % [char, e.expecting]
+       when Error::MismatchedRange
+         a = character_error_display(e.min)
+         b = character_error_display(e.max)
+         "mismatched character %s; expecting set %s..%s" % [char, a, b]
+       else super
+       end
+     end
+
+     def character_error_display(char)
+       case char
+       when EOF then '<EOF>'
+       when Integer then char.chr.inspect
+       else char.inspect
+       end
+     end
+
+     def recover(re)
+       @input.consume
+     end
+
+     private
+
+     def trace_in(rule_name, rule_index)
+       if symbol = @input.look and symbol != EOF then symbol = symbol.inspect
+       else symbol = '<EOF>' end
+       input_symbol = "#{symbol} @ line #{line} / col #{column}"
+       super(rule_name, rule_index, input_symbol)
+     end
+
+     def trace_out(rule_name, rule_index)
+       if symbol = @input.look and symbol != EOF then symbol = symbol.inspect
+       else symbol = '<EOF>' end
+       input_symbol = "#{symbol} @ line #{line} / col #{column}"
+       super(rule_name, rule_index, input_symbol)
+     end
+
+     def create_token(&b)
+       if block_given? then super(&b)
+       else
+         super do |t|
+           t.input = @input
+           t.type = @state.type
+           t.channel = @state.channel
+           t.start = @state.token_start_position
+           t.stop = @input.index - 1
+           t.line = @state.token_start_line
+           t.text = self.text
+           t.column = @state.token_start_column
+         end
+       end
+     end
+   end
+
+
+ =begin rdoc ANTLR3::Parser
+
+ = Parser
+
+ Parser is the default base class of ANTLR-generated parser classes. The class
+ tailors the functionality provided by BaseRecognizer to the task of parsing.
+
+ == About Parsing
+
+ This is just a loose overview of parsing. For considerably more in-depth coverage
+ of the topic, read the ANTLR documentation or check out the ANTLR website
+ (http://www.antlr.org).
+
+ A grammar defines the vocabulary and the sentence structure of a language. While
+ a lexer deals with the basic vocabulary symbols of the language, a parser's
+ primary task is to implement the sentence structure.
+
+ Parsers are set up by providing a stream of tokens, which is usually created by
+ a corresponding lexer. Then, the user requests a specific sentence-structure
+ within the grammar, such as "class_definition" or "xml_node", from the parser.
+ The parser iterates through the tokens, verifying the syntax of the sentence and
+ performing actions specified by the grammar. It stops when it encounters an
+ error or when it has matched the full sentence according to its defined
+ structure.
+
+ == ANTLR Parsers and the Parser API
+
+ Plain ANTLR-generated parsers directly subclass this class, unless specified
+ otherwise within the grammar options. The generated code will provide a method
+ for each parser rule defined in the ANTLR grammar, as well as any other
+ customized member attributes and methods specified in the source grammar.
+
+ This class does not override much of the functionality in BaseRecognizer, and
+ thus the API closely mirrors BaseRecognizer.
+
+ =end
+   class Parser < BaseRecognizer
+     def self.main(argv = ARGV, options = {})
+       if argv.is_a?(::Hash) then argv, options = ARGV, argv end
+       main = ANTLR3::Main::ParserMain.new(self, options)
+       block_given? ? yield(main) : main.execute(argv)
+     end
+
+     def self.associated_lexer
+       @associated_lexer ||= begin
+         @grammar_home and @grammar_home::Lexer
+       rescue NameError
+         grammar_name = @grammar_home.name.split("::").last
+         begin
+           require "#{grammar_name}Lexer"
+           @grammar_home::Lexer
+         rescue LoadError, NameError
+         end
+       end
+     end
+
+     def initialize(input, options = {})
+       super(options)
+       @input = nil
+       reset
+       input = cast_input( input, options ) unless TokenStream === input
+       @input = input
+     end
+
+     def current_input_symbol
+       @input.look
+     end
+
+     def missing_symbol(error, expected_type, follow)
+       current = @input.look
+       current = @input.look(-1) if current == ANTLR3::EOF_TOKEN
+       t =
+         case
+         when current && current != ANTLR3::EOF_TOKEN then current.clone
+         when @input.token_class then @input.token_class.new
+         else (create_token rescue CommonToken.new)
+         end
+
+       t.type = expected_type
+       name = t.name.gsub(/(^<)|(>$)/,'')
+       t.text = "<missing #{name}>"
+       t.channel = DEFAULT_CHANNEL
+       return(t)
+     end
+
+     def token_stream=(input)
+       @input = nil
+       reset
+       @input = input
+     end
+     alias token_stream input
+
+     def source_name
+       @input.source_name
+     end
+
+     private
+
+     def trace_in(rule_name, rule_index)
+       super(rule_name, rule_index, @input.look.inspect)
+     end
+
+     def trace_out(rule_name, rule_index)
+       super(rule_name, rule_index, @input.look.inspect)
+     end
+
+     def cast_input( input, options )
+       case input
+       when TokenSource then CommonTokenStream.new( input, options )
+       when IO, String
+         if lexer_class = self.class.associated_lexer
+           CommonTokenStream.new( lexer_class.new( input, options ), options )
+         else
+           raise ArgumentError, Util.tidy( <<-END, true )
+           | unable to automatically convert input #{ input.inspect }
+           | to an ANTLR3::TokenStream object as #{ self.class }
+           | does not appear to have an associated lexer class
+           END
+         end
+       else
+         # assume it's a stream if it at least implements peek and consume
+         unless input.respond_to?( :peek ) and input.respond_to?( :consume )
+           raise ArgumentError, Util.tidy(<<-END, true)
+           | #{ self.class } requires a token stream as input, but
+           | #{ input.inspect } was provided
+           END
+         end
+         input
+       end
+     end
+
+   end
+
+ end
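
A closing sketch of why RecognizerSharedState is externalized: since
BaseRecognizer#initialize accepts a :state option, composed (delegating)
recognizers can be handed one state object and stay in sync, as the
RecognizerSharedState rdoc describes. The Composite classes and the tokens
stream are hypothetical:

  state  = ANTLR3::RecognizerSharedState.new
  outer  = Composite::Parser.new( tokens, :state => state )
  helper = Composite::DelegateParser.new( tokens, :state => state )
  # both recognizers now share backtracking depth, follow-set stack,
  # and error bookkeeping:
  state.syntax_errors    # => error count accumulated across both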