antlr3 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. data/ANTLR-LICENSE.txt +26 -0
  2. data/History.txt +66 -0
  3. data/README.txt +139 -0
  4. data/bin/antlr4ruby +33 -0
  5. data/java/RubyTarget.java +524 -0
  6. data/java/antlr-full-3.2.1.jar +0 -0
  7. data/lib/antlr3.rb +176 -0
  8. data/lib/antlr3/constants.rb +88 -0
  9. data/lib/antlr3/debug.rb +701 -0
  10. data/lib/antlr3/debug/event-hub.rb +210 -0
  11. data/lib/antlr3/debug/record-event-listener.rb +25 -0
  12. data/lib/antlr3/debug/rule-tracer.rb +55 -0
  13. data/lib/antlr3/debug/socket.rb +360 -0
  14. data/lib/antlr3/debug/trace-event-listener.rb +92 -0
  15. data/lib/antlr3/dfa.rb +247 -0
  16. data/lib/antlr3/dot.rb +174 -0
  17. data/lib/antlr3/error.rb +657 -0
  18. data/lib/antlr3/main.rb +561 -0
  19. data/lib/antlr3/modes/ast-builder.rb +41 -0
  20. data/lib/antlr3/modes/filter.rb +56 -0
  21. data/lib/antlr3/profile.rb +322 -0
  22. data/lib/antlr3/recognizers.rb +1280 -0
  23. data/lib/antlr3/streams.rb +985 -0
  24. data/lib/antlr3/streams/interactive.rb +91 -0
  25. data/lib/antlr3/streams/rewrite.rb +412 -0
  26. data/lib/antlr3/test/call-stack.rb +57 -0
  27. data/lib/antlr3/test/config.rb +23 -0
  28. data/lib/antlr3/test/core-extensions.rb +269 -0
  29. data/lib/antlr3/test/diff.rb +165 -0
  30. data/lib/antlr3/test/functional.rb +207 -0
  31. data/lib/antlr3/test/grammar.rb +371 -0
  32. data/lib/antlr3/token.rb +592 -0
  33. data/lib/antlr3/tree.rb +1415 -0
  34. data/lib/antlr3/tree/debug.rb +163 -0
  35. data/lib/antlr3/tree/visitor.rb +84 -0
  36. data/lib/antlr3/tree/wizard.rb +481 -0
  37. data/lib/antlr3/util.rb +149 -0
  38. data/lib/antlr3/version.rb +27 -0
  39. data/samples/ANTLRv3Grammar.g +621 -0
  40. data/samples/Cpp.g +749 -0
  41. data/templates/AST.stg +335 -0
  42. data/templates/ASTDbg.stg +40 -0
  43. data/templates/ASTParser.stg +153 -0
  44. data/templates/ASTTreeParser.stg +272 -0
  45. data/templates/Dbg.stg +192 -0
  46. data/templates/Ruby.stg +1514 -0
  47. data/test/functional/ast-output/auto-ast.rb +797 -0
  48. data/test/functional/ast-output/construction.rb +555 -0
  49. data/test/functional/ast-output/hetero-nodes.rb +753 -0
  50. data/test/functional/ast-output/rewrites.rb +1327 -0
  51. data/test/functional/ast-output/tree-rewrite.rb +1662 -0
  52. data/test/functional/debugging/debug-mode.rb +689 -0
  53. data/test/functional/debugging/profile-mode.rb +165 -0
  54. data/test/functional/debugging/rule-tracing.rb +74 -0
  55. data/test/functional/delegation/import.rb +379 -0
  56. data/test/functional/lexer/basic.rb +559 -0
  57. data/test/functional/lexer/filter-mode.rb +245 -0
  58. data/test/functional/lexer/nuances.rb +47 -0
  59. data/test/functional/lexer/properties.rb +104 -0
  60. data/test/functional/lexer/syn-pred.rb +32 -0
  61. data/test/functional/lexer/xml.rb +206 -0
  62. data/test/functional/main/main-scripts.rb +245 -0
  63. data/test/functional/parser/actions.rb +224 -0
  64. data/test/functional/parser/backtracking.rb +244 -0
  65. data/test/functional/parser/basic.rb +282 -0
  66. data/test/functional/parser/calc.rb +98 -0
  67. data/test/functional/parser/ll-star.rb +143 -0
  68. data/test/functional/parser/nuances.rb +165 -0
  69. data/test/functional/parser/predicates.rb +103 -0
  70. data/test/functional/parser/properties.rb +242 -0
  71. data/test/functional/parser/rule-methods.rb +132 -0
  72. data/test/functional/parser/scopes.rb +274 -0
  73. data/test/functional/token-rewrite/basic.rb +318 -0
  74. data/test/functional/token-rewrite/via-parser.rb +100 -0
  75. data/test/functional/tree-parser/basic.rb +750 -0
  76. data/test/unit/sample-input/file-stream-1 +2 -0
  77. data/test/unit/sample-input/teststreams.input2 +2 -0
  78. data/test/unit/test-dfa.rb +52 -0
  79. data/test/unit/test-exceptions.rb +44 -0
  80. data/test/unit/test-recognizers.rb +55 -0
  81. data/test/unit/test-scheme.rb +62 -0
  82. data/test/unit/test-streams.rb +459 -0
  83. data/test/unit/test-tree-wizard.rb +535 -0
  84. data/test/unit/test-trees.rb +854 -0
  85. metadata +205 -0
@@ -0,0 +1,41 @@
+ #!/usr/bin/ruby
+ # encoding: utf-8
+ require 'antlr3'
+ require 'antlr3/tree'
+
+ module ANTLR3
+   module ASTBuilder
+     extend ClassMacros
+
+     def self.included(klass)
+       def klass.return_scope_members
+         super.push(:tree)
+       end
+     end
+
+     def initialize( input, options = {} )
+       @adaptor = options[:adaptor] ||= begin
+         (input.adaptor rescue nil) or
+           AST::CommonTreeAdaptor.new( token_class )
+       end
+       super( input, options )
+     end
+
+     shared_attribute( :adaptor )
+
+     private
+
+     def subtree_stream(desc, element = nil)
+       AST::RewriteRuleSubtreeStream.new(@adaptor, desc, element)
+     end
+
+     def token_stream(desc, element = nil)
+       AST::RewriteRuleTokenStream.new(@adaptor, desc, element)
+     end
+
+     def node_stream(desc, element = nil)
+       AST::RewriteRuleNodeStream.new(@adaptor, desc, element)
+     end
+
+   end
+ end
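
The included hook above is the subtle part of this file: mixing ASTBuilder into a
generated parser class redefines that class's return_scope_members, so every
rule-return structure gains a :tree field on top of BaseRecognizer's default
[:start, :stop] (shown later in this diff). A self-contained sketch of the same
pattern, using hypothetical Base and Host classes rather than anything in this
package:

    module TreeReturns
      def self.included( klass )
        # reopen the including class and override its class-level hook;
        # super still reaches the inherited implementation
        def klass.return_scope_members
          super.push( :tree )
        end
      end
    end

    class Base
      def self.return_scope_members
        [ :start, :stop ]
      end
    end

    class Host < Base
      include TreeReturns
    end

    p Host.return_scope_members  # => [:start, :stop, :tree]
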
@@ -0,0 +1,56 @@
+ #!/usr/bin/ruby
+ # encoding: utf-8
+
+ require 'antlr3'
+
+ module ANTLR3
+ =begin rdoc ANTLR3::FilterMode
+
+ If a lexer grammar specifies the <tt>filter = true</tt> option, the generated
+ Lexer code will include this module. It modifies the standard
+ <tt>next_token</tt> to catch RecognitionErrors and skip ahead in the input until
+ the token! method can match a token without raising a RecognitionError.
+
+ See http://www.antlr.org/wiki/display/ANTLR3/Lexical+filters for more info on
+ lexer filter mode.
+
+ =end
+   module FilterMode
+     def next_token
+       @input.peek == ANTLR3::EOF and return ANTLR3::EOF_TOKEN
+       @state.token = nil
+       @state.channel = ANTLR3::DEFAULT_CHANNEL
+       @state.token_start_position = @input.index
+       @state.token_start_column = @input.column
+       @state.token_start_line = @input.line
+       @state.text = nil
+       @state.backtracking = 1
+       m = @input.mark
+       # means we won't throw slow exception
+       token!
+       @input.release(m)
+       emit
+       return @state.token
+     rescue ANTLR3::BacktrackingFailed
+       # token! backtracks with synpred at backtracking==2
+       # and we set the synpredgate to allow actions at level 1.
+       @input.rewind(m)
+       @input.consume # advance one char and try again
+       retry
+     rescue ANTLR3::Error::RecognitionError => re
+       # shouldn't happen in backtracking mode, but...
+       report_error(re)
+       recover(re)
+     ensure
+       @state.backtracking = 0
+     end
+
+     def memoize(rule, start_index, success)
+       super(rule, start_index, success) if @state.backtracking > 1
+     end
+
+     def already_parsed_rule?(rule)
+       @state.backtracking > 1 ? super(rule) : false
+     end
+   end
+ end
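
In practice, filter mode means the lexer yields only the tokens it can recognize
and silently discards everything else. A sketch of typical usage, assuming a
hypothetical filter-mode grammar named FuzzyJava compiled with the Ruby target
(the generated FuzzyJava::Lexer would include ANTLR3::FilterMode automatically
because of the filter = true option):

    require 'antlr3'
    require 'FuzzyJavaLexer'   # hypothetical ANTLR-generated lexer

    lexer = FuzzyJava::Lexer.new( File.read( 'Example.java' ) )
    # lexers are enumerable token sources, so input that matches no
    # rule is simply absent from the iteration:
    lexer.each do |token|
      puts "%-10s %p" % [ token.name, token.text ]
    end
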
@@ -0,0 +1,322 @@
+ #!/usr/bin/ruby
+ # encoding: utf-8
+
+ module ANTLR3
+   module Profile
+ =begin rdoc ANTLR3::Profile::ParserEvents
+
+ ANTLR3::Profile::ParserEvents expands basic debugging events for use by
+ recognition code generated by ANTLR when it is run with the <tt>-profile</tt>
+ switch.
+
+ =end
+     module ParserEvents
+       include ANTLR3::Debug::ParserEvents
+
+       def initialize(stream, options = {})
+         options[:debug_listener] ||= Profiler.new( self )
+         super( stream, options )
+       end
+
+       def already_parsed_rule?(rule)
+         @debug_listener.examine_rule_memoization(rule)
+         super
+       end
+
+       def profile
+         @debug_listener.profile
+       end
+
+       def memoize(rule, start_index, success)
+         @debug_listener.memoize(rule, start_index, success)
+         super
+       end
+     end
+
+     class DataSet < ::Array
+       include ::Math
+       def total
+         inject(:+)
+       end
+       def average
+         length > 0 ? (total.to_f / length) : 0
+       end
+       def variance
+         length.zero? and return(0.0)
+         mean = average
+         inject(0.0) { |t, i| t + (i - mean)**2 } / (length - 1)
+       end
+       def standard_deviation
+         sqrt(variance)
+       end
+     end
+
+
+
+
+
+     unless const_defined?(:Profile)
+       Profile = Struct.new(
+         :grammar_file, :parser_class, :top_rule,
+         :rule_invocations, :guessing_rule_invocations, :rule_invocation_depth,
+         :fixed_looks, :cyclic_looks, :syntactic_predicate_looks,
+         :memoization_cache_entries, :memoization_cache_hits,
+         :memoization_cache_misses, :tokens, :hidden_tokens,
+         :characters_matched, :hidden_characters_matched, :semantic_predicates,
+         :syntactic_predicates, :reported_errors
+       )
+     end
+
+     class Profile
+       def initialize
+         init_values = Array.new(self.class.members.length, 0)
+         super(*init_values)
+         self.top_rule = self.parser_class = self.grammar_file = nil
+         self.fixed_looks = DataSet.new
+         self.cyclic_looks = DataSet.new
+         self.syntactic_predicate_looks = DataSet.new
+       end
+
+       def fixed_decisions
+         fixed_looks.length
+       end
+
+       def cyclic_decisions
+         cyclic_looks.length
+       end
+
+       def backtracking_decisions
+         syntactic_predicate_looks.length
+       end
+
+       def generate_report
+         report = '+' << '-' * 78 << "+\n"
+         report << '| ' << "ANTLR Rule Profile".center(76) << " |\n"
+         report << '+' << '-' * 78 << "+\n"
+         report << "| Generated at #{Time.now}".ljust(78) << " |\n"
+         report << "| Profiled #{parser_class.name}##{top_rule}".ljust(78) << " |\n"
+         report << "| Rule source generated from grammar file #{grammar_file}".ljust(78) << " |\n"
+         report << '+' << '-' * 78 << "+\n"
+
+         report << '| ' << "Rule Invocations".center(76) << " |\n"
+         report << '+' << '-' * 68 << '+' << '-' * 9 << "+\n"
+         report << "| %-66s | %7i |\n" % ["Total Invocations", rule_invocations]
+         report << "| %-66s | %7i |\n" % ["``Guessing'' Invocations", guessing_rule_invocations]
+         report << "| %-66s | %7i |\n" % ["Deepest Level of Invocation", rule_invocation_depth]
+         report << '+' << '-' * 68 << '+' << '-' * 9 << "+\n"
+
+         report << '| ' << "Execution Events".center(76) << " |\n"
+         report << '+' << '-' * 68 << '+' << '-' * 9 << "+\n"
+         report << "| %-66s | %7i |\n" % ["Semantic Predicates Evaluated", semantic_predicates]
+         report << "| %-66s | %7i |\n" % ["Syntactic Predicates Evaluated", syntactic_predicates]
+         report << "| %-66s | %7i |\n" % ["Errors Reported", reported_errors]
+         report << '+' << '-' * 68 << '+' << '-' * 9 << "+\n"
+
+         report << '| ' << "Token and Character Data".center(76) << " |\n"
+         report << '+' << '-' * 68 << '+' << '-' * 9 << "+\n"
+         report << "| %-66s | %7i |\n" % ["Tokens Consumed", tokens]
+         report << "| %-66s | %7i |\n" % ["Hidden Tokens Consumed", hidden_tokens]
+         report << "| %-66s | %7i |\n" % ["Characters Matched", characters_matched]
+         report << "| %-66s | %7i |\n" % ["Hidden Characters Matched", hidden_characters_matched]
+         report << '+' << '-' * 68 << '+' << '-' * 9 << "+\n"
+
+         report << '| ' << "Memoization".center(76) << " |\n"
+         report << '+' << '-' * 68 << '+' << '-' * 9 << "+\n"
+         report << "| %-66s | %7i |\n" % ["Cache Entries", memoization_cache_entries]
+         report << "| %-66s | %7i |\n" % ["Cache Hits", memoization_cache_hits]
+         report << "| %-66s | %7i |\n" % ["Cache Misses", memoization_cache_misses]
+         report << '+' << '-' * 68 << '+' << '-' * 9 << "+\n"
+
+         [
+           ['Fixed Lookahead (k)', fixed_looks],
+           ['Arbitrary Lookahead (k)', cyclic_looks],
+           ['Backtracking (Syntactic Predicate)', syntactic_predicate_looks]
+         ].each do |name, set|
+           mean, stdev = '%4.2f' % set.average, '%4.2f' % set.standard_deviation
+           report << '| ' << "#{name} Decisions".center(76) << " |\n"
+           report << '+' << '-' * 68 << '+' << '-' * 9 << "+\n"
+           report << "| %-66s | %7i |\n" % ["Count", set.length]
+           report << "| %-66s | %7i |\n" % ["Minimum k", set.min]
+           report << "| %-66s | %7i |\n" % ["Maximum k", set.max]
+           report << "| %-66s | %7s |\n" % ["Average k", mean]
+           report << "| %-66s | %7s |\n" % ["Standard Deviation of k", stdev]
+           report << '+' << '-' * 68 << '+' << '-' * 9 << "+\n"
+         end
+         return(report)
+       end
+     end
+
+ =begin rdoc ANTLR3::Profile::Profiler
+
+ When ANTLR is run with the <tt>-profile</tt> switch, it generates recognition
+ code that records statistics about the decisions made while parsing
+ any given input. This information can be used to help refactor a slow grammar.
+ Profiler is an event-listener that performs all of the profiling accounting and
+ builds a simple report to present the various statistics.
+
+ =end
+     class Profiler
+       include ANTLR3::Debug::EventListener
+
+       PROTOCOL_VERSION = 2
+
+       attr_accessor :parser
+       attr_reader :rule_level
+       attr_reader :decision_level
+
+       # tracks the maximum look value for the current decision
+       # (maxLookaheadInCurrentDecision in java Profiler)
+       attr_reader :decision_look
+
+       # the last token consumed
+       # (lastTokenConsumed in java Profiler)
+       attr_reader :last_token
+       attr_reader :look_stack
+       attr_reader :profile
+
+       attr_accessor :output
+
+       def initialize(parser = nil, output = nil)
+         @parser = parser
+         @profile = nil
+         @rule_level = 0
+         @decision_level = 0
+         @decision_look = 0
+         @last_token = nil
+         @look_stack = []
+         @output = output
+       end
+
+       def commence
+         @profile = Profile.new
+         @rule_level = 0
+         @decision_level = 0
+         @decision_look = 0
+         @last_token = nil
+         @look_stack = []
+       end
+
+       def enter_rule(grammar_file_name, rule_name)
+         if @rule_level.zero?
+           commence
+           @profile.grammar_file = grammar_file_name
+           @profile.parser_class = @parser.class
+           @profile.top_rule = rule_name
+         end
+         @rule_level += 1
+         @profile.rule_invocations += 1
+         @profile.rule_invocation_depth < @rule_level and
+           @profile.rule_invocation_depth = @rule_level
+       end
+
+       def exit_rule(grammar_file_name, rule_name)
+         @rule_level -= 1
+       end
+
+       def examine_rule_memoization(rule)
+         stop_index = parser.rule_memoization(rule, @parser.input.index)
+         if stop_index == BaseRecognizer::MEMO_RULE_UNKNOWN
+           @profile.memoization_cache_misses += 1
+           @profile.guessing_rule_invocations += 1
+         else
+           @profile.memoization_cache_hits += 1
+         end
+       end
+
+       def memoize(rule, start_index, success)
+         @profile.memoization_cache_entries += 1
+       end
+
+
+       def enter_decision(decision_number)
+         @decision_level += 1
+         starting_look_index = @parser.token_stream.index
+         @look_stack << starting_look_index
+       end
+
+       def exit_decision(decision_number)
+         @look_stack.pop
+         @decision_level -= 1
+         if @parser.cyclic_decision? then
+           @profile.cyclic_looks << @decision_look
+         else @profile.fixed_looks << @decision_look
+         end
+
+         @parser.cyclic_decision = false
+         @decision_look = 0
+       end
+
+       def consume_token(token)
+         @last_token = token
+       end
+
+       def in_decision?
+         return(@decision_level > 0)
+       end
+
+       def consume_hidden_token(token)
+         @last_token = token
+       end
+
+       def look(i, token)
+         in_decision? or return
+         starting_index = look_stack.last
+         input = @parser.token_stream
+         this_ref_index = input.index
+         num_hidden = input.tokens(starting_index, this_ref_index).count { |t| t.hidden? }
+         depth = i + this_ref_index - starting_index - num_hidden
+         if depth > @decision_look
+           @decision_look = depth
+         end
+       end
+
+       def end_backtrack(level, successful)
+         @profile.syntactic_predicate_looks << @decision_look
+       end
+
+       def recognition_exception(error)
+         @profile.reported_errors += 1
+       end
+
+       def semantic_predicate(result, predicate)
+         in_decision? and @profile.semantic_predicates += 1
+       end
+
+       def terminate
+         input = @parser.token_stream
+         hidden_tokens = input.select { |token| token.hidden? }
+         @profile.hidden_tokens = hidden_tokens.length
+         @profile.tokens = input.tokens.length
+         @profile.hidden_characters_matched = hidden_tokens.inject(0) do |count, token|
+           count + token.text.length rescue count
+         end
+         @profile.characters_matched = (@last_token || input.tokens.last).stop + 1
+         write_report
+       end
+
+
+       def write_report
+         @output << @profile.generate_report unless @output.nil?
+       rescue NoMethodError => error
+         if error.name.to_s == '<<'
+           warn(<<-END.strip! % [__FILE__, __LINE__, @output])
+           [%s @ %s]: failed to write report to %p as it does not respond to :<<
+           END
+         else raise
+         end
+       rescue IOError => error
+         $stderr.puts( Util.tidy(<<-END) % [__FILE__, __LINE__, @output, error.class, error.message])
+         | [%s @ %s]: failed to write profile report to %p due to an IO Error:
+         | %s: %s
+         END
+         $stderr.puts(error.backtrace.map { |call| " - #{call}" }.join("\n"))
+       end
+
+       def report
+         @profile.generate_report
+       end
+
+       alias to_s report
+     end
+   end
+ end
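
DataSet, defined above, is just an Array subclass that reports sample statistics
over the recorded lookahead depths; note that variance divides by (length - 1),
i.e. it computes the sample variance. Since the class has no dependencies beyond
the core runtime, it can be exercised directly:

    require 'antlr3'
    require 'antlr3/profile'

    looks = ANTLR3::Profile::DataSet.new( [ 1, 1, 2, 3, 5 ] )
    looks.total               # => 12
    looks.average             # => 2.4
    looks.variance            # => 2.8
    looks.standard_deviation  # => 1.673...
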
@@ -0,0 +1,1280 @@
+ #!/usr/bin/ruby
+ # encoding: utf-8
+
+ =begin LICENSE
+
+ [The "BSD licence"]
+ Copyright (c) 2009 Kyle Yetter
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ 1. Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+ 3. The name of the author may not be used to endorse or promote products
+    derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ =end
+
+ module ANTLR3
+   unless const_defined?(:RecognizerSharedState)
+
+     RecognizerSharedState = Struct.new(
+       :following,
+       :error_recovery,
+       :last_error_index,
+       :backtracking,
+       :rule_memory,
+       :syntax_errors,
+       :token,
+       :token_start_position,
+       :token_start_line,
+       :token_start_column,
+       :channel,
+       :type,
+       :text
+     )
+
+ =begin rdoc ANTLR3::RecognizerSharedState
+
+ A big Struct-based class containing most of the data that makes up a
+ recognizer's state. These attributes are externalized from the recognizer itself
+ so that recognizer delegation (which occurs when you import other grammars into
+ your grammar) can function; multiple recognizers can share a common state.
+
+ == Structure Attributes
+
+ following::
+   a stack that tracks follow sets for error recovery
+ error_recovery::
+   a flag indicating whether or not the recognizer is in error recovery mode
+ last_error_index::
+   the index in the input stream of the last error
+ backtracking::
+   tracks the backtracking depth
+ rule_memory::
+   if a grammar is compiled with the memoization option, this will be
+   set to a hash mapping previously parsed rules to cached indices
+ syntax_errors::
+   tracks the number of syntax errors seen so far
+ token::
+   holds newly constructed tokens for lexer rules
+ token_start_position::
+   the input stream index at which the token starts
+ token_start_line::
+   the input stream line number at which the token starts
+ token_start_column::
+   the input stream column at which the token starts
+ channel::
+   the channel value of the target token
+ type::
+   the type value of the target token
+ text::
+   the text of the target token
+
+ =end
+     class RecognizerSharedState
+       def initialize
+         super([], false, -1, 0, nil, 0, nil, -1)
+         # ^-- same as this --v
+         # self.following = []
+         # self.error_recovery = false
+         # self.last_error_index = -1
+         # self.backtracking = 0
+         # self.syntax_errors = 0
+         # self.token_start_position = -1
+       end
+
+
+       # restores all of the state variables to their respective
+       # initial default values
+       def reset!
+         self.following.clear
+         self.error_recovery = false
+         self.last_error_index = -1
+         self.backtracking = 0
+         self.rule_memory and rule_memory.clear
+         self.syntax_errors = 0
+         self.token = nil
+         self.token_start_position = -1
+         self.token_start_line = nil
+         self.token_start_column = nil
+         self.channel = nil
+         self.type = nil
+         self.text = nil
+       end
+
+     end
+   end
+
+
+ =begin rdoc ANTLR3::BaseRecognizer
+
+ = BaseRecognizer
+
+ As the base class of all ANTLR-generated recognizers, BaseRecognizer provides
+ much of the shared functionality and structure used in the recognition process.
+ For all practical purposes, the class and its immediate subclasses Lexer,
+ Parser, and TreeParser are abstract classes. They can be instantiated, but
+ they're pretty useless on their own. Instead, to make useful code, you write an
+ ANTLR grammar and ANTLR will generate classes which inherit from one of the
+ recognizer base classes, and those generated classes provide the
+ implementation of the grammar rules themselves. BaseRecognizer
+ defines methods related to:
+
+ * token and character matching
+ * prediction and recognition strategy
+ * recovering from errors
+ * reporting errors
+ * memoization
+ * simple rule tracing and debugging
+
+ =end
+   class BaseRecognizer
+     include Constants
+     include Error
+     include TokenFactory
+     extend ClassMacros
+
+     MEMO_RULE_FAILED = -2
+     MEMO_RULE_UNKNOWN = -1
+     DEFAULT_TOKEN_CHANNEL = DEFAULT_CHANNEL
+     HIDDEN = HIDDEN_CHANNEL
+
+     @rules = {}
+
+     # inherited class methods and hooks
+     class << self
+
+       attr_reader :grammar_file_name,
+                   :antlr_version,
+                   :antlr_version_string,
+                   :grammar_home
+
+       attr_accessor :token_scheme, :default_rule
+
+       # generated recognizer code uses this method to stamp
+       # the code with the name of the grammar file and
+       # the current version of ANTLR being used to generate
+       # the code
+       def generated_using(grammar_file, version_string)
+         @grammar_file_name = grammar_file.freeze
+         @antlr_version_string = version_string.freeze
+         if @antlr_version_string =~ /^(\d+)\.(\d+)(?:\.(\d+)(?:b(\d+))?)?(.*)$/
+           @antlr_version = [$1, $2, $3, $4].map! { |str| str.to_i }
+           timestamp = $5.strip
+           #@antlr_release_time = $5.empty? ? nil : Time.parse($5)
+         else
+           raise "bad version string: %p" % version_string
+         end
+       end
+
+       # this method is used to generate return-value structures for
+       # rules with multiple return values. To avoid generating
+       # a special class for every rule in AST parsers and such
+       # (where most rules have the same default set of return values),
+       # each recognizer gets a default return value structure
+       # assigned to the constant +Return+. Rules which don't
+       # require additional custom members will have a rule-return
+       # name constant that just points to the generic return
+       # value.
+       def define_return_scope(*members)
+         if members.empty? then generic_return_scope
+         else
+           members += return_scope_members
+           Struct.new(*members)
+         end
+       end
+
+       # used as a hook to add additional default members
+       # to default return value structures
+       # For example, all AST-building parsers override
+       # this method to add an extra +:tree+ field to
+       # all rule return structures.
+       def return_scope_members
+         [:start, :stop]
+       end
+
+       # sets up and returns the generic rule return
+       # scope for a recognizer
+       def generic_return_scope
+         @generic_return_scope ||= begin
+           struct = Struct.new(*return_scope_members)
+           const_set(:Return, struct)
+         end
+       end
+
+       def imported_grammars
+         @imported_grammars ||= Set.new
+       end
+
+       def master_grammars
+         @master_grammars ||= []
+       end
+
+       def master
+         master_grammars.last
+       end
+
+       def masters( *grammar_names )
+         for grammar in grammar_names
+           unless master_grammars.include?( grammar )
+             master_grammars << grammar
+             attr_reader( Util.snake_case( grammar ) )
+           end
+         end
+       end
+       private :masters
+
+       def imports( *grammar_names )
+         for grammar in grammar_names
+           imported_grammars.add?(grammar.to_sym) and
+             attr_reader( Util.snake_case( grammar ) )
+         end
+         return imported_grammars
+       end
+       private :imports
+
+       def rules
+         self::RULE_METHODS.dup rescue []
+       end
+
+       def default_rule
+         @default_rule ||= rules.first
+       end
+
+       def debug?
+         return false
+       end
+
+       def token_class
+         @token_class ||= begin
+           self::Token rescue
+           superclass.token_class rescue
+           ANTLR3::CommonToken
+         end
+       end
+       private :generated_using
+     end
+
+     @grammar_file_name = nil
+     @antlr_version = ANTLR3::ANTLR_VERSION
+     @antlr_version_string = ANTLR3::ANTLR_VERSION_STRING
+
+     def grammar_file_name
+       self.class.grammar_file_name
+     end
+
+     def antlr_version
+       self.class.antlr_version
+     end
+
+     def antlr_version_string
+       self.class.antlr_version_string
+     end
+
+     attr_accessor :input
+     attr_reader :state
+
+     def each_delegate
+       block_given? or return enum_for( __method__ )
+       for grammar in self.class.imported_grammars
+         del = __send__( Util.snake_case( grammar ) ) and
+           yield( del )
+       end
+     end
+
+     # Create a new recognizer. The constructor simply ensures that
+     # all recognizers are initialized with a shared state object.
+     # See the main recognizer subclasses for more specific
+     # information about creating recognizer objects like
+     # lexers and parsers.
+     def initialize(options = {})
+       @state = options[:state] || RecognizerSharedState.new
+       @error_output = options.fetch(:error_output, $stderr)
+       defined?(@input) or @input = nil
+       initialize_dfas
+     end
+
+     # Resets the recognizer's state data to initial values.
+     # As a result, all error tracking and error recovery
+     # data accumulated in the current state will be cleared.
+     # It will also attempt to reset the input stream
+     # via input.reset, but it ignores any errors received
+     # from doing so. Thus the input stream is not guaranteed
+     # to be rewound to its initial position
+     def reset
+       @state and @state.reset!
+       @input and @input.reset rescue nil
+     end
+
+     # Attempt to match the current input symbol against the token
+     # type specified by +type+. If the symbol matches the type,
+     # consume the current symbol and return its value. If
+     # the symbol doesn't match, attempt to use the follow-set
+     # data provided by +follow+ to recover from the mismatched
+     # token.
+     def match(type, follow)
+       matched_symbol = current_input_symbol
+       if @input.peek == type
+         @input.consume
+         @state.error_recovery = false
+         return matched_symbol
+       end
+       raise(BacktrackingFailed) if @state.backtracking > 0
+       matched_symbol = recover_from_mismatched_token(type, follow)
+       return matched_symbol
+     end
+
+     # match anything -- i.e. wildcard match. Simply consume
+     # the current symbol from the input stream.
+     def match_any
+       @state.error_recovery = false
+       @input.consume
+     end
+
+     ##############################################################################################
+     ###################################### Error Reporting #######################################
+     ##############################################################################################
+     ##############################################################################################
+
+     # When a recognition error occurs, this method is the main
+     # hook for carrying out the error reporting process. The
+     # default implementation calls +display_recognition_error+
+     # to display the error info on $stderr.
+     def report_error(e = $!)
+       @state.error_recovery and return
+       @state.error_recovery = true
+       display_recognition_error(e)
+     end
+
+     # error reporting hook for presenting the information
+     # The default implementation builds appropriate error
+     # message text using +error_header+ and +error_message+,
+     # and calls +emit_error_message+ to write the error
+     # message out to some source
+     def display_recognition_error(e = $!)
+       header = error_header(e)
+       message = error_message(e)
+       emit_error_message("#{header} #{message}")
+     end
+
+     # used to construct an appropriate error message
+     # based on the specific type of error and the
+     # error's attributes
+     def error_message(e = $!)
+       case e
+       when Error::UnwantedToken
+         token_name = token_name(e.expecting)
+         "extraneous input #{token_error_display(e.unexpected_token)} expecting #{token_name}"
+       when Error::MissingToken
+         token_name = token_name(e.expecting)
+         "missing #{token_name} at #{token_error_display(e.symbol)}"
+       when Error::MismatchedToken
+         token_name = token_name(e.expecting)
+         "mismatched input #{token_error_display(e.symbol)} expecting #{token_name}"
+       when Error::MismatchedTreeNode
+         token_name = token_name(e.expecting)
+         "mismatched tree node: #{e.symbol} expecting #{token_name}"
+       when Error::NoViableAlternative
+         "no viable alternative at input " << token_error_display(e.symbol)
+       when Error::MismatchedSet
+         "mismatched input %s expecting set %s" %
+           [token_error_display(e.symbol), e.expecting.inspect]
+       when Error::MismatchedNotSet
+         "mismatched input %s expecting set %s" %
+           [token_error_display(e.symbol), e.expecting.inspect]
+       when Error::FailedPredicate
+         "rule %s failed predicate: { %s }?" % [e.rule_name, e.predicate_text]
+       else e.message
+       end
+     end
+
+     # used to add a tag to the error message that indicates
+     # the location of the input stream when the error
+     # occurred
+     def error_header(e = $!)
+       e.location
+     end
+
+     # formats a token object appropriately for inspection
+     # within an error message
+     def token_error_display(token)
+       unless text = token.text
+         if token.type == EOF then text = '<EOF>'
+         elsif name = token_name(token.type) rescue false
+           text = "<#{name}>"
+         elsif token.respond_to?(:name) then text = "<#{token.name}>"
+         else text = "<#{token.type}>"
+         end
+       end
+       return text.inspect
+     end
+
+     # Write the error report data out to some source. By default,
+     # the error message is written to $stderr
+     def emit_error_message(message)
+       @error_output.puts(message) if @error_output
+     end
+
+     ##############################################################################################
+     ###################################### Error Recovery ########################################
+     ##############################################################################################
+     def recover(error = $!)
+       @state.last_error_index == @input.index and @input.consume
+       @state.last_error_index = @input.index
+
+       follow_set = compute_error_recovery_set
+
+       resync { consume_until(follow_set) }
+     end
+
+     def resync
+       begin_resync
+       value = yield
+       end_resync
+       return(value)
+     end
+
+     # overridable hook method that is executed at the start of the
+     # resyncing procedure in recover
+     #
+     # by default, it does nothing
+     def begin_resync
+       # do nothing
+     end
+
+     # overridable hook method that is executed after the resyncing procedure has completed
+     #
+     # by default, it does nothing
+     def end_resync
+       # do nothing
+     end
+
+     # (The following explanation has been lifted directly from the
+     # source code documentation of the ANTLR Java runtime library)
+     #
+     # Compute the error recovery set for the current rule. During
+     # rule invocation, the parser pushes the set of tokens that can
+     # follow that rule reference on the stack; this amounts to
+     # computing FIRST of what follows the rule reference in the
+     # enclosing rule. This local follow set only includes tokens
+     # from within the rule; i.e., the FIRST computation done by
+     # ANTLR stops at the end of a rule.
+     #
+     # EXAMPLE
+     #
+     # When you find a "no viable alt exception", the input is not
+     # consistent with any of the alternatives for rule r. The best
+     # thing to do is to consume tokens until you see something that
+     # can legally follow a call to r *or* any rule that called r.
+     # You don't want the exact set of viable next tokens because the
+     # input might just be missing a token--you might consume the
+     # rest of the input looking for one of the missing tokens.
+     #
+     # Consider grammar:
+     #
+     #   a : '[' b ']'
+     #     | '(' b ')'
+     #     ;
+     #   b : c '^' INT ;
+     #   c : ID
+     #     | INT
+     #     ;
+     #
+     # At each rule invocation, the set of tokens that could follow
+     # that rule is pushed on a stack. Here are the various "local"
+     # follow sets:
+     #
+     #   FOLLOW(b1_in_a) = FIRST(']') = ']'
+     #   FOLLOW(b2_in_a) = FIRST(')') = ')'
+     #   FOLLOW(c_in_b) = FIRST('^') = '^'
+     #
+     # Upon erroneous input "[]", the call chain is
+     #
+     #   a -> b -> c
+     #
+     # and, hence, the follow context stack is:
+     #
+     #   depth   local follow set   after call to rule
+     #     0          <EOF>         a (from main())
+     #     1           ']'          b
+     #     2           '^'          c
+     #
+     # Notice that <tt>')'</tt> is not included, because b would have to have
+     # been called from a different context in rule a for ')' to be
+     # included.
+     #
+     # For error recovery, we cannot consider FOLLOW(c)
+     # (context-sensitive or otherwise). We need the combined set of
+     # all context-sensitive FOLLOW sets--the set of all tokens that
+     # could follow any reference in the call chain. We need to
+     # resync to one of those tokens. Note that FOLLOW(c)='^' and if
+     # we resync'd to that token, we'd consume until EOF. We need to
+     # sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
+     # In this case, for input "[]", LA(1) is in this set so we would
+     # not consume anything and after printing an error rule c would
+     # return normally. It would not find the required '^' though.
+     # At this point, it gets a mismatched token error and throws an
+     # exception (since LA(1) is not in the viable following token
+     # set). The rule exception handler tries to recover, but finds
+     # the same recovery set and doesn't consume anything. Rule b
+     # exits normally returning to rule a. Now it finds the ']' (and
+     # with the successful match exits errorRecovery mode).
+     #
+     # So, you can see that the parser walks up the call chain looking
+     # for the token that was a member of the recovery set.
+     #
+     # Errors are not generated in errorRecovery mode.
+     #
+     # ANTLR's error recovery mechanism is based upon original ideas:
+     #
+     # "Algorithms + Data Structures = Programs" by Niklaus Wirth
+     #
+     # and
+     #
+     # "A note on error recovery in recursive descent parsers":
+     # http://portal.acm.org/citation.cfm?id=947902.947905
+     #
+     # Later, Josef Grosch had some good ideas:
+     #
+     # "Efficient and Comfortable Error Recovery in Recursive Descent
+     # Parsers":
+     # ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip
+     #
+     # Like Grosch I implemented local FOLLOW sets that are combined
+     # at run-time upon error to avoid overhead during parsing.
+     def compute_error_recovery_set
+       combine_follows(false)
+     end
+
+     def recover_from_mismatched_token(type, follow)
+       if mismatch_is_unwanted_token?(type)
+         err = UnwantedToken(type)
+
+         begin_resync
+         @input.consume
+         end_resync
+
+         report_error(err)
+
+         matched_symbol = current_input_symbol
+         @input.consume
+         return matched_symbol
+       end
+
+       if mismatch_is_missing_token?(follow)
+         inserted = missing_symbol(err, type, follow)
+         err = MissingToken(type, inserted)
+
+         report_error(err)
+         return inserted
+       end
+
+       err = MismatchedToken(type)
+       raise err
+     end
+
+     def recover_from_mismatched_set(e, follow)
+       if mismatch_is_missing_token?(follow)
+         report_error(e)
+         return missing_symbol(e, INVALID_TOKEN_TYPE, follow)
+       end
+       raise e
+     end
+
+     # Conjure up a missing token during error recovery.
+     #
+     # The recognizer attempts to recover from single missing
+     # symbols. But, actions might refer to that missing symbol.
+     # For example, x=ID {f($x);}. The action clearly assumes
+     # that there has been an identifier matched previously and that
+     # $x points at that token. If that token is missing, but
+     # the next token in the stream is what we want we assume that
+     # this token is missing and we keep going. Because we
+     # have to return some token to replace the missing token,
+     # we have to conjure one up. This method gives the user control
+     # over the tokens returned for missing tokens. Mostly,
+     # you will want to create something special for identifier
+     # tokens. For literals such as '{' and ',', the default
+     # action in the parser or tree parser works. It simply creates
+     # a CommonToken of the appropriate type. The text will be the token.
+     # If you change what tokens must be created by the lexer,
+     # override this method to create the appropriate tokens.
+     def missing_symbol(error, expected_token_type, follow)
+       return nil
+     end
+
+     def recover_from_mismatched_element(e, follow)
+       follow.nil? and return false
+       if follow.include?(EOR_TOKEN_TYPE)
+         viable_tokens = compute_context_sensitive_rule_follow()
+         follow = (follow | viable_tokens) - Set.new([EOR_TOKEN_TYPE])
+       end
+       if follow.include?(@input.peek)
+         report_error(e)
+         return true
+       end
+       return false
+     end
+
+     def mismatch_is_unwanted_token?(type)
+       @input.peek(2) == type
+     end
+
+     def mismatch_is_missing_token?(follow)
+       follow.nil? and return false
+       if follow.include?(EOR_TOKEN_TYPE)
+         viable_tokens = compute_context_sensitive_rule_follow
+         follow = follow | viable_tokens
+
+         follow.delete(EOR_TOKEN_TYPE) unless @state.following.empty?
+       end
+       if follow.include?(@input.peek) or follow.include?(EOR_TOKEN_TYPE)
+         return true
+       end
+       return false
+     end
+
+     # factor out what to do upon token mismatch so
+     # tree parsers can behave differently.
+     #
+     # * override this method in your parser to do things
+     #   like bailing out after the first error
+     # * just raise the exception instead of
+     #   calling the recovery method.
+     #
+     def number_of_syntax_errors
+       @state.syntax_errors
+     end
+
+     # Compute the context-sensitive FOLLOW set for the current rule.
+     # This is the set of token types that can follow a specific rule
+     # reference given a specific call chain. You get the set of
+     # viable tokens that can possibly come next (look depth 1)
+     # given the current call chain. Contrast this with the
+     # definition of plain FOLLOW for rule r:
+     #
+     #   FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)}
+     #
+     # where x in T* and alpha, beta in V*; T is set of terminals and
+     # V is the set of terminals and nonterminals. In other words,
+     # FOLLOW(r) is the set of all tokens that can possibly follow
+     # references to r in *any* sentential form (context). At
+     # runtime, however, we know precisely which context applies as
+     # we have the call chain. We may compute the exact (rather
+     # than covering superset) set of following tokens.
+     #
+     # For example, consider grammar:
+     #
+     #   stat : ID '=' expr ';'      // FOLLOW(stat)=={EOF}
+     #        | "return" expr '.'
+     #        ;
+     #   expr : atom ('+' atom)* ;   // FOLLOW(expr)=={';','.',')'}
+     #   atom : INT                  // FOLLOW(atom)=={'+',')',';','.'}
+     #        | '(' expr ')'
+     #        ;
+     #
+     # The FOLLOW sets are all inclusive whereas context-sensitive
+     # FOLLOW sets are precisely what could follow a rule reference.
+     # For input "i=(3);", here is the derivation:
+     #
+     #   stat => ID '=' expr ';'
+     #        => ID '=' atom ('+' atom)* ';'
+     #        => ID '=' '(' expr ')' ('+' atom)* ';'
+     #        => ID '=' '(' atom ')' ('+' atom)* ';'
+     #        => ID '=' '(' INT ')' ('+' atom)* ';'
+     #        => ID '=' '(' INT ')' ';'
+     #
+     # At the "3" token, you'd have a call chain of
+     #
+     #   stat -> expr -> atom -> expr -> atom
+     #
+     # What can follow that specific nested ref to atom? Exactly ')'
+     # as you can see by looking at the derivation of this specific
+     # input. Contrast this with the FOLLOW(atom)={'+',')',';','.'}.
+     #
+     # You want the exact viable token set when recovering from a
+     # token mismatch. Upon token mismatch, if LA(1) is member of
+     # the viable next token set, then you know there is most likely
+     # a missing token in the input stream. "Insert" one by just not
+     # throwing an exception.
+     def compute_context_sensitive_rule_follow
+       combine_follows(true)
+     end
+
+     def combine_follows(exact)
+       follow_set = Set.new
+       @state.following.each_with_index.reverse_each do |local_follow_set, index|
+         follow_set |= local_follow_set
+         if exact
+           if local_follow_set.include?(EOR_TOKEN_TYPE)
+             follow_set.delete(EOR_TOKEN_TYPE) if index > 0
+           else
+             break
+           end
+         end
+       end
+       return follow_set
+     end
+
+     # Match needs to return the current input symbol, which gets put
+     # into the label for the associated token ref; e.g., x=ID. Token
+     # and tree parsers need to return different objects. Rather than test
+     # for input stream type or change the IntStream interface, I use
+     # a simple method to ask the recognizer to tell me what the current
+     # input symbol is.
+     #
+     # This is ignored for lexers.
+
+     def current_input_symbol
+       return nil
+     end
+
+     # Consume tokens until one matches the given token or token set
+     #
+     # tokenTypes can be a single token type or a set of token types
+     def consume_until(token_types)
+       token_types.is_a?(Set) or token_types = Set.new(token_types.to_a)
+       type = @input.peek
+       until type == EOF or token_types.include?(type)
+         @input.consume
+         type = @input.peek
+       end
+       return(type)
+     end
+
+     def backtracking_level
+       @state.backtracking
+     end
+
+     def backtracking_level=(n)
+       @state.backtracking = n
+     end
+
+     def backtrack
+       @state.backtracking += 1
+       start = @input.mark
+       success =
+         begin yield
+         rescue BacktrackingFailed then false
+         else true
+         end
+       return success
+     ensure
+       @input.rewind(start)
+       @state.backtracking -= 1
+     end
+
+     def syntactic_predicate?(name)
+       backtrack { send(name) }
+     end
+
+     alias backtracking backtracking_level
+     alias backtracking= backtracking_level=
+
+     def rule_memoization(rule, start_index)
+       @state.rule_memory[rule] ||= Hash.new(MEMO_RULE_UNKNOWN)
+       @state.rule_memory[rule][start_index]
+     end
+
+     def already_parsed_rule?(rule)
+       stop_index = rule_memoization(rule, @input.index)
+       case stop_index
+       when MEMO_RULE_UNKNOWN then return false
+       when MEMO_RULE_FAILED then return true
+       else
+         @input.seek(stop_index + 1)
+       end
+       return true
+     end
+
+     def memoize(rule, start_index, success)
+       stop_index = success ? (@input.index - 1) : MEMO_RULE_FAILED
+       memo = @state.rule_memory[rule] and memo[start_index] = stop_index
+     end
+
+     def trace_in(rule_name, rule_index, input_symbol)
+       @error_output.printf("--> enter %s on %s", rule_name, input_symbol)
+       @state.backtracking > 0 and @error_output.printf(
+         " (in backtracking mode: depth = %s)", @state.backtracking
+       )
+       @error_output.print("\n")
+     end
+
+     def trace_out(rule_name, rule_index, input_symbol)
+       @error_output.printf("<-- exit %s on %s", rule_name, input_symbol)
+       @state.backtracking > 0 and @error_output.printf(
+         " (in backtracking mode: depth = %s)", @state.backtracking
+       )
+       @error_output.print("\n")
+     end
+
+     private
+
+     def initialize_dfas
+       # do nothing
+     end
+   end
+
+ =begin rdoc ANTLR3::Lexer
+
+ = Lexer
+
+ Lexer is the default superclass of all lexers generated by ANTLR. The class
+ tailors the core functionality provided by BaseRecognizer to the task of
+ matching patterns in the text input and breaking the input into tokens.
+
+ == About Lexers
+
+ A lexer's job is to take input text and break it up into _tokens_ -- objects
+ that encapsulate a piece of text, a type label (such as ID or INTEGER), and the
+ position of the text with respect to the input. Thus, a lexer is essentially a
+ complicated iterator that steps through an input stream and produces a sequence
+ of tokens. Sometimes lexers are enough to carry out a goal on their own, for
+ tasks like source code highlighting and simple code analysis. Usually, however,
+ the lexer converts text into tokens for use by a parser, which recognizes larger
+ structures within the text.
+
+ ANTLR parsers have a variety of entry points specified by parser rules, each of
+ which defines the structure of a specific type of sentence in a grammar. Lexers,
+ however, are primarily intended to have a single entry point: it looks at the
+ characters starting at the current input position, decides whether the chunk of
+ text matches one of a number of possible token type definitions, wraps the chunk
+ into a token with information on its type and location, and advances the input
+ stream to the next place.
+
+ == ANTLR Lexers and the Lexer API
+
+ ANTLR-generated lexers will subclass this class, unless specified otherwise
+ within a grammar file. The generated class will provide an implementation of
+ each lexer rule as a method of the same name. The subclass will also provide an
+ implementation for the abstract method #token!, the purpose of which is to
+ multiplex the token type definitions and predict what rule definition to execute
+ to fetch a token. The primary method in the lexer API, #next_token, uses
+ #token! to fetch the next token and drive the iteration.
+
+ If the lexer is preparing tokens for use by an ANTLR generated parser, the lexer
+ will generally be used to build a TokenStream object. The following code example
+ demonstrates the typical setup for using ANTLR parsers and lexers in Ruby.
+
+   # in HypotheticalLexer.rb
+   module Hypothetical
+     class Lexer < ANTLR3::Lexer
+       # ...
+       # ANTLR generated code
+       # ...
+     end
+   end
+
+   # in HypotheticalParser.rb
+   module Hypothetical
+     class Parser < ANTLR3::Parser
+       # ...
+       # more ANTLR generated code
+       # ...
+     end
+   end
+
+   # to take hypothetical source code and prepare it for parsing,
+   # there is generally a four-step construction process
+
+   source = "some hypothetical source code"
+   input = ANTLR3::StringStream.new(source, :file => 'blah-de-blah.hyp')
+   lexer = Hypothetical::Lexer.new(input)
+   tokens = ANTLR3::CommonTokenStream.new(lexer)
+   parser = Hypothetical::Parser.new(tokens)
+
+   # if you're using the standard streams, ANTLR3::StringStream and
+   # ANTLR3::CommonTokenStream, you can write the same process
+   # shown above more succinctly:
+
+   lexer = Hypothetical::Lexer.new("some hypothetical source code", :file => 'blah-de-blah.hyp')
+   parser = Hypothetical::Parser.new(lexer)
+
+ =end
+   class Lexer < BaseRecognizer
+     include TokenSource
+     @token_class = CommonToken
+
+     def self.default_rule
+       @default_rule ||= :token!
+     end
+
+     def self.main(argv = ARGV, options = {})
+       if argv.is_a?(::Hash) then argv, options = ARGV, argv end
+       main = ANTLR3::Main::LexerMain.new(self, options)
+       block_given? ? yield(main) : main.execute(argv)
+     end
+
+     def self.associated_parser
+       @grammar_home and @grammar_home::Parser
+     rescue NameError
+       grammar_name = @grammar_home.name.split("::").last
+       begin
+         require "#{grammar_name}Parser"
+       rescue LoadError => e
+         return nil
+       end
+       return @grammar_home::Parser rescue nil
+     end
+
+     def self.associated_parser
+       @associated_parser ||= begin
+         @grammar_home and @grammar_home::Parser
+       rescue NameError
+         grammar_name = @grammar_home.name.split("::").last
+         begin
+           require "#{grammar_name}Parser"
+           @grammar_home::Parser
+         rescue LoadError, NameError
+         end
+       end
+     end
+
+     def initialize(input, options = {})
+       super(options)
+       @input =
+         case input
+         when ::String then StringStream.new(input, options)
+         when ::IO then FileStream.new(input, options)
+         else input
+         end
+     end
+
+     def next_token
+       loop do
+         @state.token = nil
+         @state.channel = DEFAULT_CHANNEL
+         @state.token_start_position = @input.index
+         @state.token_start_column = @input.column
+         @state.token_start_line = @input.line
+         @state.text = nil
+         @input.peek == EOF and return EOF_TOKEN
+         begin
+           token!
+
+           case token = @state.token
+           when nil then return(emit())
+           when SKIP_TOKEN then next
+           else
+             return token
+           end
+         rescue NoViableAlternative => re
+           report_error(re)
+           recover(re)
+         rescue Error::RecognitionError => re
+           report_error(re)
+         end
+       end
+     end
+
+     def skip
+       @state.token = SKIP_TOKEN
+     end
+
+     abstract :token!
+
+     def exhaust
+       self.to_a
+     end
+
+     def char_stream=(input)
+       @input = nil
+       reset()
+       @input = input
+     end
+
+     def source_name
+       @input.source_name
+     end
+
+     def emit(token = nil)
+       token ||= create_token
+       @state.token = token
+       return token
+     end
+
+     def match(expected)
+       case expected
+       when String
+         expected.each_byte do |char|
+           unless @input.peek == char
+             @state.backtracking > 0 and raise BacktrackingFailed
+             error = MismatchedToken(char)
+             recover(error)
+             raise error
+           end
+           @input.consume()
+         end
+       else # single integer character
+         unless @input.peek == expected
+           @state.backtracking > 0 and raise BacktrackingFailed
+           error = MismatchedToken(expected)
+           recover(error)
+           raise error
+         end
+         @input.consume
+       end
+       return true
+     end
+
+     def match_any
+       @input.consume
+     end
+
+     def match_range(min, max)
+       char = @input.peek
+       if char.between?(min, max) then @input.consume
+       else
+         @state.backtracking > 0 and raise BacktrackingFailed
+         error = MismatchedRange(min.chr, max.chr)
+         recover(error)
+         raise(error)
+       end
+       return true
+     end
+
+     def line
+       @input.line
+     end
+
+     def column
+       @input.column
+     end
+
+     def character_index
+       @input.index
+     end
+
+     def text
+       @state.text and return @state.text
+       @input.substring(@state.token_start_position, character_index - 1)
+     end
+
+     def text=(text)
+       @state.text = text
+     end
+
+     def report_error(e)
+       display_recognition_error(e)
+     end
+
+     def error_message(e)
+       char = character_error_display(e.symbol) rescue nil
+       case e
+       when Error::MismatchedToken
+         expecting = character_error_display(e.expecting)
+         "mismatched character #{char}; expecting #{expecting}"
+       when Error::NoViableAlternative
+         "no viable alternative at character #{char}"
+       when Error::EarlyExit
+         "required (...)+ loop did not match anything at character #{char}"
+       when Error::MismatchedNotSet
+         "mismatched character %s; expecting set %p" % [char, e.expecting]
+       when Error::MismatchedSet
+         "mismatched character %s; expecting set %p" % [char, e.expecting]
+       when Error::MismatchedRange
+         a = character_error_display(e.min)
+         b = character_error_display(e.max)
+         "mismatched character %s; expecting set %s..%s" % [char, a, b]
+       else super
+       end
+     end
+
+     def character_error_display(char)
+       case char
+       when EOF then '<EOF>'
+       when Integer then char.chr.inspect
+       else char.inspect
+       end
+     end
+
+     def recover(re)
+       @input.consume
+     end
+
+     private
+
+     def trace_in(rule_name, rule_index)
+       if symbol = @input.look and symbol != EOF then symbol = symbol.inspect
+       else symbol = '<EOF>' end
+       input_symbol = "#{symbol} @ line #{line} / col #{column}"
+       super(rule_name, rule_index, input_symbol)
+     end
+
+     def trace_out(rule_name, rule_index)
+       if symbol = @input.look and symbol != EOF then symbol = symbol.inspect
+       else symbol = '<EOF>' end
+       input_symbol = "#{symbol} @ line #{line} / col #{column}"
+       super(rule_name, rule_index, input_symbol)
+     end
+
+     def create_token(&b)
+       if block_given? then super(&b)
+       else
+         super do |t|
+           t.input = @input
+           t.type = @state.type
+           t.channel = @state.channel
+           t.start = @state.token_start_position
+           t.stop = @input.index - 1
+           t.line = @state.token_start_line
+           t.text = self.text
+           t.column = @state.token_start_column
+         end
+       end
+     end
+   end
+
+
+ =begin rdoc ANTLR3::Parser
+
+ = Parser
+
+ Parser is the default base class of ANTLR-generated parser classes. The class
+ tailors the functionality provided by BaseRecognizer to the task of parsing.
+
+ == About Parsing
+
+ This is just a loose overview of parsing. For considerably more in-depth coverage
+ of the topic, read the ANTLR documentation or check out the ANTLR website
+ (http://www.antlr.org).
+
+ A grammar defines the vocabulary and the sentence structure of a language. While
+ a lexer deals with the basic vocabulary symbols of the language, a parser's
+ primary task is to implement the sentence structure.
+
+ Parsers are set up by providing a stream of tokens, which is usually created by
+ a corresponding lexer. Then, the user requests a specific sentence-structure
+ within the grammar, such as "class_definition" or "xml_node", from the parser.
+ It iterates through the tokens, verifying the syntax of the sentence and
+ performing actions specified by the grammar. It stops when it encounters an
+ error or when it has matched the full sentence according to its defined
+ structure.
+
+ == ANTLR Parsers and the Parser API
+
+ Plain ANTLR-generated parsers directly subclass this class, unless specified
+ otherwise within the grammar options. The generated code will provide a method
+ for each parser rule defined in the ANTLR grammar, as well as any other
+ customized member attributes and methods specified in the source grammar.
+
+ This class does not override much of the functionality in BaseRecognizer, and
+ thus the API closely mirrors BaseRecognizer.
+
+ =end
+   class Parser < BaseRecognizer
+     def self.main(argv = ARGV, options = {})
+       if argv.is_a?(::Hash) then argv, options = ARGV, argv end
+       main = ANTLR3::Main::ParserMain.new(self, options)
+       block_given? ? yield(main) : main.execute(argv)
+     end
+
+     def self.associated_lexer
+       @associated_lexer ||= begin
+         @grammar_home and @grammar_home::Lexer
+       rescue NameError
+         grammar_name = @grammar_home.name.split("::").last
+         begin
+           require "#{grammar_name}Lexer"
+           @grammar_home::Lexer
+         rescue LoadError, NameError
+         end
+       end
+     end
+
+     def initialize(input, options = {})
+       super(options)
+       @input = nil
+       reset
+       input = cast_input( input, options ) unless TokenStream === input
+       @input = input
+     end
+
+     def current_input_symbol
+       @input.look
+     end
+
+     def missing_symbol(error, expected_type, follow)
+       current = @input.look
+       current = @input.look(-1) if current == ANTLR3::EOF_TOKEN
+       t =
+         case
+         when current && current != ANTLR3::EOF_TOKEN then current.clone
+         when @input.token_class then @input.token_class.new
+         else (create_token rescue CommonToken.new)
+         end
+
+       t.type = expected_type
+       name = t.name.gsub(/(^<)|(>$)/,'')
+       t.text = "<missing #{name}>"
+       t.channel = DEFAULT_CHANNEL
+       return(t)
+     end
+
+     def token_stream=(input)
+       @input = nil
+       reset
+       @input = input
+     end
+     alias token_stream input
+
+     def source_name
+       @input.source_name
+     end
+
+     private
+
+     def trace_in(rule_name, rule_index)
+       super(rule_name, rule_index, @input.look.inspect)
+     end
+
+     def trace_out(rule_name, rule_index)
+       super(rule_name, rule_index, @input.look.inspect)
+     end
+
+     def cast_input( input, options )
+       case input
+       when TokenSource then CommonTokenStream.new( input, options )
+       when IO, String
+         if lexer_class = self.class.associated_lexer
+           CommonTokenStream.new( lexer_class.new( input, options ), options )
+         else
+           raise ArgumentError, Util.tidy( <<-END, true )
+           | unable to automatically convert input #{ input.inspect }
+           | to an ANTLR3::TokenStream object as #{ self.class }
+           | does not appear to have an associated lexer class
+           END
+         end
+       else
+         # assume it's a stream if it at least implements peek and consume
+         unless input.respond_to?( :peek ) and input.respond_to?( :consume )
+           raise ArgumentError, Util.tidy(<<-END, true)
+           | #{ self.class } requires a token stream as input, but
+           | #{ input.inspect } was provided
+           END
+         end
+         input
+       end
+     end
+
+   end
+
+ end
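
Taken together, BaseRecognizer's memoization hooks (rule_memoization,
already_parsed_rule?, and memoize) are what generated rule methods call when a
grammar is compiled with the memoize option. A rough sketch of the shape of such
a generated method (the rule name and body are hypothetical; real generated code
tracks considerably more state):

    def expression
      expression_start_index = @input.index
      # bail out early if a prior (possibly backtracking) attempt already
      # parsed this rule at this position -- already_parsed_rule? also
      # seeks past the memoized stop index on a cache hit
      return if @state.backtracking > 0 and already_parsed_rule?( :expression )
      success = false
      begin
        # ... match tokens and invoke sub-rules here ...
        success = true
      rescue ANTLR3::Error::RecognitionError => error
        report_error( error )
        recover( error )
      ensure
        # record where the rule stopped, or that it failed, so a later
        # attempt at the same start index can skip the work entirely
        memoize( :expression, expression_start_index, success ) if @state.rule_memory
      end
    end
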