parser 0.9.alpha → 0.9.alpha1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ac470264d9dbf4e1781557f2e4c6f93050ca1fca
4
- data.tar.gz: 2af0ef8f8dee1d84f91305adbbd6f099f99b5c8f
3
+ metadata.gz: 384df81635da81957880f54cb589109db642c914
4
+ data.tar.gz: 101c991d44683e9ba699a3ec3deca74572fb7a09
5
5
  SHA512:
6
- metadata.gz: cceb3bc547346c33f28c9392971b0d6df75c289c3e5046bebac559bdf3a7334c902efae7d2c02e294d40930589ff27db011ab95afb72ff0f9f82af6851db2e0c
7
- data.tar.gz: 0121d2b3fea37bac9b97db352c1cc55469cfa0ca854f0f913f0d8cb9015740e6799d9180a2a29c6a9715285e8acbc50d5717fafd8759de3966af222fd963a3b4
6
+ metadata.gz: 0724f1d86bbe49d1aa390c5bea5e3e6c859850b59be9ef0b78623f23cbe97cae94f75de5bc961cbdad1aabe6a1bb166e63dcac4103bb211a41c19ae86d8d3624
7
+ data.tar.gz: 1c64825e1d3a58b00d1a7038b599e161025b516435cde204e325061b1800416813f8ef2ab154f5b5dc99a1dc9cb1d11a9dfa33adb1e2e35f756c3fa10a9f5960
data/.gitignore ADDED
@@ -0,0 +1,21 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ *.output
19
+ lib/parser/lexer.rb
20
+ lib/parser/ruby18.rb
21
+ lib/parser/ruby19.rb
data/.travis.yml ADDED
@@ -0,0 +1,9 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.3
4
+ - 2.0.0
5
+ - rbx-19mode
6
+ - jruby-19mode
7
+ matrix:
8
+ allow_failures:
9
+ - rvm: jruby-19mode
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in parser.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,25 @@
1
+ Copyright (c) 2013 Peter Zotov <whitequark@whitequark.org>
2
+
3
+ Parts of the source are derived from ruby_parser:
4
+ Copyright (c) Ryan Davis, seattle.rb
5
+
6
+ MIT License
7
+
8
+ Permission is hereby granted, free of charge, to any person obtaining
9
+ a copy of this software and associated documentation files (the
10
+ "Software"), to deal in the Software without restriction, including
11
+ without limitation the rights to use, copy, modify, merge, publish,
12
+ distribute, sublicense, and/or sell copies of the Software, and to
13
+ permit persons to whom the Software is furnished to do so, subject to
14
+ the following conditions:
15
+
16
+ The above copyright notice and this permission notice shall be
17
+ included in all copies or substantial portions of the Software.
18
+
19
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
23
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
24
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
25
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # Parser
2
+
3
+ [![Build Status](https://travis-ci.org/whitequark/parser.png?branch=master)](https://travis-ci.org/whitequark/parser)
4
+ [![Code Climate](https://codeclimate.com/github/whitequark/parser.png)](https://codeclimate.com/github/whitequark/parser)
5
+
6
+ Parser is a Ruby parser written in pure Ruby.
7
+
8
+ ## Installation
9
+
10
+ $ gem install parser
11
+
12
+ ## Usage
13
+
14
+ TODO: Write usage instructions here
15
+
16
+ ## Acknowledgements
17
+
18
+ The lexer testsuite is derived from [ruby_parser](http://github.com/seattlerb/ruby_parser).
19
+
20
+ The Bison parser rules are derived from [Ruby MRI](http://github.com/ruby/ruby) parse.y.
21
+
22
+ ## Contributing
23
+
24
+ 1. Make sure you have [Ragel 6.8](http://www.complang.org/ragel/) installed
25
+ 2. Fork it
26
+ 3. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 4. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 5. Push to the branch (`git push origin my-new-feature`)
29
+ 6. Create new Pull Request
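The README leaves usage as a TODO at this point. As a rough illustration only (not part of the gem), the lexer added in this release can be driven through the API defined in lib/parser/lexer.rl below (`Parser::Lexer.new(version)`, `#source=`, `#advance`, `#location`), assuming the generated lib/parser/lexer.rb has been built with `rake generate`:

    require 'parser'

    lexer = Parser::Lexer.new(19)      # lex as Ruby 1.9
    lexer.source = "puts 'hello'\n"    # three NUL bytes are appended internally

    loop do
      type, value = lexer.advance      # => [type, value]; type is false at EOF or on error
      break unless type
      p [type, value, lexer.location]  # location is [line, start_column, end_column]
    end

`Parser::Lexer.do(source)` in the lexer source below wraps essentially this loop and pretty-prints each token.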
data/Rakefile CHANGED
@@ -1,192 +1,25 @@
1
- # -*- ruby -*-
1
+ require "bundler/gem_tasks"
2
2
 
3
- require 'rubygems'
4
- require 'hoe'
3
+ task :default => [:generate, :test]
5
4
 
6
- Hoe.plugin :seattlerb
7
- Hoe.plugin :racc
8
- Hoe.plugin :isolate
9
-
10
- Hoe.add_include_dirs "../../sexp_processor/dev/lib"
11
-
12
- Hoe.spec 'parser' do
13
- developer 'Peter Zotov', 'whitequark@whitequark.org'
14
-
15
- dependency 'sexp_processor', '~> 4.1'
16
-
17
- self.racc_flags << " -t" if plugin?(:racc) && ENV["DEBUG"]
18
- end
19
-
20
- file "lib/ruby18_parser.rb" => "lib/ruby18_parser.y"
21
- file "lib/ruby19_parser.rb" => "lib/ruby19_parser.y"
22
-
23
- file "lib/ruby_lexer.rb" => "lib/ruby_lexer.rl" do |t|
24
- sh "ragel -R #{t.prerequisites.first} -o #{t.name}"
25
- end
26
-
27
- task :clean do
28
- rm_rf(Dir["**/*~"] +
29
- Dir["**/*.diff"] +
30
- Dir["coverage.info"] +
31
- Dir["coverage"] +
32
- Dir["lib/*.output"])
33
- end
34
-
35
- def next_num(glob)
36
- num = Dir[glob].max[/\d+/].to_i + 1
37
- end
38
-
39
- desc "Compares PT to RP and deletes all files that match"
40
- task :compare do
41
- files = Dir["unit/**/*.rb"]
42
- puts "Parsing #{files.size} files"
43
- files.each do |file|
44
- puts file
45
- system "./cmp.rb -q #{file} && rm #{file}"
46
- end
47
- system 'find -d unit -type d -empty -exec rmdir {} \;'
48
- end
49
-
50
- desc "Compares PT to RP and stops on first failure"
51
- task :find_bug do
52
- files = Dir["unit/**/*.rb"]
53
- puts "Parsing #{files.size} files"
54
- files.each do |file|
55
- puts file
56
- sh "./cmp.rb -q #{file}"
57
- end
58
- end
59
-
60
- task :sort do
61
- sh 'grepsort "^ +def" lib/ruby_lexer.rb'
62
- sh 'grepsort "^ +def (test|util)" test/test_ruby_lexer.rb'
63
- end
64
-
65
- task :loc do
66
- loc1 = `wc -l ../1.0.0/lib/ruby_lexer.rb`[/\d+/]
67
- flog1 = `flog -s ../1.0.0/lib/ruby_lexer.rb`[/\d+\.\d+/]
68
- loc2 = `cat lib/ruby_lexer.rb lib/ruby_parser_extras.rb | wc -l`[/\d+/]
69
- flog2 = `flog -s lib/ruby_lexer.rb lib/ruby_parser_extras.rb`[/\d+\.\d+/]
70
-
71
- loc1, loc2, flog1, flog2 = loc1.to_i, loc2.to_i, flog1.to_f, flog2.to_f
72
-
73
- puts "1.0.0: loc = #{loc1} flog = #{flog1}"
74
- puts "dev : loc = #{loc2} flog = #{flog2}"
75
- puts "delta: loc = #{loc2-loc1} flog = #{flog2-flog1}"
76
- end
77
-
78
- desc "Validate against all normal files in unit dir"
79
- task :validate do
80
- sh "./cmp.rb unit/*.rb"
81
- end
82
-
83
- def run_and_log cmd, prefix
84
- files = ENV['FILES'] || 'unit/*.rb'
85
- p, x = prefix, "txt"
86
- n = Dir["#{p}.*.#{x}"].map { |s| s[/\d+/].to_i }.max + 1 rescue 1
87
- f = "#{p}.#{n}.#{x}"
88
-
89
- sh "#{cmd} #{Hoe::RUBY_FLAGS} bin/ruby_parse -q -g #{files} &> #{f}"
90
-
91
- puts File.read(f)
92
- end
93
-
94
- desc "Benchmark against all normal files in unit dir"
95
- task :benchmark do
96
- run_and_log "ruby", "benchmark"
97
- end
98
-
99
- desc "Profile against all normal files in unit dir"
100
- task :profile do
101
- run_and_log "zenprofile", "profile"
102
- end
103
-
104
- desc "what was that command again?"
105
- task :huh? do
106
- puts "ruby #{Hoe::RUBY_FLAGS} bin/ruby_parse -q -g ..."
107
- end
108
-
109
- task :irb => [:isolate] do
110
- sh "GEM_HOME=#{Gem.path.first} irb -rubygems -Ilib -rruby_parser;"
111
- end
112
-
113
- def (task(:phony)).timestamp
114
- Time.at 0
115
- end
116
-
117
- task :isolate => :phony
118
-
119
- file "lib/ruby18_parser.rb" => :isolate
120
- file "lib/ruby19_parser.rb" => :isolate
121
-
122
- task :compare18 do
123
- sh "./yack.rb lib/ruby18_parser.output > racc18.txt"
124
- sh "./yack.rb parse18.output > yacc18.txt"
125
- sh "diff -du racc18.txt yacc18.txt || true"
126
- puts
127
- sh "diff -du racc18.txt yacc18.txt | wc -l"
128
- end
129
-
130
- task :compare19 do
131
- sh "./yack.rb lib/ruby19_parser.output > racc19.txt"
132
- sh "./yack.rb parse19.output > yacc19.txt"
133
- sh "diff -du racc19.txt yacc19.txt || true"
134
- puts
135
- sh "diff -du racc19.txt yacc19.txt | wc -l"
136
- end
137
-
138
- task :debug => :isolate do
139
- ENV["V"] ||= "19"
140
- Rake.application[:parser].invoke # this way we can have DEBUG set
141
-
142
- $: << "lib"
143
- require 'ruby_parser'
144
- require 'pp'
145
-
146
- parser = if ENV["V"] == "18" then
147
- Ruby18Parser.new
148
- else
149
- Ruby19Parser.new
150
- end
151
-
152
- time = (ENV["RP_TIMEOUT"] || 10).to_i
153
-
154
- file = ENV["F"] || ENV["FILE"]
155
-
156
- ruby = if file then
157
- File.read(file)
158
- else
159
- file = "env"
160
- ENV["R"] || ENV["RUBY"]
161
- end
162
-
163
- begin
164
- pp parser.process(ruby, file, time)
165
- rescue Racc::ParseError => e
166
- p e
167
- ss = parser.lexer.src
168
- src = ss.string
169
- lines = src[0..ss.pos].split(/\n/)
170
- abort "on #{file}:#{lines.size}"
5
+ task :test do
6
+ $LOAD_PATH << File.expand_path('../lib/', __FILE__)
7
+ Dir["test/test_*.rb"].each do |file|
8
+ load file
171
9
  end
172
10
  end
173
11
 
174
- task :debug_ruby do
175
- file = ENV["F"] || ENV["FILE"]
176
- sh "ruby19 -cwy #{file} 2>&1 | ./yuck.rb"
177
- end
178
-
179
- task :extract => :isolate do
180
- ENV["V"] ||= "19"
181
- Rake.application[:parser].invoke # this way we can have DEBUG set
12
+ desc "Generate the Ragel lexer and Bison parser."
13
+ task :generate => %w(lib/parser/lexer.rb)
14
+ #lib/parser/ruby18.rb
15
+ #lib/parser/ruby19.rb)
182
16
 
183
- file = ENV["F"] || ENV["FILE"]
17
+ task :build => :generate
184
18
 
185
- ruby "-Ilib", "bin/ruby_parse_extract_error", file
19
+ rule '.rb' => '.rl' do |t|
20
+ sh "ragel -R #{t.source} -o #{t.name}"
186
21
  end
187
22
 
188
- task :bugs do
189
- sh "for f in bug*.rb ; do rake19 debug F=$f && rm $f ; done"
23
+ rule '.rb' => '.y' do |t|
24
+ sh "racc #{t.source} -o #{t.name} -O"
190
25
  end
191
-
192
- # vim: syntax=Ruby
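The Hoe-based Rakefile is replaced by plain rake rules: the default task runs `generate` followed by `test`, and the `rule '.rb' => '.rl'` above rebuilds lib/parser/lexer.rb from lib/parser/lexer.rl through Ragel (the ruby18/ruby19 Racc targets are still commented out of the `generate` task). As an illustration only, assuming `ragel` is on the PATH, the `.rl` rule amounts to:

    source = 'lib/parser/lexer.rl'
    target = 'lib/parser/lexer.rb'

    # Rebuild the generated lexer only when the Ragel source is newer,
    # mirroring what the rake file rule does before shelling out.
    if !File.exist?(target) || File.mtime(source) > File.mtime(target)
      system("ragel -R #{source} -o #{target}") or abort 'ragel failed'
    end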
data/lib/parser.rb ADDED
@@ -0,0 +1,4 @@
1
+ module Parser
2
+ require 'parser/static_environment'
3
+ require 'parser/lexer'
4
+ end
data/lib/parser/lexer.rl ADDED
@@ -0,0 +1,1713 @@
1
+ %%machine lex; # % fix highlighting
2
+
3
+ #
4
+ # === BEFORE YOU START ===
5
+ #
6
+ # Remember two things about Ragel scanners:
7
+ #
8
+ # 1) Longest match wins.
9
+ #
10
+ # 2) If two matches have the same length, the first
11
+ # in source code wins.
12
+ #
13
+ # General rules of making Ragel and Bison happy:
14
+ #
15
+ # * `p` (position) and `@te` contain the index of the character
16
+ # they're pointing to ("current"), plus one. `@ts` contains the index
17
+ # of the corresponding character. The code for extracting the matched token is:
18
+ #
19
+ # @source[@ts...@te]
20
+ #
21
+ # * If your input is `foooooooobar` and the rule is:
22
+ #
23
+ # 'f' 'o'+
24
+ #
25
+ # the result will be:
26
+ #
27
+ # foooooooobar
28
+ # ^ ts=0 ^ p=te=9
29
+ #
30
+ # * A Ragel lexer action should not emit more than one token, unless
31
+ # you know what you are doing.
32
+ #
33
+ # * All Ragel commands (fnext, fgoto, ...) end with a semicolon.
34
+ #
35
+ # * If an action emits the token and transitions to another state, use
36
+ # these Ragel commands:
37
+ #
38
+ # emit($whatever)
39
+ # fnext $next_state; fbreak;
40
+ #
41
+ # * If an action does not emit a token:
42
+ #
43
+ # fgoto $next_state;
44
+ #
45
+ # * If an action features lookbehind, i.e. matches characters with the
46
+ # intent of passing them to another action:
47
+ #
48
+ # p = @ts - 1
49
+ # fgoto $next_state;
50
+ #
51
+ # or, if the lookbehind consists of a single character:
52
+ #
53
+ # fhold; fgoto $next_state;
54
+ #
55
+ # * Ragel merges actions. So, if you have `e_lparen = '(' %act` and
56
+ # `c_lparen = '('` and a lexer action `e_lparen | c_lparen`, the result
57
+ # _will_ invoke the action `act`.
58
+ #
59
+ # * EOF is explicit and is matched by `c_eof`. If you want to introspect
60
+ # the state of the lexer, add this rule to the state:
61
+ #
62
+ # c_eof => do_eof;
63
+ #
64
+ # * If you proceed past EOF, the lexer will complain:
65
+ #
66
+ # NoMethodError: undefined method `ord' for nil:NilClass
67
+ #
68
+
69
+ require 'parser/lexer_literal'
70
+ require 'parser/syntax_error'
71
+
72
+ class Parser::Lexer
73
+
74
+ %% write data nofinal;
75
+ # %
76
+
77
+ attr_reader :source
78
+ attr_accessor :static_env
79
+
80
+ attr_reader :location, :comments
81
+
82
+ def initialize(version)
83
+ @version = version
84
+
85
+ reset
86
+ end
87
+
88
+ def reset(reset_state=true)
89
+ if reset_state
90
+ # Unit tests set state prior to resetting lexer.
91
+ @cs = self.class.lex_en_line_begin
92
+ end
93
+
94
+ # Ragel-internal variables:
95
+ @p = 0 # stream position (saved manually in #advance)
96
+ @ts = nil # token start
97
+ @te = nil # token end
98
+ @act = 0 # next action
99
+
100
+ @stack = [] # state stack
101
+ @top = 0 # state stack top pointer
102
+
103
+ @token_queue = []
104
+ @literal_stack = []
105
+
106
+ @newlines = [0] # sorted set of \n positions
107
+ @newline_s = nil # location of last encountered newline
108
+ @location = nil # location of last #advance'd token
109
+
110
+ @comments = "" # collected comments
111
+
112
+ @num_base = nil # last numeric base
113
+ @num_digits_s = nil # starting position of numeric digits
114
+
115
+ @escape_s = nil # starting position of current sequence
116
+ @escape = nil # last escaped sequence, as string
117
+
118
+ # See below the section on parsing heredocs.
119
+ @heredoc_e = nil
120
+ @herebody_s = nil
121
+
122
+ # Ruby 1.9 ->() lambdas emit a distinct token if do/{ is
123
+ # encountered after a matching closing parenthesis.
124
+ @paren_nest = 0
125
+ @lambda_stack = []
126
+ end
127
+
128
+ def source=(source)
129
+ # Heredoc processing coupled with weird newline quirks
130
+ # requires three '\0' (EOF) chars to be appended; after
131
+ # `p = @heredoc_s`, if `p` points at EOF, the FSM could
132
+ # not bail out early enough and will crash.
133
+ #
134
+ # Patches accepted.
135
+ #
136
+ @source = source.gsub(/\r\n/, "\n") + "\0\0\0"
137
+ end
138
+
139
+ LEX_STATES = {
140
+ :line_begin => lex_en_line_begin,
141
+ :expr_beg => lex_en_expr_beg,
142
+ :expr_value => lex_en_expr_value,
143
+ :expr_mid => lex_en_expr_mid,
144
+ :expr_dot => lex_en_expr_dot,
145
+ :expr_fname => lex_en_expr_fname,
146
+ :expr_end => lex_en_expr_end,
147
+ :expr_arg => lex_en_expr_arg,
148
+ :expr_endarg => lex_en_expr_endarg,
149
+ }
150
+
151
+ def state
152
+ LEX_STATES.invert.fetch(@cs, @cs)
153
+ end
154
+
155
+ def state=(state)
156
+ @cs = LEX_STATES.fetch(state)
157
+ end
158
+
159
+ # Return next token: [type, value].
160
+ def advance
161
+ if @token_queue.any?
162
+ return with_location(@token_queue.shift)
163
+ end
164
+
165
+ # Ugly, but dependent on Ragel output. Consider refactoring it somehow.
166
+ _lex_trans_keys = self.class.send :_lex_trans_keys
167
+ _lex_actions = self.class.send :_lex_actions
168
+ _lex_key_offsets = self.class.send :_lex_key_offsets
169
+ _lex_index_offsets = self.class.send :_lex_index_offsets
170
+ _lex_single_lengths = self.class.send :_lex_single_lengths
171
+ _lex_range_lengths = self.class.send :_lex_range_lengths
172
+ _lex_indicies = self.class.send :_lex_indicies
173
+ _lex_trans_targs = self.class.send :_lex_trans_targs
174
+ _lex_trans_actions = self.class.send :_lex_trans_actions
175
+ _lex_to_state_actions = self.class.send :_lex_to_state_actions
176
+ _lex_from_state_actions = self.class.send :_lex_from_state_actions
177
+
178
+ p, pe, eof = @p, @source.length + 1, nil
179
+
180
+ %% write exec;
181
+ # %
182
+
183
+ @p = p
184
+
185
+ if @token_queue.any?
186
+ with_location(@token_queue.shift)
187
+ elsif @cs == self.class.lex_error
188
+ with_location([ false, '$undefined', p, p + 1 ])
189
+ else
190
+ with_location([ false, '$end', p, p + 1 ])
191
+ end
192
+ end
193
+
194
+ # Like #advance, but also pretty-print the token and its position
195
+ # in the stream to `stdout`.
196
+ def advance_and_decorate
197
+ type, val = advance
198
+
199
+ puts decorate(location, "\e[0;32m#{type} #{val.inspect}\e[0m")
200
+
201
+ [type, val]
202
+ end
203
+
204
+ # Return the current collected comment block and clear the storage.
205
+ def clear_comments
206
+ comments = @comments
207
+ @comments = ""
208
+
209
+ comments
210
+ end
211
+
212
+ # Lex `source` for the Ruby version `version` with initial state `state`.
213
+ #
214
+ # The tokens displayed by this function are not the same as tokens
215
+ # consumed by parser, because the parser manipulates lexer state on
216
+ # its own.
217
+ def self.do(source, state=nil, version=19)
218
+ lex = new(version)
219
+ lex.source = source
220
+ lex.state = state if state
221
+
222
+ loop do
223
+ type, val = lex.advance_and_decorate
224
+ break if !type
225
+ end
226
+
227
+ puts "Lex state: #{lex.state}"
228
+ end
229
+
230
+ # Used by LexerLiteral to emit tokens for string content.
231
+ def emit(type, value = tok, s = @ts, e = @te)
232
+ if s.nil? || e.nil?
233
+ raise "broken #emit invocation in #{caller[0]}"
234
+ end
235
+
236
+ @token_queue << [ type, value, s, e ]
237
+ end
238
+
239
+ def emit_table(table, s = @ts, e = @te)
240
+ token = tok(s, e)
241
+ emit(table[token], token, s, e)
242
+ end
243
+
244
+ # shim
245
+ def lineno
246
+ @location[0] + 1
247
+ end
248
+
249
+ protected
250
+
251
+ def eof_char?(char)
252
+ [0x04, 0x1a, 0x00].include? char.ord
253
+ end
254
+
255
+ def ruby18?
256
+ @version == 18
257
+ end
258
+
259
+ def ruby19?
260
+ @version == 19
261
+ end
262
+
263
+ def tok(s = @ts, e = @te)
264
+ @source[s...e]
265
+ end
266
+
267
+ def record_newline(p)
268
+ @newlines = (@newlines + [p]).uniq.sort
269
+ end
270
+
271
+ def dissect_location(start, finish)
272
+ line_number = @newlines.rindex { |nl| start >= nl }
273
+ line_first_col = @newlines[line_number]
274
+
275
+ start_col = start - line_first_col
276
+ finish_col = finish - line_first_col
277
+
278
+ [ line_number, start_col, finish_col ]
279
+ end
280
+
281
+ def with_location(item)
282
+ type, value, start, finish = *item
283
+
284
+ @location = dissect_location(start, finish)
285
+
286
+ [ type, value ]
287
+ end
288
+
289
+ def decorate(location, message="")
290
+ line_number, from, to = location
291
+
292
+ line = @source.lines.drop(line_number).first
293
+ line[from...to] = "\e[4m#{line[from...to]}\e[0m"
294
+
295
+ tail_len = to - from - 1
296
+ tail = "~" * (tail_len >= 0 ? tail_len : 0)
297
+ decoration = "#{" " * from}\e[1;31m^#{tail}\e[0m #{message}"
298
+
299
+ [ line, decoration ]
300
+ end
301
+
302
+ def warning(message, start = @ts, finish = @te)
303
+ $stderr.puts "warning: #{message}"
304
+ $stderr.puts decorate(dissect_location(start, finish))
305
+ end
306
+
307
+ def error(message)
308
+ raise Parser::SyntaxError, message
309
+ end
310
+
311
+ #
312
+ # === LITERAL STACK ===
313
+ #
314
+
315
+ def push_literal(*args)
316
+ new_literal = Parser::LexerLiteral.new(self, *args)
317
+ @literal_stack.push(new_literal)
318
+
319
+ if new_literal.type == :tWORDS_BEG
320
+ self.class.lex_en_interp_words
321
+ elsif new_literal.type == :tQWORDS_BEG
322
+ self.class.lex_en_plain_words
323
+ elsif new_literal.interpolate?
324
+ self.class.lex_en_interp_string
325
+ else
326
+ self.class.lex_en_plain_string
327
+ end
328
+ end
329
+
330
+ def literal
331
+ @literal_stack[-1]
332
+ end
333
+
334
+ def pop_literal
335
+ old_literal = @literal_stack.pop
336
+
337
+ if old_literal.type == :tREGEXP_BEG
338
+ # Fetch modifiers.
339
+ self.class.lex_en_regexp_modifiers
340
+ else
341
+ self.class.lex_en_expr_end
342
+ end
343
+ end
344
+
345
+ # Mapping of strings to parser tokens.
346
+
347
+ PUNCTUATION = {
348
+ '=' => :tEQL, '&' => :tAMPER2, '|' => :tPIPE,
349
+ '!' => :tBANG, '^' => :tCARET, '+' => :tPLUS,
350
+ '-' => :tMINUS, '*' => :tSTAR2, '/' => :tDIVIDE,
351
+ '%' => :tPERCENT, '~' => :tTILDE, ',' => :tCOMMA,
352
+ ';' => :tSEMI, '.' => :tDOT, '..' => :tDOT2,
353
+ '...' => :tDOT3, '[' => :tLBRACK2, ']' => :tRBRACK,
354
+ '(' => :tLPAREN2, ')' => :tRPAREN, '?' => :tEH,
355
+ ':' => :tCOLON, '&&' => :tANDOP, '||' => :tOROP,
356
+ '-@' => :tUMINUS, '+@' => :tUPLUS, '~@' => :tTILDE,
357
+ '**' => :tPOW, '->' => :tLAMBDA, '=~' => :tMATCH,
358
+ '!~' => :tNMATCH, '==' => :tEQ, '!=' => :tNEQ,
359
+ '>' => :tGT, '>>' => :tRSHFT, '>=' => :tGEQ,
360
+ '<' => :tLT, '<<' => :tLSHFT, '<=' => :tLEQ,
361
+ '=>' => :tASSOC, '::' => :tCOLON2, '===' => :tEQQ,
362
+ '<=>' => :tCMP, '[]' => :tAREF, '[]=' => :tASET,
363
+ '{' => :tLCURLY, '}' => :tRCURLY, '`' => :tBACK_REF2,
364
+ 'do' => :kDO
365
+ }
366
+
367
+ PUNCTUATION_BEGIN = {
368
+ '&' => :tAMPER, '*' => :tSTAR, '+' => :tUPLUS,
369
+ '-' => :tUMINUS, '::' => :tCOLON3, '(' => :tLPAREN,
370
+ '{' => :tLBRACE, '[' => :tLBRACK,
371
+ }
372
+
373
+ KEYWORDS = {
374
+ 'if' => :kIF_MOD, 'unless' => :kUNLESS_MOD,
375
+ 'while' => :kWHILE_MOD, 'until' => :kUNTIL_MOD,
376
+ 'rescue' => :kRESCUE_MOD, 'defined?' => :kDEFINED,
377
+ 'BEGIN' => :klBEGIN, 'END' => :klEND,
378
+ }
379
+
380
+ %w(class module def undef begin end then elsif else ensure case when
381
+ for break next redo retry in do return yield super self nil true
382
+ false and or not alias __FILE__ __LINE__ __ENCODING__).each do |keyword|
383
+ KEYWORDS[keyword] = :"k#{keyword.upcase}"
384
+ end
385
+
386
+ KEYWORDS_BEGIN = {
387
+ 'if' => :kIF, 'unless' => :kUNLESS,
388
+ 'while' => :kWHILE, 'until' => :kUNTIL,
389
+ 'rescue' => :kRESCUE
390
+ }
391
+
392
+ %%{
393
+ # %
394
+
395
+ access @;
396
+ getkey @source[p].ord;
397
+
398
+ # === CHARACTER CLASSES ===
399
+ #
400
+ # Pay close attention to the differences between c_any and any.
401
+ # c_any does not include EOF and so will cause incorrect behavior
402
+ # for machine subtraction (any-except rules) and default transitions
403
+ # for scanners.
404
+
405
+ action do_nl {
406
+ # Record position of a newline for precise line and column reporting.
407
+ #
408
+ # This action is embedded directly into c_nl, as it is idempotent and
409
+ # there are no cases when we need to skip it.
410
+ record_newline(p + 1)
411
+ @newline_s = p
412
+ }
413
+
414
+ c_nl = '\n' $ do_nl;
415
+ c_space = [ \t\r\f\v];
416
+ c_space_nl = c_space | c_nl;
417
+ c_eof = 0x04 | 0x1a | 0; # ^D, ^Z, EOF
418
+ c_eol = c_nl | c_eof;
419
+ c_any = any - c_eof - zlen;
420
+ c_line = c_any - c_nl;
421
+
422
+ c_unicode = c_any - 0x00..0x7f;
423
+ c_lower = [a-z_] | c_unicode;
424
+ c_upper = [A-Z] | c_unicode;
425
+ c_alpha = c_lower | c_upper;
426
+ c_alnum = c_alpha | [0-9];
427
+
428
+ action do_eof {
429
+ # Sit at EOF indefinitely. #advance would return $eof each time.
430
+ # This allows feeding the lexer more data if needed; this is only used
431
+ # in tests.
432
+ #
433
+ # Note that this action is not embedded into e_eof like e_nl and e_bs
434
+ # below. This is due to the fact that scanner state at EOF is observed
435
+ # by tests, and encapsulating it in a rule would break the introspection.
436
+ fhold; fbreak;
437
+ }
438
+
439
+ #
440
+ # === TOKEN DEFINITIONS ===
441
+ #
442
+
443
+ # All operators are punctuation. There is more to punctuation
444
+ # than just operators. Operators can be overridden by the user;
445
+ # punctuation can not.
446
+
447
+ # A list of operators which are valid in the function name context, but
448
+ # have different semantics in others.
449
+ operator_fname = '[]' | '[]=' | '`' | '-@' | '+@' | '~@' ;
450
+
451
+ # A list of operators which can occur within an assignment shortcut (+ → +=).
452
+ operator_arithmetic = '&' | '|' | '&&' | '||' | '^' | '+' | '-' |
453
+ '*' | '/' | '**' | '~' | '**' | '<<' | '>>' |
454
+ '%' ;
455
+
456
+ # A list of all user-definable operators not covered by groups above.
457
+ operator_rest = '=~' | '!~' | '==' | '!=' | '!' | '===' |
458
+ '<' | '<=' | '>' | '>=' | '<=>' | '=>' ;
459
+
460
+ # Note that `{` and `}` need to be referred to as e_lbrace and e_rbrace,
461
+ # as they are ambiguous with interpolation `#{}` and should be counted.
462
+ # These braces are not present in punctuation lists.
463
+
464
+ # A list of punctuation which has a different meaning when used at the
465
+ # beginning of an expression.
466
+ punctuation_begin = '-' | '+' | '::' | '(' | '[' | '*' | '&' ;
467
+
468
+ # A list of all punctuation except punctuation_begin.
469
+ punctuation_end = ',' | '=' | '->' | '(' | '[' | ']' |
470
+ '::' | '?' | ':' | '.' | '..' | '...' ;
471
+
472
+ # A list of keywords which have a different meaning at the beginning of an expression.
473
+ keyword_modifier = 'if' | 'unless' | 'while' | 'until' | 'rescue' ;
474
+
475
+ # A list of keywords which accept an argument-like expression, i.e. have the
476
+ # same post-processing as method calls or commands. Example: `yield 1`,
477
+ # `yield (1)`, and `yield(1)` are interpreted as if `yield` were a function.
478
+ keyword_with_arg = 'yield' | 'super' | 'not' | 'defined?' ;
479
+
480
+ # A list of keywords which accept a literal function name as an argument.
481
+ keyword_with_fname = 'def' | 'undef' | 'alias' ;
482
+
483
+ # A list of keywords which accept an expression after them.
484
+ keyword_with_value = 'else' | 'case' | 'ensure' | 'module' | 'elsif' | 'then' |
485
+ 'for' | 'in' | 'do' | 'when' | 'begin' | 'class' |
486
+ 'and' | 'or' ;
487
+
488
+ # A list of keywords which accept a value, and treat the keywords from
489
+ # `keyword_modifier` list as modifiers.
490
+ keyword_with_mid = 'rescue' | 'return' | 'break' | 'next' ;
491
+
492
+ # A list of keywords which do not accept an expression after them.
493
+ keyword_with_end = 'end' | 'self' | 'true' | 'false' | 'retry' |
494
+ 'redo' | 'nil' | 'BEGIN' | 'END' | '__FILE__' |
495
+ '__LINE__' | '__ENCODING__';
496
+
497
+ # All keywords.
498
+ keyword = keyword_with_value | keyword_with_mid |
499
+ keyword_with_end | keyword_with_arg |
500
+ keyword_with_fname | keyword_modifier ;
501
+
502
+ constant = [A-Z] c_alnum*;
503
+ bareword = c_alpha c_alnum*;
504
+
505
+ call_or_var = c_lower c_alnum*;
506
+ class_var = '@@' bareword;
507
+ instance_var = '@' bareword;
508
+ global_var = '$'
509
+ ( bareword | digit+
510
+ | [`'+~*$&?!@/\\;,.=:<>"] # `
511
+ | '-' [A-Za-z0-9_]?
512
+ )
513
+ ;
514
+
515
+ # Ruby accepts (and fails on) variables with leading digit
516
+ # in literal context, but not in unquoted symbol body.
517
+ class_var_v = '@@' [0-9]? bareword;
518
+ instance_var_v = '@' [0-9]? bareword;
519
+
520
+ #
521
+ # === ESCAPE SEQUENCE PARSING ===
522
+ #
523
+
524
+ # Escape parsing code is a Ragel pattern, not a scanner, and therefore
525
+ # it shouldn't directly raise errors or perform other actions with side effects.
526
+ # In reality this would probably just mess up error reporting in pathological
527
+ # cases, though.
528
+
529
+ # The amount of code required to parse \M\C stuff correctly is ridiculous.
530
+
531
+ escaped_nl = "\\" c_nl;
532
+
533
+ action unicode_points {
534
+ @escape = ""
535
+
536
+ codepoints = tok(@escape_s + 2, p - 1)
537
+ codepoints.split(/[ \t]/).each do |codepoint_str|
538
+ codepoint = codepoint_str.to_i(16)
539
+
540
+ if codepoint >= 0x110000
541
+ @escape = lambda { error "invalid Unicode codepoint (too large)" }
542
+ break
543
+ end
544
+
545
+ @escape += codepoint.chr(Encoding::UTF_8)
546
+ end
547
+ }
548
+
549
+ action unescape_char {
550
+ @escape = {
551
+ 'a' => "\a", 'b' => "\b", 'e' => "\e", 'f' => "\f",
552
+ 'n' => "\n", 'r' => "\r", 's' => "\s", 't' => "\t",
553
+ 'v' => "\v", '\\' => "\\"
554
+ }.fetch(@source[p - 1], @source[p - 1])
555
+ }
556
+
557
+ action invalid_complex_escape {
558
+ @escape = lambda { error "invalid escape character syntax" }
559
+ }
560
+
561
+ action slash_c_char {
562
+ @escape = (@escape.ord & 0x9f).chr
563
+ }
564
+
565
+ action slash_m_char {
566
+ @escape = (@escape.ord | 0x80).chr
567
+ }
568
+
569
+ maybe_escaped_char = (
570
+ '\\' c_any %unescape_char
571
+ | ( c_any - [\\] ) % { @escape = @source[p - 1] }
572
+ );
573
+
574
+ maybe_escaped_ctrl_char = ( # why?!
575
+ '\\' c_any %unescape_char %slash_c_char
576
+ | '?' % { @escape = "\x7f" }
577
+ | ( c_any - [\\?] ) % { @escape = @source[p - 1] } %slash_c_char
578
+ );
579
+
580
+ escape = (
581
+ # \377
582
+ [0-7]{1,3}
583
+ % { @escape = tok(@escape_s, p).to_i(8).chr }
584
+
585
+ # \xff
586
+ | ( 'x' xdigit{1,2}
587
+ % { @escape = tok(@escape_s + 1, p).to_i(16).chr }
588
+ # \u263a
589
+ | 'u' xdigit{4}
590
+ % { @escape = tok(@escape_s + 1, p).to_i(16).chr(Encoding::UTF_8) }
591
+ )
592
+
593
+ # %q[\x]
594
+ | 'x' ( c_any - xdigit )
595
+ % { @escape = lambda { error "invalid hex escape" } }
596
+
597
+ # %q[\u123] %q[\u{12]
598
+ | 'u' ( c_any{0,4} -
599
+ xdigit{4} - # \u1234 is valid
600
+ ( '{' xdigit{1,3} # \u{1 \u{12 \u{123 are valid
601
+ | '{' xdigit [ \t}] # \u{1. \u{1} are valid
602
+ | '{' xdigit{2} [ \t}] # \u{12. \u{12} are valid
603
+ )
604
+ )
605
+ % { @escape = lambda { error "invalid Unicode escape" } }
606
+
607
+ # \u{123 456}
608
+ | 'u{' ( xdigit{1,6} [ \t] )*
609
+ ( xdigit{1,6} '}'
610
+ %unicode_points
611
+ | ( xdigit* ( c_any - xdigit - '}' )+ '}'
612
+ | ( c_any - '}' )* c_eof
613
+ | xdigit{7,}
614
+ ) % { @escape = lambda { error "unterminated Unicode escape" } }
615
+ )
616
+
617
+ # \C-\a \cx
618
+ | ( 'C-' | 'c' ) escaped_nl?
619
+ maybe_escaped_ctrl_char
620
+
621
+ # \M-a
622
+ | 'M-' escaped_nl?
623
+ maybe_escaped_char
624
+ %slash_m_char
625
+
626
+ # \C-\M-f \M-\cf \c\M-f
627
+ | ( ( 'C-' | 'c' ) escaped_nl? '\\M-'
628
+ | 'M-\\' escaped_nl? ( 'C-' | 'c' ) ) escaped_nl?
629
+ maybe_escaped_ctrl_char
630
+ %slash_m_char
631
+
632
+ | 'C' c_any %invalid_complex_escape
633
+ | 'M' c_any %invalid_complex_escape
634
+ | ( 'M-\\C' | 'C-\\M' | 'cM' ) c_any %invalid_complex_escape
635
+
636
+ | ( c_any - [0-7xuCMc] ) %unescape_char
637
+
638
+ | c_eof % { error "escape sequence meets end of file" }
639
+ );
640
+
641
+ # Use rules in form of `e_bs escape' when you need to parse a sequence.
642
+ e_bs = '\\' % {
643
+ @escape_s = p
644
+ @escape = nil
645
+ };
646
+
647
+ #
648
+ # === STRING AND HEREDOC PARSING ===
649
+ #
650
+
651
+ # Heredoc parsing is quite a complex topic. First, consider that heredocs
652
+ # can be arbitrarily nested. For example:
653
+ #
654
+ # puts <<CODE
655
+ # the result is: #{<<RESULT.inspect
656
+ # i am a heredoc
657
+ # RESULT
658
+ # }
659
+ # CODE
660
+ #
661
+ # which, incidentally, evaluates to:
662
+ #
663
+ # the result is: " i am a heredoc\n"
664
+ #
665
+ # To parse them, the lexer refers to two kinds (remember, nested heredocs)
666
+ # of positions in the input stream, namely @heredoc_e
667
+ # (HEREDOC declaration End) and @herebody_s (HEREdoc BODY line Start).
668
+ #
669
+ # @heredoc_e is simply contained inside the corresponding LexerLiteral, and
670
+ # when the heredoc is closed, the lexing is restarted from that position.
671
+ #
672
+ # @herebody_s is rather more complex. First, @herebody_s changes after each
673
+ # heredoc line is lexed. This way, at '\n' tok(@herebody_s, @te) always
674
+ # contains the current line, and also when a heredoc is started, @herebody_s
675
+ # contains the position from which the heredoc will be lexed.
676
+ #
677
+ # Second, as (insanity) there are nested heredocs, we need to maintain a
678
+ # stack of these positions. Each time #push_literal is called, it saves the current
679
+ # @herebody_s to literal.saved_herebody_s, and after an interpolation (possibly
680
+ # containing other heredocs) is closed, the previous value is restored.
681
+
682
+ e_heredoc_nl = c_nl $ {
683
+ # After every heredoc was parsed, @herebody_s contains the
684
+ # position of next token after all heredocs.
685
+ if @herebody_s
686
+ p = @herebody_s
687
+ @herebody_s = nil
688
+ end
689
+ };
690
+
691
+ action extend_string {
692
+ if literal.nest_and_try_closing tok, @ts, @te
693
+ fgoto *pop_literal;
694
+ else
695
+ literal.extend_string tok, @ts, @te
696
+ end
697
+ }
698
+
699
+ action extend_string_escaped {
700
+ if literal.nest_and_try_closing('\\', @ts, @ts + 1)
701
+ # If the literal is actually closed by the backslash,
702
+ # rewind the input prior to consuming the escape sequence.
703
+ p = @escape_s - 1
704
+ fgoto *pop_literal;
705
+ else
706
+ # Get the first character after the backslash.
707
+ escaped_char = @source[@escape_s]
708
+
709
+ if literal.munge_escape? escaped_char
710
+ # If this particular literal uses this character as an opening
711
+ # or closing delimiter, it is an escape sequence for that
712
+ # particular character. Write it without the backslash.
713
+
714
+ if literal.regexp?
715
+ # Regular expressions should have every escape sequence in its
716
+ # raw form.
717
+ literal.extend_string(tok, @ts, @te)
718
+ else
719
+ literal.extend_string(escaped_char, @ts, @te)
720
+ end
721
+ else
722
+ # It does not. So this is an actual escape sequence, yay!
723
+ # Two things to consider here.
724
+ #
725
+ # 1. The `escape' rule should be pure and so won't raise any
726
+ # errors by itself. Instead, it stores them in lambdas.
727
+ #
728
+ # 2. Non-interpolated literals do not go through the aforementioned
729
+ # rule. As \\ and \' (and variants) are munged, the full token
730
+ # should always be written for such literals.
731
+
732
+ @escape.call if @escape.respond_to? :call
733
+
734
+ if literal.regexp?
735
+ # Ditto. Also, expand escaped newlines.
736
+ literal.extend_string(tok.gsub("\\\n", ''), @ts, @te)
737
+ else
738
+ literal.extend_string(@escape || tok, @ts, @te)
739
+ end
740
+ end
741
+ end
742
+ }
743
+
744
+ # Extend a string with a newline or an EOF character.
745
+ # As the heredoc closing line can immediately precede EOF, this action
746
+ # has to handle such case specially.
747
+ action extend_string_eol {
748
+ is_eof = eof_char? @source[p]
749
+
750
+ if literal.heredoc?
751
+ # Try ending the heredoc with the complete most recently
752
+ # scanned line. @herebody_s always refers to the start of such line.
753
+ if literal.nest_and_try_closing(tok(@herebody_s, @te - 1),
754
+ @herebody_s, @te - 1)
755
+ # Adjust @herebody_s to point to the next line.
756
+ @herebody_s = @te
757
+
758
+ # Continue regular lexing after the heredoc reference (<<END).
759
+ p = literal.heredoc_e - 1
760
+ fgoto *pop_literal;
761
+ else
762
+ # Ditto.
763
+ @herebody_s = @te
764
+ end
765
+ end
766
+
767
+ if is_eof
768
+ error "unterminated string meets end of file"
769
+ end
770
+
771
+ # A literal newline is appended if the heredoc was _not_ closed
772
+ # this time. See also LexerLiteral#nest_and_try_closing for rationale of
773
+ # calling #flush_string here.
774
+ literal.extend_string tok, @ts, @te
775
+ literal.flush_string
776
+ }
777
+
778
+ #
779
+ # === INTERPOLATION PARSING ===
780
+ #
781
+
782
+ # Interpolations with immediate variable names simply call into
783
+ # the corresponding machine.
784
+
785
+ interp_var =
786
+ '#' ( global_var | class_var_v | instance_var_v );
787
+
788
+ action extend_interp_var {
789
+ literal.flush_string
790
+ emit(:tSTRING_DVAR, nil, @ts, @ts + 1)
791
+
792
+ p = @ts
793
+ fcall expr_variable;
794
+ }
795
+
796
+ # Interpolations with code blocks must match nested curly braces, as
797
+ # interpolation ending is ambiguous with a block ending. So, every
798
+ # opening and closing brace should be matched with e_[lr]brace rules,
799
+ # which automatically perform the counting.
800
+ #
801
+ # Note that interpolations can themselves be nested, so brace balance
802
+ # is tied to the innermost literal.
803
+ #
804
+ # Also note that literals themselves should not use e_[lr]brace rules
805
+ # when matching their opening and closing delimiters, as the amount of
806
+ # braces inside the characters of a string literal is independent.
807
+
808
+ interp_code = '#{';
809
+
810
+ e_lbrace = '{' % {
811
+ if literal
812
+ literal.start_interp_brace
813
+ end
814
+ };
815
+
816
+ e_rbrace = '}' % {
817
+ if literal
818
+ if literal.end_interp_brace_and_try_closing
819
+ emit(:tRCURLY, '}')
820
+
821
+ if literal.words?
822
+ emit(:tSPACE, nil)
823
+ end
824
+
825
+ if literal.saved_herebody_s
826
+ @herebody_s = literal.saved_herebody_s
827
+ end
828
+
829
+ fhold;
830
+ fnext *@stack.pop;
831
+ fbreak;
832
+ end
833
+ end
834
+ };
835
+
836
+ action extend_interp_code {
837
+ literal.flush_string
838
+ emit(:tSTRING_DBEG, '#{')
839
+
840
+ literal.saved_herebody_s = @herebody_s
841
+ @herebody_s = nil
842
+
843
+ literal.start_interp_brace
844
+ fcall expr_beg;
845
+ }
846
+
847
+ # Actual string parsers are simply combined from the primitives defined
848
+ # above.
849
+
850
+ interp_words := |*
851
+ interp_code => extend_interp_code;
852
+ interp_var => extend_interp_var;
853
+ e_bs escape => extend_string_escaped;
854
+ c_space_nl => { literal.flush_string };
855
+ c_eol => extend_string_eol;
856
+ c_any => extend_string;
857
+ *|;
858
+
859
+ interp_string := |*
860
+ interp_code => extend_interp_code;
861
+ interp_var => extend_interp_var;
862
+ e_bs escape => extend_string_escaped;
863
+ c_eol => extend_string_eol;
864
+ c_any => extend_string;
865
+ *|;
866
+
867
+ plain_words := |*
868
+ e_bs c_any => extend_string_escaped;
869
+ c_space_nl => { literal.flush_string };
870
+ c_eol => extend_string_eol;
871
+ c_any => extend_string;
872
+ *|;
873
+
874
+ plain_string := |*
875
+ e_bs c_any => extend_string_escaped;
876
+ c_eol => extend_string_eol;
877
+ c_any => extend_string;
878
+ *|;
879
+
880
+ regexp_modifiers := |*
881
+ [A-Za-z]+
882
+ => {
883
+ unknown_options = tok.scan(/[^imxouesn]/)
884
+ if unknown_options.any?
885
+ error "unknown regexp options: #{unknown_options.join}"
886
+ end
887
+
888
+ emit(:tREGEXP_OPT)
889
+ fgoto expr_end;
890
+ };
891
+
892
+ any
893
+ => {
894
+ emit(:tREGEXP_OPT, tok(@ts, @te - 1), @ts, @te - 1)
895
+ fhold; fgoto expr_end;
896
+ };
897
+ *|;
898
+
899
+ #
900
+ # === EXPRESSION PARSING ===
901
+ #
902
+
903
+ # These rules implement a form of manually defined lookahead.
904
+ # The default longest-match scanning does not work here due
905
+ # to sheer ambiguity.
906
+
907
+ ambiguous_ident_suffix = # actual parsed
908
+ [?!=] %{ tm = p } | # a? a?
909
+ '==' %{ tm = p - 2 } | # a==b a == b
910
+ '=~' %{ tm = p - 2 } | # a=~b a =~ b
911
+ '=>' %{ tm = p - 2 } | # a=>b a => b
912
+ '===' %{ tm = p - 3 } # a===b a === b
913
+ ;
914
+
915
+ ambiguous_symbol_suffix = # actual parsed
916
+ ambiguous_ident_suffix |
917
+ '==>' %{ tm = p - 2 } # :a==>b :a= => b
918
+ ;
919
+
920
+ # Ambiguous with 1.9 hash labels.
921
+ ambiguous_const_suffix = # actual parsed
922
+ '::' %{ tm = p - 2 } # A::B A :: B
923
+ ;
924
+
925
+ # Ruby 1.9 lambdas require parentheses counting in order to
926
+ # emit correct opening kDO/tLBRACE.
927
+
928
+ e_lparen = '(' % {
929
+ @paren_nest += 1
930
+ };
931
+
932
+ e_rparen = ')' % {
933
+ @paren_nest -= 1
934
+ };
935
+
936
+ # Variable lexing code is accessed from both expressions and
937
+ # string interpolation related code.
938
+ #
939
+ expr_variable := |*
940
+ global_var
941
+ => {
942
+ if tok =~ /^\$([1-9][0-9]*)$/
943
+ emit(:tNTH_REF, $1.to_i)
944
+ elsif tok =~ /^\$([&`'+])$/
945
+ emit(:tBACK_REF, $1.to_sym)
946
+ else
947
+ emit(:tGVAR)
948
+ end
949
+
950
+ fnext *@stack.pop; fbreak;
951
+ };
952
+
953
+ class_var_v
954
+ => {
955
+ error "`#{tok}' is not allowed as a class variable name" if tok =~ /^@@[0-9]/
956
+
957
+ emit(:tCVAR)
958
+ fnext *@stack.pop; fbreak;
959
+ };
960
+
961
+ instance_var_v
962
+ => {
963
+ error "`#{tok}' is not allowed as an instance variable name" if tok =~ /^@[0-9]/
964
+
965
+ emit(:tIVAR)
966
+ fnext *@stack.pop; fbreak;
967
+ };
968
+ *|;
969
+
970
+ # Literal function name in definition (e.g. `def class`).
971
+ # Keywords are returned as their respective tokens; this is used
972
+ # to support singleton def `def self.foo`. Global variables are
973
+ # returned as `tGVAR`; this is used in global variable alias
974
+ # statements `alias $a $b`. Symbols are returned verbatim; this
975
+ # is used in `alias :a :"b#{foo}"` and `undef :a`.
976
+ #
977
+ # Transitions to `expr_end` afterwards.
978
+ #
979
+ expr_fname := |*
980
+ keyword
981
+ => { emit(KEYWORDS[tok]);
982
+ fnext expr_end; fbreak; };
983
+
984
+ bareword
985
+ => { emit(:tIDENTIFIER)
986
+ fnext expr_end; fbreak; };
987
+
988
+ bareword ambiguous_ident_suffix
989
+ => { emit(:tIDENTIFIER, tok(@ts, tm), @ts, tm)
990
+ fnext expr_end; p = tm - 1; fbreak; };
991
+
992
+ operator_fname |
993
+ operator_arithmetic |
994
+ operator_rest
995
+ => { emit_table(PUNCTUATION)
996
+ fnext expr_end; fbreak; };
997
+
998
+ ':'
999
+ => { fhold; fgoto expr_end; };
1000
+
1001
+ global_var
1002
+ => { emit(:tGVAR)
1003
+ fbreak; };
1004
+
1005
+ c_space_nl+;
1006
+
1007
+ c_any
1008
+ => { fhold; fgoto expr_end; };
1009
+
1010
+ c_eof => do_eof;
1011
+ *|;
1012
+
1013
+ # Literal function name in method call (e.g. `a.class`).
1014
+ #
1015
+ # Transitions to `expr_arg` afterwards.
1016
+ #
1017
+ expr_dot := |*
1018
+ bareword
1019
+ => { emit(:tIDENTIFIER)
1020
+ fnext expr_arg; fbreak; };
1021
+
1022
+ bareword ambiguous_ident_suffix
1023
+ => { emit(:tIDENTIFIER, tok(@ts, tm), @ts, tm)
1024
+ fnext expr_arg; p = tm - 1; fbreak; };
1025
+
1026
+ operator_fname |
1027
+ operator_arithmetic |
1028
+ operator_rest
1029
+ => { emit_table(PUNCTUATION)
1030
+ fnext expr_arg; fbreak; };
1031
+
1032
+ c_space_nl+;
1033
+
1034
+ c_any
1035
+ => { fhold; fgoto expr_end; };
1036
+
1037
+ c_eof => do_eof;
1038
+ *|;
1039
+
1040
+ # The previous token emitted was a `tIDENTIFIER` or `tFID`; no space
1041
+ # is consumed; the current expression is a command or method call.
1042
+ #
1043
+ expr_arg := |*
1044
+ #
1045
+ # COMMAND MODE SPECIFIC TOKENS
1046
+ #
1047
+
1048
+ # cmd (1 + 2)
1049
+ # See below the rationale about expr_endarg.
1050
+ c_space+ e_lparen
1051
+ => { emit(:tLPAREN_ARG, '(', @te - 1, @te)
1052
+ fnext expr_beg; fbreak; };
1053
+
1054
+ # meth(1 + 2)
1055
+ # Regular method call.
1056
+ e_lparen
1057
+ => { emit(:tLPAREN2)
1058
+ fnext expr_beg; fbreak; };
1059
+
1060
+ # meth [...]
1061
+ # Array argument. Compare with indexing `meth[...]`.
1062
+ c_space+ '['
1063
+ => { emit(:tLBRACK, '[', @te - 1, @te);
1064
+ fnext expr_beg; fbreak; };
1065
+
1066
+ # cmd {}
1067
+ # Command: method call without parentheses.
1068
+ c_space* e_lbrace
1069
+ => {
1070
+ if @lambda_stack.last == @paren_nest
1071
+ p = @ts - 1
1072
+ fgoto expr_end;
1073
+ else
1074
+ emit(:tLCURLY, '{', @te - 1, @te)
1075
+ fnext expr_value; fbreak;
1076
+ end
1077
+ };
1078
+
1079
+ # a.b
1080
+ # Dot-call.
1081
+ '.' | '::'
1082
+ => { emit_table(PUNCTUATION);
1083
+ fnext expr_dot; fbreak; };
1084
+
1085
+ #
1086
+ # AMBIGUOUS TOKENS RESOLVED VIA EXPR_BEG
1087
+ #
1088
+
1089
+ # a ?b
1090
+ # Character literal.
1091
+ c_space+ '?'
1092
+ => { fhold; fgoto expr_beg; };
1093
+
1094
+ # x +1
1095
+ # Ambiguous unary operator or regexp literal.
1096
+ c_space+ [+\-/]
1097
+ => {
1098
+ warning "ambiguous first argument; put parentheses or even spaces", @te - 1, @te
1099
+ fhold; fhold; fgoto expr_beg;
1100
+ };
1101
+
1102
+ # x *1
1103
+ # Ambiguous splat or block-pass.
1104
+ c_space+ [*&]
1105
+ => {
1106
+ what = tok(@te - 1, @te)
1107
+ warning "`#{what}' interpreted as argument prefix", @te - 1, @te
1108
+ fhold; fgoto expr_beg;
1109
+ };
1110
+
1111
+ #
1112
+ # AMBIGUOUS TOKENS RESOLVED VIA EXPR_END
1113
+ #
1114
+
1115
+ # a ? b
1116
+ # Ternary operator.
1117
+ c_space+ '?' c_space_nl
1118
+ => { fhold; fhold; fgoto expr_end; };
1119
+
1120
+ # x + 1: Binary operator or operator-assignment.
1121
+ c_space* operator_arithmetic
1122
+ ( '=' | c_space_nl )? |
1123
+ # x rescue y: Modifier keyword.
1124
+ c_space+ keyword_modifier |
1125
+ # Miscellanea.
1126
+ c_space* punctuation_end
1127
+ => {
1128
+ p = @ts - 1
1129
+ fgoto expr_end;
1130
+ };
1131
+
1132
+ c_space* c_nl
1133
+ => { fhold; fgoto expr_end; };
1134
+
1135
+ c_any
1136
+ => { fhold; fgoto expr_beg; };
1137
+
1138
+ c_eof => do_eof;
1139
+ *|;
1140
+
1141
+ # The rationale for this state is pretty complex. Normally, if an argument
1142
+ # is passed to a command and then there is a block (tLCURLY...tRCURLY),
1143
+ # the block is attached to the innermost argument (`f` in `m f {}`), or it
1144
+ # is a parse error (`m 1 {}`). But there is a special case for passing a single
1145
+ # primary expression grouped with parentheses: if you write `m (1) {}` or
1146
+ # (2.0 only) `m () {}`, then the block is attached to `m`.
1147
+ #
1148
+ # Thus, we recognize the opening `(` of a command (remember, a command is
1149
+ # a method call without parens) as a tLPAREN_ARG; then, in the parser, we recognize
1150
+ # `tLPAREN_ARG expr rparen` as a `primary_expr` and before rparen, set the
1151
+ # lexer's state to `expr_endarg`, which makes it emit the possibly following
1152
+ # `{` as `tLBRACE_ARG`.
1153
+ #
1154
+ # The default post-`expr_endarg` state is `expr_end`, so this state also handles
1155
+ # `do` (as `kDO_BLOCK` in `expr_beg`). (I have no clue why the parser cannot
1156
+ # just handle `kDO`.)
1157
+ expr_endarg := |*
1158
+ e_lbrace
1159
+ => { emit(:tLBRACE_ARG)
1160
+ fnext expr_value; };
1161
+
1162
+ 'do'
1163
+ => { emit(:kDO_BLOCK)
1164
+ fnext expr_value; };
1165
+
1166
+ c_space*;
1167
+
1168
+ c_any
1169
+ => { fhold; fgoto expr_end; };
1170
+
1171
+ c_eof => do_eof;
1172
+ *|;
1173
+
1174
+ # The rationale for this state is that several keywords accept a value
1175
+ # (i.e. should transition to `expr_beg`), do not accept it like a command
1176
+ # (i.e. not an `expr_arg`), and must behave like a statement, that is,
1177
+ # accept a modifier if/while/etc.
1178
+ #
1179
+ expr_mid := |*
1180
+ keyword_modifier
1181
+ => { emit_table(KEYWORDS)
1182
+ fnext expr_beg; fbreak; };
1183
+
1184
+ c_space+;
1185
+
1186
+ c_nl
1187
+ => { fhold; fgoto expr_end; };
1188
+
1189
+ c_any
1190
+ => { fhold; fgoto expr_beg; };
1191
+
1192
+ c_eof => do_eof;
1193
+ *|;
1194
+
1195
+ # Beginning of an expression.
1196
+ #
1197
+ # Don't fallthrough to this state from `c_any`; make sure to handle
1198
+ # `c_space* c_nl` and let `expr_end` handle the newline.
1199
+ # Otherwise code like `f\ndef x` gets glued together and the parser
1200
+ # explodes.
1201
+ #
1202
+ expr_beg := |*
1203
+ # Numeric processing. Converts:
1204
+ # +5 to [tINTEGER, 5]
1205
+ # -5 to [tUMINUS_NUM] [tINTEGER, 5]
1206
+ [+\-][0-9]
1207
+ => {
1208
+ fhold;
1209
+ if tok.start_with? '-'
1210
+ emit(:tUMINUS_NUM, '-')
1211
+ fnext expr_end; fbreak;
1212
+ end
1213
+ };
1214
+
1215
+ # splat *a
1216
+ '*'
1217
+ => { emit(:tSTAR)
1218
+ fbreak; };
1219
+
1220
+ #
1221
+ # STRING AND REGEXP LITERALS
1222
+ #
1223
+
1224
+ # a / 42
1225
+ # a % 42
1226
+ # a %= 42 (disambiguation with %=string=)
1227
+ [/%] c_space_nl | '%=' # /
1228
+ => {
1229
+ fhold; fhold;
1230
+ fgoto expr_end;
1231
+ };
1232
+
1233
+ # /regexp/oui
1234
+ '/'
1235
+ => {
1236
+ type, delimiter = tok, tok
1237
+ fgoto *push_literal(type, delimiter, @ts);
1238
+ };
1239
+
1240
+ # %<string>
1241
+ '%' ( c_any - [A-Za-z] )
1242
+ => {
1243
+ type, delimiter = tok[0], tok[-1]
1244
+ fgoto *push_literal(type, delimiter, @ts);
1245
+ };
1246
+
1247
+ # %w(we are the people)
1248
+ '%' [A-Za-z]+ c_any
1249
+ => {
1250
+ type, delimiter = tok[0..-2], tok[-1]
1251
+ fgoto *push_literal(type, delimiter, @ts);
1252
+ };
1253
+
1254
+ '%' c_eof
1255
+ => {
1256
+ error "unterminated string meets end of file"
1257
+ };
1258
+
1259
+ # Heredoc start.
1260
+ # <<EOF | <<-END | <<"FOOBAR" | <<-`SMTH`
1261
+ '<<' '-'?
1262
+ ( '"' ( c_any - c_nl - '"' )* '"'
1263
+ | "'" ( c_any - c_nl - "'" )* "'"
1264
+ | "`" ( c_any - c_nl - "`" )* "`"
1265
+ | bareword ) % { @heredoc_e = p }
1266
+ ( c_any - c_nl )* c_nl % { new_herebody_s = p }
1267
+ => {
1268
+ tok(@ts, @heredoc_e) =~ /^<<(-?)(["'`]?)(.*)\2$/
1269
+
1270
+ indent = !$1.empty?
1271
+ type = $2.empty? ? '"' : $2
1272
+ delimiter = $3
1273
+
1274
+ fnext *push_literal(type, delimiter, @ts, @heredoc_e, indent);
1275
+
1276
+ if @herebody_s.nil?
1277
+ @herebody_s = new_herebody_s
1278
+ end
1279
+
1280
+ p = @herebody_s - 1
1281
+ };
1282
+
1283
+ #
1284
+ # AMBIGUOUS TERNARY OPERATOR
1285
+ #
1286
+
1287
+ '?' ( e_bs escape
1288
+ | c_any - c_space_nl - e_bs % { @escape = nil }
1289
+ )
1290
+ => {
1291
+ # Show an error if memorized.
1292
+ @escape.call if @escape.respond_to? :call
1293
+
1294
+ value = @escape || tok(@ts + 1)
1295
+
1296
+ if ruby18?
1297
+ emit(:tINTEGER, value.ord)
1298
+ else
1299
+ emit(:tSTRING, value)
1300
+ end
1301
+
1302
+ fbreak;
1303
+ };
1304
+
1305
+ '?' c_space_nl
1306
+ => {
1307
+ escape = { " " => '\s', "\r" => '\r', "\n" => '\n', "\t" => '\t',
1308
+ "\v" => '\v', "\f" => '\f' }[tok[@ts + 1]]
1309
+ warning "invalid character syntax; use ?#{escape}", @ts
1310
+
1311
+ p = @ts - 1
1312
+ fgoto expr_end;
1313
+ };
1314
+
1315
+ '?' c_eof
1316
+ => {
1317
+ error "incomplete character syntax"
1318
+ };
1319
+
1320
+ # f ?aa : b: Disambiguate with a character literal.
1321
+ '?' [A-Za-z_] bareword
1322
+ => {
1323
+ p = @ts - 1
1324
+ fgoto expr_end;
1325
+ };
1326
+
1327
+ #
1328
+ # KEYWORDS AND PUNCTUATION
1329
+ #
1330
+
1331
+ # a(+b)
1332
+ punctuation_begin |
1333
+ # a({b=>c})
1334
+ e_lbrace |
1335
+ # a()
1336
+ e_lparen
1337
+ => { emit_table(PUNCTUATION_BEGIN)
1338
+ fbreak; };
1339
+
1340
+ # rescue Exception => e: Block rescue.
1341
+ # Special because it should transition to expr_mid.
1342
+ 'rescue'
1343
+ => { emit_table(KEYWORDS_BEGIN)
1344
+ fnext expr_mid; fbreak; };
1345
+
1346
+ # if a: Statement if.
1347
+ keyword_modifier
1348
+ => { emit_table(KEYWORDS_BEGIN)
1349
+ fnext expr_value; fbreak; };
1350
+
1351
+ #
1352
+ # RUBY 1.9 HASH LABELS
1353
+ #
1354
+
1355
+ bareword ':' ( c_any - ':' )
1356
+ => {
1357
+ fhold;
1358
+
1359
+ if ruby18?
1360
+ emit(:tIDENTIFIER, tok(@ts, @te - 2), @ts, @te - 2)
1361
+ fhold; # continue as a symbol
1362
+ else
1363
+ emit(:tLABEL, tok(@ts, @te - 2), @ts, @te - 1)
1364
+ end
1365
+
1366
+ fbreak;
1367
+ };
1368
+
1369
+ #
1370
+ # CONTEXT-DEPENDENT VARIABLE LOOKUP OR COMMAND INVOCATION
1371
+ #
1372
+
1373
+ # foo= bar: Disambiguate with bareword rule below.
1374
+ bareword ambiguous_ident_suffix |
1375
+ # def foo: Disambiguate with bareword rule below.
1376
+ keyword
1377
+ => { p = @ts - 1
1378
+ fgoto expr_end; };
1379
+
1380
+ # a = 42; a [42]: Indexing.
1381
+ # def a; end; a [42]: Array argument.
1382
+ call_or_var
1383
+ => {
1384
+ emit(:tIDENTIFIER)
1385
+
1386
+ if @static_env && @static_env.declared?(tok.to_sym)
1387
+ fgoto expr_end;
1388
+ else
1389
+ fgoto expr_arg;
1390
+ end
1391
+ };
1392
+
1393
+ c_space_nl+;
1394
+
1395
+ # The following rules match most binary and all unary operators.
1396
+ # Rules for binary operators provide better error reporting.
1397
+ operator_arithmetic '=' |
1398
+ operator_rest |
1399
+ punctuation_end |
1400
+ c_any
1401
+ => { p = @ts - 1; fgoto expr_end; };
1402
+
1403
+ c_eof => do_eof;
1404
+ *|;
1405
+
1406
+ # Like expr_beg, but no 1.9 label possible.
1407
+ #
1408
+ expr_value := |*
1409
+ # a:b: a(:b), a::B, A::B
1410
+ bareword ':'
1411
+ => { p = @ts - 1
1412
+ fgoto expr_end; };
1413
+
1414
+ c_space_nl+;
1415
+
1416
+ c_any
1417
+ => { fhold; fgoto expr_beg; };
1418
+
1419
+ c_eof => do_eof;
1420
+ *|;
1421
+
1422
+ expr_end := |*
1423
+ #
1424
+ # STABBY LAMBDA
1425
+ #
1426
+
1427
+ '->'
1428
+ => {
1429
+ emit_table(PUNCTUATION)
1430
+
1431
+ @lambda_stack.push @paren_nest
1432
+ fbreak;
1433
+ };
1434
+
1435
+ e_lbrace | 'do'
1436
+ => {
1437
+ if @lambda_stack.last == @paren_nest
1438
+ @lambda_stack.pop
1439
+
1440
+ if tok == '{'
1441
+ emit(:tLAMBEG)
1442
+ else
1443
+ emit(:kDO_LAMBDA)
1444
+ end
1445
+ else
1446
+ emit_table(PUNCTUATION)
1447
+ end
1448
+
1449
+ fnext expr_value; fbreak;
1450
+ };
1451
+
1452
+ #
1453
+ # KEYWORDS
1454
+ #
1455
+
1456
+ keyword_with_fname
1457
+ => { emit_table(KEYWORDS)
1458
+ fnext expr_fname; fbreak; };
1459
+
1460
+ 'class' c_space_nl '<<'
1461
+ => { emit(:kCLASS, 'class', @ts, @ts + 5)
1462
+ emit(:tLSHFT, '<<', @te - 2, @te)
1463
+ fnext expr_beg; fbreak; };
1464
+
1465
+ # a if b:c: Syntax error.
1466
+ keyword_modifier
1467
+ => { emit_table(KEYWORDS)
1468
+ fnext expr_beg; fbreak; };
1469
+
1470
+ # elsif b:c: elsif b(:c)
1471
+ keyword_with_value
1472
+ => { emit_table(KEYWORDS)
1473
+ fnext expr_value; fbreak; };
1474
+
1475
+ keyword_with_mid
1476
+ => { emit_table(KEYWORDS)
1477
+ fnext expr_mid; fbreak; };
1478
+
1479
+ keyword_with_arg
1480
+ => {
1481
+ emit_table(KEYWORDS)
1482
+
1483
+ if ruby18? && tok == 'not'
1484
+ fnext expr_beg; fbreak;
1485
+ else
1486
+ fnext expr_arg; fbreak;
1487
+ end
1488
+ };
1489
+
1490
+ keyword_with_end
1491
+ => { emit_table(KEYWORDS)
1492
+ fbreak; };
1493
+
1494
+ #
1495
+ # NUMERIC LITERALS
1496
+ #
1497
+
1498
+ ( '0' [Xx] %{ @num_base = 16; @num_digits_s = p }
1499
+ ( xdigit+ '_' )* xdigit* '_'?
1500
+ | '0' [Dd] %{ @num_base = 10; @num_digits_s = p }
1501
+ ( digit+ '_' )* digit* '_'?
1502
+ | '0' [Oo] %{ @num_base = 8; @num_digits_s = p }
1503
+ ( digit+ '_' )* digit* '_'?
1504
+ | '0' [Bb] %{ @num_base = 2; @num_digits_s = p }
1505
+ ( [01]+ '_' )* [01]* '_'?
1506
+ | [1-9] %{ @num_base = 10; @num_digits_s = @ts }
1507
+ ( '_' digit+ )* digit* '_'?
1508
+ | '0' %{ @num_base = 8; @num_digits_s = @ts }
1509
+ ( '_' digit+ )* digit* '_'?
1510
+ )
1511
+ => {
1512
+ digits = tok(@num_digits_s)
1513
+
1514
+ if digits.end_with? '_'
1515
+ error "trailing `_' in number"
1516
+ elsif digits.empty? && @num_base == 8 && ruby18?
1517
+ # 1.8 did not raise an error on 0o.
1518
+ digits = "0"
1519
+ elsif digits.empty?
1520
+ error "numeric literal without digits"
1521
+ elsif @num_base == 8 && digits =~ /[89]/
1522
+ error "invalid octal digit"
1523
+ end
1524
+
1525
+ emit(:tINTEGER, digits.to_i(@num_base))
1526
+ fbreak;
1527
+ };
1528
+
1529
+ # Floating point literals cannot start with 0 except when a dot
1530
+ # follows immediately, probably to avoid confusion with octal literals.
1531
+ ( [1-9] [0-9]* ( '_' digit+ )* |
1532
+ '0'
1533
+ )?
1534
+ (
1535
+ '.' ( digit+ '_' )* digit+ |
1536
+ ( '.' ( digit+ '_' )* digit+ )? [eE] [+\-]? ( digit+ '_' )* digit+
1537
+ )
1538
+ => {
1539
+ if tok.start_with? '.'
1540
+ error "no .<digit> floating literal anymore; put 0 before dot"
1541
+ elsif tok =~ /^[eE]/
1542
+ # The rule above allows specifying floats as just `e10', which is
1543
+ # certainly not a float. Send a patch if you can do this better.
1544
+ emit(:tIDENTIFIER, tok)
1545
+ fbreak;
1546
+ end
1547
+
1548
+ emit(:tFLOAT, tok.to_f)
1549
+ fbreak;
1550
+ };
1551
+
1552
+ #
1553
+ # SYMBOL LITERALS
1554
+ #
1555
+
1556
+ # `echo foo` | :"bar" | :'baz'
1557
+ '`' | ':'? ['"] # '
1558
+ => {
1559
+ type, delimiter = tok, tok[-1]
1560
+ fgoto *push_literal(type, delimiter, @ts);
1561
+ };
1562
+
1563
+ ':' bareword ambiguous_symbol_suffix
1564
+ => { emit(:tSYMBOL, tok(@ts + 1, tm))
1565
+ p = tm - 1; fbreak; };
1566
+
1567
+ ':' ( bareword | global_var | class_var | instance_var |
1568
+ operator_fname | operator_arithmetic | operator_rest )
1569
+ => { emit(:tSYMBOL, tok(@ts + 1))
1570
+ fbreak; };
1571
+
1572
+ #
1573
+ # CONSTANTS AND VARIABLES
1574
+ #
1575
+
1576
+ constant
1577
+ => { emit(:tCONSTANT)
1578
+ fbreak; };
1579
+
1580
+ constant ambiguous_const_suffix
1581
+ => { emit(:tCONSTANT, tok(@ts, tm))
1582
+ p = tm - 1; fbreak; };
1583
+
1584
+ global_var | class_var_v | instance_var_v
1585
+ => { p = @ts - 1; fcall expr_variable; };
1586
+
1587
+ #
1588
+ # METHOD CALLS
1589
+ #
1590
+
1591
+ '.'
1592
+ => { emit_table(PUNCTUATION)
1593
+ fnext expr_dot; fbreak; };
1594
+
1595
+ call_or_var
1596
+ => { emit(:tIDENTIFIER)
1597
+ fnext expr_arg; fbreak; };
1598
+
1599
+ call_or_var [?!]
1600
+ => { emit(:tFID)
1601
+ fnext expr_arg; fbreak; };
1602
+
1603
+ #
1604
+ # OPERATORS
1605
+ #
1606
+
1607
+ ( e_lparen |
1608
+ operator_arithmetic |
1609
+ operator_rest
1610
+ ) %{ tm = p } c_space_nl*
1611
+ => { emit_table(PUNCTUATION, @ts, tm)
1612
+ fnext expr_beg; fbreak; };
1613
+
1614
+ e_rbrace | e_rparen | ']'
1615
+ => { emit_table(PUNCTUATION)
1616
+ fbreak; };
1617
+
1618
+ operator_arithmetic '='
1619
+ => { emit(:tOP_ASGN, tok(@ts, @te - 1))
1620
+ fnext expr_beg; fbreak; };
1621
+
1622
+ '?'
1623
+ => { emit_table(PUNCTUATION)
1624
+ fnext expr_value; fbreak; };
1625
+
1626
+ punctuation_end
1627
+ => { emit_table(PUNCTUATION)
1628
+ fnext expr_beg; fbreak; };
1629
+
1630
+ #
1631
+ # WHITESPACE
1632
+ #
1633
+
1634
+ '\\' e_heredoc_nl;
1635
+ '\\' ( any - c_nl ) {
1636
+ error "bare backslash only allowed before newline"
1637
+ };
1638
+
1639
+ '#' ( c_any - c_nl )*
1640
+ => { @comments << tok(@ts, @te + 1) };
1641
+
1642
+ e_heredoc_nl
1643
+ => { fgoto leading_dot; };
1644
+
1645
+ ';'
1646
+ => { emit_table(PUNCTUATION)
1647
+ fnext expr_value; fbreak; };
1648
+
1649
+ c_space+;
1650
+
1651
+ c_any
1652
+ => {
1653
+ error "unexpected #{tok.inspect}"
1654
+ };
1655
+
1656
+ c_eof => do_eof;
1657
+ *|;
1658
+
1659
+ leading_dot := |*
1660
+ # Insane leading dots:
1661
+ # a #comment
1662
+ # .b: a.b
1663
+ c_space* '.' ( c_any - '.' )
1664
+ => { fhold; fhold;
1665
+ fgoto expr_end; };
1666
+
1667
+ any
1668
+ => { emit(:tNL, nil, @newline_s, @newline_s + 1)
1669
+ fnext line_begin; fhold; fbreak; };
1670
+ *|;
1671
+
1672
+ #
1673
+ # === EMBEDDED DOCUMENT (aka BLOCK COMMENT) PARSING ===
1674
+ #
1675
+
1676
+ line_comment := |*
1677
+ '=end' c_line* c_nl
1678
+ => { @comments << tok
1679
+ fgoto line_begin; };
1680
+
1681
+ c_line* c_nl
1682
+ => { @comments << tok };
1683
+
1684
+ any
1685
+ => {
1686
+ @comments = ""
1687
+ error "embedded document meats end of file (and they embark on a romantic journey)"
1688
+ };
1689
+ *|;
1690
+
1691
+ line_begin := |*
1692
+ c_space_nl+;
1693
+
1694
+ '#' c_line* c_eol
1695
+ => { @comments << tok
1696
+ fhold; };
1697
+
1698
+ '=begin' ( c_space | c_eol )
1699
+ => { @comments << tok(@ts, @te)
1700
+ fgoto line_comment; };
1701
+
1702
+ '__END__' c_eol
1703
+ => { p = pe - 1 };
1704
+
1705
+ c_any
1706
+ => { fhold; fgoto expr_value; };
1707
+
1708
+ c_eof => do_eof;
1709
+ *|;
1710
+
1711
+ }%%
1712
+ # %
1713
+ end