parser 0.9.alpha → 0.9.alpha1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ac470264d9dbf4e1781557f2e4c6f93050ca1fca
4
- data.tar.gz: 2af0ef8f8dee1d84f91305adbbd6f099f99b5c8f
3
+ metadata.gz: 384df81635da81957880f54cb589109db642c914
4
+ data.tar.gz: 101c991d44683e9ba699a3ec3deca74572fb7a09
5
5
  SHA512:
6
- metadata.gz: cceb3bc547346c33f28c9392971b0d6df75c289c3e5046bebac559bdf3a7334c902efae7d2c02e294d40930589ff27db011ab95afb72ff0f9f82af6851db2e0c
7
- data.tar.gz: 0121d2b3fea37bac9b97db352c1cc55469cfa0ca854f0f913f0d8cb9015740e6799d9180a2a29c6a9715285e8acbc50d5717fafd8759de3966af222fd963a3b4
6
+ metadata.gz: 0724f1d86bbe49d1aa390c5bea5e3e6c859850b59be9ef0b78623f23cbe97cae94f75de5bc961cbdad1aabe6a1bb166e63dcac4103bb211a41c19ae86d8d3624
7
+ data.tar.gz: 1c64825e1d3a58b00d1a7038b599e161025b516435cde204e325061b1800416813f8ef2ab154f5b5dc99a1dc9cb1d11a9dfa33adb1e2e35f756c3fa10a9f5960
@@ -0,0 +1,21 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ *.output
19
+ lib/parser/lexer.rb
20
+ lib/parser/ruby18.rb
21
+ lib/parser/ruby19.rb
@@ -0,0 +1,9 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.3
4
+ - 2.0.0
5
+ - rbx-19mode
6
+ - jruby-19mode
7
+ matrix:
8
+ allow_failures:
9
+ - rvm: jruby-19mode
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in parser.gemspec
4
+ gemspec
@@ -0,0 +1,25 @@
1
+ Copyright (c) 2013 Peter Zotov <whitequark@whitequark.org>
2
+
3
+ Parts of the source are derived from ruby_parser:
4
+ Copyright (c) Ryan Davis, seattle.rb
5
+
6
+ MIT License
7
+
8
+ Permission is hereby granted, free of charge, to any person obtaining
9
+ a copy of this software and associated documentation files (the
10
+ "Software"), to deal in the Software without restriction, including
11
+ without limitation the rights to use, copy, modify, merge, publish,
12
+ distribute, sublicense, and/or sell copies of the Software, and to
13
+ permit persons to whom the Software is furnished to do so, subject to
14
+ the following conditions:
15
+
16
+ The above copyright notice and this permission notice shall be
17
+ included in all copies or substantial portions of the Software.
18
+
19
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
23
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
24
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
25
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,29 @@
1
+ # Parser
2
+
3
+ [![Build Status](https://travis-ci.org/whitequark/parser.png?branch=master)](https://travis-ci.org/whitequark/parser)
4
+ [![Code Climate](https://codeclimate.com/github/whitequark/parser.png)](https://codeclimate.com/github/whitequark/parser)
5
+
6
+ Parser is a Ruby parser written in pure Ruby.
7
+
8
+ ## Installation
9
+
10
+ $ gem install parser
11
+
12
+ ## Usage
13
+
14
+ TODO: Write usage instructions here
15
+
16
+ ## Acknowledgements
17
+
18
+ The lexer testsuite is derived from [ruby_parser](http://github.com/seattlerb/ruby_parser).
19
+
20
+ The Bison parser rules are derived from [Ruby MRI](http://github.com/ruby/ruby) parse.y.
21
+
22
+ ## Contributing
23
+
24
+ 1. Make sure you have [Ragel 6.8](http://www.complang.org/ragel/) installed
25
+ 2. Fork it
26
+ 3. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 4. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 5. Push to the branch (`git push origin my-new-feature`)
29
+ 6. Create a new Pull Request
data/Rakefile CHANGED
@@ -1,192 +1,25 @@
1
- # -*- ruby -*-
1
+ require "bundler/gem_tasks"
2
2
 
3
- require 'rubygems'
4
- require 'hoe'
3
+ task :default => [:generate, :test]
5
4
 
6
- Hoe.plugin :seattlerb
7
- Hoe.plugin :racc
8
- Hoe.plugin :isolate
9
-
10
- Hoe.add_include_dirs "../../sexp_processor/dev/lib"
11
-
12
- Hoe.spec 'parser' do
13
- developer 'Peter Zotov', 'whitequark@whitequark.org'
14
-
15
- dependency 'sexp_processor', '~> 4.1'
16
-
17
- self.racc_flags << " -t" if plugin?(:racc) && ENV["DEBUG"]
18
- end
19
-
20
- file "lib/ruby18_parser.rb" => "lib/ruby18_parser.y"
21
- file "lib/ruby19_parser.rb" => "lib/ruby19_parser.y"
22
-
23
- file "lib/ruby_lexer.rb" => "lib/ruby_lexer.rl" do |t|
24
- sh "ragel -R #{t.prerequisites.first} -o #{t.name}"
25
- end
26
-
27
- task :clean do
28
- rm_rf(Dir["**/*~"] +
29
- Dir["**/*.diff"] +
30
- Dir["coverage.info"] +
31
- Dir["coverage"] +
32
- Dir["lib/*.output"])
33
- end
34
-
35
- def next_num(glob)
36
- num = Dir[glob].max[/\d+/].to_i + 1
37
- end
38
-
39
- desc "Compares PT to RP and deletes all files that match"
40
- task :compare do
41
- files = Dir["unit/**/*.rb"]
42
- puts "Parsing #{files.size} files"
43
- files.each do |file|
44
- puts file
45
- system "./cmp.rb -q #{file} && rm #{file}"
46
- end
47
- system 'find -d unit -type d -empty -exec rmdir {} \;'
48
- end
49
-
50
- desc "Compares PT to RP and stops on first failure"
51
- task :find_bug do
52
- files = Dir["unit/**/*.rb"]
53
- puts "Parsing #{files.size} files"
54
- files.each do |file|
55
- puts file
56
- sh "./cmp.rb -q #{file}"
57
- end
58
- end
59
-
60
- task :sort do
61
- sh 'grepsort "^ +def" lib/ruby_lexer.rb'
62
- sh 'grepsort "^ +def (test|util)" test/test_ruby_lexer.rb'
63
- end
64
-
65
- task :loc do
66
- loc1 = `wc -l ../1.0.0/lib/ruby_lexer.rb`[/\d+/]
67
- flog1 = `flog -s ../1.0.0/lib/ruby_lexer.rb`[/\d+\.\d+/]
68
- loc2 = `cat lib/ruby_lexer.rb lib/ruby_parser_extras.rb | wc -l`[/\d+/]
69
- flog2 = `flog -s lib/ruby_lexer.rb lib/ruby_parser_extras.rb`[/\d+\.\d+/]
70
-
71
- loc1, loc2, flog1, flog2 = loc1.to_i, loc2.to_i, flog1.to_f, flog2.to_f
72
-
73
- puts "1.0.0: loc = #{loc1} flog = #{flog1}"
74
- puts "dev : loc = #{loc2} flog = #{flog2}"
75
- puts "delta: loc = #{loc2-loc1} flog = #{flog2-flog1}"
76
- end
77
-
78
- desc "Validate against all normal files in unit dir"
79
- task :validate do
80
- sh "./cmp.rb unit/*.rb"
81
- end
82
-
83
- def run_and_log cmd, prefix
84
- files = ENV['FILES'] || 'unit/*.rb'
85
- p, x = prefix, "txt"
86
- n = Dir["#{p}.*.#{x}"].map { |s| s[/\d+/].to_i }.max + 1 rescue 1
87
- f = "#{p}.#{n}.#{x}"
88
-
89
- sh "#{cmd} #{Hoe::RUBY_FLAGS} bin/ruby_parse -q -g #{files} &> #{f}"
90
-
91
- puts File.read(f)
92
- end
93
-
94
- desc "Benchmark against all normal files in unit dir"
95
- task :benchmark do
96
- run_and_log "ruby", "benchmark"
97
- end
98
-
99
- desc "Profile against all normal files in unit dir"
100
- task :profile do
101
- run_and_log "zenprofile", "profile"
102
- end
103
-
104
- desc "what was that command again?"
105
- task :huh? do
106
- puts "ruby #{Hoe::RUBY_FLAGS} bin/ruby_parse -q -g ..."
107
- end
108
-
109
- task :irb => [:isolate] do
110
- sh "GEM_HOME=#{Gem.path.first} irb -rubygems -Ilib -rruby_parser;"
111
- end
112
-
113
- def (task(:phony)).timestamp
114
- Time.at 0
115
- end
116
-
117
- task :isolate => :phony
118
-
119
- file "lib/ruby18_parser.rb" => :isolate
120
- file "lib/ruby19_parser.rb" => :isolate
121
-
122
- task :compare18 do
123
- sh "./yack.rb lib/ruby18_parser.output > racc18.txt"
124
- sh "./yack.rb parse18.output > yacc18.txt"
125
- sh "diff -du racc18.txt yacc18.txt || true"
126
- puts
127
- sh "diff -du racc18.txt yacc18.txt | wc -l"
128
- end
129
-
130
- task :compare19 do
131
- sh "./yack.rb lib/ruby19_parser.output > racc19.txt"
132
- sh "./yack.rb parse19.output > yacc19.txt"
133
- sh "diff -du racc19.txt yacc19.txt || true"
134
- puts
135
- sh "diff -du racc19.txt yacc19.txt | wc -l"
136
- end
137
-
138
- task :debug => :isolate do
139
- ENV["V"] ||= "19"
140
- Rake.application[:parser].invoke # this way we can have DEBUG set
141
-
142
- $: << "lib"
143
- require 'ruby_parser'
144
- require 'pp'
145
-
146
- parser = if ENV["V"] == "18" then
147
- Ruby18Parser.new
148
- else
149
- Ruby19Parser.new
150
- end
151
-
152
- time = (ENV["RP_TIMEOUT"] || 10).to_i
153
-
154
- file = ENV["F"] || ENV["FILE"]
155
-
156
- ruby = if file then
157
- File.read(file)
158
- else
159
- file = "env"
160
- ENV["R"] || ENV["RUBY"]
161
- end
162
-
163
- begin
164
- pp parser.process(ruby, file, time)
165
- rescue Racc::ParseError => e
166
- p e
167
- ss = parser.lexer.src
168
- src = ss.string
169
- lines = src[0..ss.pos].split(/\n/)
170
- abort "on #{file}:#{lines.size}"
5
+ task :test do
6
+ $LOAD_PATH << File.expand_path('../lib/', __FILE__)
7
+ Dir["test/test_*.rb"].each do |file|
8
+ load file
171
9
  end
172
10
  end
173
11
 
174
- task :debug_ruby do
175
- file = ENV["F"] || ENV["FILE"]
176
- sh "ruby19 -cwy #{file} 2>&1 | ./yuck.rb"
177
- end
178
-
179
- task :extract => :isolate do
180
- ENV["V"] ||= "19"
181
- Rake.application[:parser].invoke # this way we can have DEBUG set
12
+ desc "Generate the Ragel lexer and Bison parser."
13
+ task :generate => %w(lib/parser/lexer.rb)
14
+ #lib/parser/ruby18.rb
15
+ #lib/parser/ruby19.rb)
182
16
 
183
- file = ENV["F"] || ENV["FILE"]
17
+ task :build => :generate
184
18
 
185
- ruby "-Ilib", "bin/ruby_parse_extract_error", file
19
+ rule '.rb' => '.rl' do |t|
20
+ sh "ragel -R #{t.source} -o #{t.name}"
186
21
  end
187
22
 
188
- task :bugs do
189
- sh "for f in bug*.rb ; do rake19 debug F=$f && rm $f ; done"
23
+ rule '.rb' => '.y' do |t|
24
+ sh "racc #{t.source} -o #{t.name} -O"
190
25
  end
191
-
192
- # vim: syntax=Ruby
@@ -0,0 +1,4 @@
1
+ module Parser
2
+ require 'parser/static_environment'
3
+ require 'parser/lexer'
4
+ end
@@ -0,0 +1,1713 @@
1
+ %%machine lex; # % fix highlighting
2
+
3
+ #
4
+ # === BEFORE YOU START ===
5
+ #
6
+ # Remember two things about Ragel scanners:
7
+ #
8
+ # 1) Longest match wins.
9
+ #
10
+ # 2) If two matches have the same length, the first
11
+ # in source code wins.
12
+ #
13
+ # General rules of making Ragel and Bison happy:
14
+ #
15
+ # * `p` (position) and `@te` contain the index of the character
16
+ # they're pointing to ("current"), plus one. `@ts` contains the index
17
+ # of the corresponding character. The code for extracting the matched token is:
18
+ #
19
+ # @source[@ts...@te]
20
+ #
21
+ # * If your input is `foooooooobar` and the rule is:
22
+ #
23
+ # 'f' 'o'+
24
+ #
25
+ # the result will be:
26
+ #
27
+ # foooooooobar
28
+ # ^ ts=0 ^ p=te=9
29
+ #
30
+ # * A Ragel lexer action should not emit more than one token, unless
31
+ # you know what you are doing.
32
+ #
33
+ # * All Ragel commands (fnext, fgoto, ...) end with a semicolon.
34
+ #
35
+ # * If an action emits the token and transitions to another state, use
36
+ # these Ragel commands:
37
+ #
38
+ # emit($whatever)
39
+ # fnext $next_state; fbreak;
40
+ #
41
+ # * If an action does not emit a token:
42
+ #
43
+ # fgoto $next_state;
44
+ #
45
+ # * If an action features lookbehind, i.e. matches characters with the
46
+ # intent of passing them to another action:
47
+ #
48
+ # p = @ts - 1
49
+ # fgoto $next_state;
50
+ #
51
+ # or, if the lookbehind consists of a single character:
52
+ #
53
+ # fhold; fgoto $next_state;
54
+ #
55
+ # * Ragel merges actions. So, if you have `e_lparen = '(' %act` and
56
+ # `c_lparen = '('` and a lexer action `e_lparen | c_lparen`, the result
57
+ # _will_ invoke the action `act`.
58
+ #
59
+ # * EOF is explicit and is matched by `c_eof`. If you want to introspect
60
+ # the state of the lexer, add this rule to the state:
61
+ #
62
+ # c_eof => do_eof;
63
+ #
64
+ # * If you proceed past EOF, the lexer will complain:
65
+ #
66
+ # NoMethodError: undefined method `ord' for nil:NilClass
67
+ #
68
+
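A minimal sketch (editorial, not part of the gem source) reusing the `'f' 'o'+` example above to show how `@ts`/`@te` delimit the matched token:

    source = "foooooooobar"
    ts, te = 0, 9            # the positions from the diagram above
    source[ts...te]          # => "foooooooo"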
69
+ require 'parser/lexer_literal'
70
+ require 'parser/syntax_error'
71
+
72
+ class Parser::Lexer
73
+
74
+ %% write data nofinal;
75
+ # %
76
+
77
+ attr_reader :source
78
+ attr_accessor :static_env
79
+
80
+ attr_reader :location, :comments
81
+
82
+ def initialize(version)
83
+ @version = version
84
+
85
+ reset
86
+ end
87
+
88
+ def reset(reset_state=true)
89
+ if reset_state
90
+ # Unit tests set state prior to resetting lexer.
91
+ @cs = self.class.lex_en_line_begin
92
+ end
93
+
94
+ # Ragel-internal variables:
95
+ @p = 0 # stream position (saved manually in #advance)
96
+ @ts = nil # token start
97
+ @te = nil # token end
98
+ @act = 0 # next action
99
+
100
+ @stack = [] # state stack
101
+ @top = 0 # state stack top pointer
102
+
103
+ @token_queue = []
104
+ @literal_stack = []
105
+
106
+ @newlines = [0] # sorted set of \n positions
107
+ @newline_s = nil # location of last encountered newline
108
+ @location = nil # location of last #advance'd token
109
+
110
+ @comments = "" # collected comments
111
+
112
+ @num_base = nil # last numeric base
113
+ @num_digits_s = nil # starting position of numeric digits
114
+
115
+ @escape_s = nil # starting position of current sequence
116
+ @escape = nil # last escaped sequence, as string
117
+
118
+ # See below the section on parsing heredocs.
119
+ @heredoc_e = nil
120
+ @herebody_s = nil
121
+
122
+ # Ruby 1.9 ->() lambdas emit a distinct token if do/{ is
123
+ # encountered after a matching closing parenthesis.
124
+ @paren_nest = 0
125
+ @lambda_stack = []
126
+ end
127
+
128
+ def source=(source)
129
+ # Heredoc processing coupled with weird newline quirks
130
+ # requires three '\0' (EOF) chars to be appended; after
131
+ # `p = @heredoc_s`, if `p` points at EOF, the FSM cannot
132
+ # bail out early enough and will crash.
133
+ #
134
+ # Patches accepted.
135
+ #
136
+ @source = source.gsub(/\r\n/, "\n") + "\0\0\0"
137
+ end
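A small illustration (editorial sketch) of what the normalization above produces: CRLF pairs are collapsed and three NUL bytes are appended so the FSM can look ahead safely.

    "puts 1\r\n".gsub(/\r\n/, "\n") + "\0\0\0"   # => "puts 1\n\0\0\0"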
138
+
139
+ LEX_STATES = {
140
+ :line_begin => lex_en_line_begin,
141
+ :expr_beg => lex_en_expr_beg,
142
+ :expr_value => lex_en_expr_value,
143
+ :expr_mid => lex_en_expr_mid,
144
+ :expr_dot => lex_en_expr_dot,
145
+ :expr_fname => lex_en_expr_fname,
146
+ :expr_end => lex_en_expr_end,
147
+ :expr_arg => lex_en_expr_arg,
148
+ :expr_endarg => lex_en_expr_endarg,
149
+ }
150
+
151
+ def state
152
+ LEX_STATES.invert.fetch(@cs, @cs)
153
+ end
154
+
155
+ def state=(state)
156
+ @cs = LEX_STATES.fetch(state)
157
+ end
158
+
159
+ # Return next token: [type, value].
160
+ def advance
161
+ if @token_queue.any?
162
+ return with_location(@token_queue.shift)
163
+ end
164
+
165
+ # Ugly, but dependent on Ragel output. Consider refactoring it somehow.
166
+ _lex_trans_keys = self.class.send :_lex_trans_keys
167
+ _lex_actions = self.class.send :_lex_actions
168
+ _lex_key_offsets = self.class.send :_lex_key_offsets
169
+ _lex_index_offsets = self.class.send :_lex_index_offsets
170
+ _lex_single_lengths = self.class.send :_lex_single_lengths
171
+ _lex_range_lengths = self.class.send :_lex_range_lengths
172
+ _lex_indicies = self.class.send :_lex_indicies
173
+ _lex_trans_targs = self.class.send :_lex_trans_targs
174
+ _lex_trans_actions = self.class.send :_lex_trans_actions
175
+ _lex_to_state_actions = self.class.send :_lex_to_state_actions
176
+ _lex_from_state_actions = self.class.send :_lex_from_state_actions
177
+
178
+ p, pe, eof = @p, @source.length + 1, nil
179
+
180
+ %% write exec;
181
+ # %
182
+
183
+ @p = p
184
+
185
+ if @token_queue.any?
186
+ with_location(@token_queue.shift)
187
+ elsif @cs == self.class.lex_error
188
+ with_location([ false, '$undefined', p, p + 1 ])
189
+ else
190
+ with_location([ false, '$end', p, p + 1 ])
191
+ end
192
+ end
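A hedged usage sketch (mirroring `Lexer.do` defined further below, not an official API example): tokens are drained with `#advance` until it returns a false type, which stands for `'$end'`.

    lex = Parser::Lexer.new(19)
    lex.source = "a = 1"
    tokens = []
    loop do
      type, value = lex.advance
      break unless type          # false type signals '$end'
      tokens << [type, value]
    end
    # tokens is roughly [[:tIDENTIFIER, "a"], [:tEQL, "="], [:tINTEGER, 1]]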
193
+
194
+ # Like #advance, but also pretty-print the token and its position
195
+ # in the stream to `stdout`.
196
+ def advance_and_decorate
197
+ type, val = advance
198
+
199
+ puts decorate(location, "\e[0;32m#{type} #{val.inspect}\e[0m")
200
+
201
+ [type, val]
202
+ end
203
+
204
+ # Return the current collected comment block and clear the storage.
205
+ def clear_comments
206
+ comments = @comments
207
+ @comments = ""
208
+
209
+ comments
210
+ end
211
+
212
+ # Lex `source` for the Ruby version `version` with initial state `state`.
213
+ #
214
+ # The tokens displayed by this function are not the same as the tokens
215
+ # consumed by the parser, because the parser manipulates the lexer's state on
216
+ # its own.
217
+ def self.do(source, state=nil, version=19)
218
+ lex = new(version)
219
+ lex.source = source
220
+ lex.state = state if state
221
+
222
+ loop do
223
+ type, val = lex.advance_and_decorate
224
+ break if !type
225
+ end
226
+
227
+ puts "Lex state: #{lex.state}"
228
+ end
229
+
230
+ # Used by LexerLiteral to emit tokens for string content.
231
+ def emit(type, value = tok, s = @ts, e = @te)
232
+ if s.nil? || e.nil?
233
+ raise "broken #emit invocation in #{caller[0]}"
234
+ end
235
+
236
+ @token_queue << [ type, value, s, e ]
237
+ end
238
+
239
+ def emit_table(table, s = @ts, e = @te)
240
+ token = tok(s, e)
241
+ emit(table[token], token, s, e)
242
+ end
243
+
244
+ # shim
245
+ def lineno
246
+ @location[0] + 1
247
+ end
248
+
249
+ protected
250
+
251
+ def eof_char?(char)
252
+ [0x04, 0x1a, 0x00].include? char.ord
253
+ end
254
+
255
+ def ruby18?
256
+ @version == 18
257
+ end
258
+
259
+ def ruby19?
260
+ @version == 19
261
+ end
262
+
263
+ def tok(s = @ts, e = @te)
264
+ @source[s...e]
265
+ end
266
+
267
+ def record_newline(p)
268
+ @newlines = (@newlines + [p]).uniq.sort
269
+ end
270
+
271
+ def dissect_location(start, finish)
272
+ line_number = @newlines.rindex { |nl| start >= nl }
273
+ line_first_col = @newlines[line_number]
274
+
275
+ start_col = start - line_first_col
276
+ finish_col = finish - line_first_col
277
+
278
+ [ line_number, start_col, finish_col ]
279
+ end
280
+
281
+ def with_location(item)
282
+ type, value, start, finish = *item
283
+
284
+ @location = dissect_location(start, finish)
285
+
286
+ [ type, value ]
287
+ end
288
+
289
+ def decorate(location, message="")
290
+ line_number, from, to = location
291
+
292
+ line = @source.lines.drop(line_number).first
293
+ line[from...to] = "\e[4m#{line[from...to]}\e[0m"
294
+
295
+ tail_len = to - from - 1
296
+ tail = "~" * (tail_len >= 0 ? tail_len : 0)
297
+ decoration = "#{" " * from}\e[1;31m^#{tail}\e[0m #{message}"
298
+
299
+ [ line, decoration ]
300
+ end
301
+
302
+ def warning(message, start = @ts, finish = @te)
303
+ $stderr.puts "warning: #{message}"
304
+ $stderr.puts decorate(dissect_location(start, finish))
305
+ end
306
+
307
+ def error(message)
308
+ raise Parser::SyntaxError, message
309
+ end
310
+
311
+ #
312
+ # === LITERAL STACK ===
313
+ #
314
+
315
+ def push_literal(*args)
316
+ new_literal = Parser::LexerLiteral.new(self, *args)
317
+ @literal_stack.push(new_literal)
318
+
319
+ if new_literal.type == :tWORDS_BEG
320
+ self.class.lex_en_interp_words
321
+ elsif new_literal.type == :tQWORDS_BEG
322
+ self.class.lex_en_plain_words
323
+ elsif new_literal.interpolate?
324
+ self.class.lex_en_interp_string
325
+ else
326
+ self.class.lex_en_plain_string
327
+ end
328
+ end
329
+
330
+ def literal
331
+ @literal_stack[-1]
332
+ end
333
+
334
+ def pop_literal
335
+ old_literal = @literal_stack.pop
336
+
337
+ if old_literal.type == :tREGEXP_BEG
338
+ # Fetch modifiers.
339
+ self.class.lex_en_regexp_modifiers
340
+ else
341
+ self.class.lex_en_expr_end
342
+ end
343
+ end
344
+
345
+ # Mapping of strings to parser tokens.
346
+
347
+ PUNCTUATION = {
348
+ '=' => :tEQL, '&' => :tAMPER2, '|' => :tPIPE,
349
+ '!' => :tBANG, '^' => :tCARET, '+' => :tPLUS,
350
+ '-' => :tMINUS, '*' => :tSTAR2, '/' => :tDIVIDE,
351
+ '%' => :tPERCENT, '~' => :tTILDE, ',' => :tCOMMA,
352
+ ';' => :tSEMI, '.' => :tDOT, '..' => :tDOT2,
353
+ '...' => :tDOT3, '[' => :tLBRACK2, ']' => :tRBRACK,
354
+ '(' => :tLPAREN2, ')' => :tRPAREN, '?' => :tEH,
355
+ ':' => :tCOLON, '&&' => :tANDOP, '||' => :tOROP,
356
+ '-@' => :tUMINUS, '+@' => :tUPLUS, '~@' => :tTILDE,
357
+ '**' => :tPOW, '->' => :tLAMBDA, '=~' => :tMATCH,
358
+ '!~' => :tNMATCH, '==' => :tEQ, '!=' => :tNEQ,
359
+ '>' => :tGT, '>>' => :tRSHFT, '>=' => :tGEQ,
360
+ '<' => :tLT, '<<' => :tLSHFT, '<=' => :tLEQ,
361
+ '=>' => :tASSOC, '::' => :tCOLON2, '===' => :tEQQ,
362
+ '<=>' => :tCMP, '[]' => :tAREF, '[]=' => :tASET,
363
+ '{' => :tLCURLY, '}' => :tRCURLY, '`' => :tBACK_REF2,
364
+ 'do' => :kDO
365
+ }
366
+
367
+ PUNCTUATION_BEGIN = {
368
+ '&' => :tAMPER, '*' => :tSTAR, '+' => :tUPLUS,
369
+ '-' => :tUMINUS, '::' => :tCOLON3, '(' => :tLPAREN,
370
+ '{' => :tLBRACE, '[' => :tLBRACK,
371
+ }
372
+
373
+ KEYWORDS = {
374
+ 'if' => :kIF_MOD, 'unless' => :kUNLESS_MOD,
375
+ 'while' => :kWHILE_MOD, 'until' => :kUNTIL_MOD,
376
+ 'rescue' => :kRESCUE_MOD, 'defined?' => :kDEFINED,
377
+ 'BEGIN' => :klBEGIN, 'END' => :klEND,
378
+ }
379
+
380
+ %w(class module def undef begin end then elsif else ensure case when
381
+ for break next redo retry in do return yield super self nil true
382
+ false and or not alias __FILE__ __LINE__ __ENCODING__).each do |keyword|
383
+ KEYWORDS[keyword] = :"k#{keyword.upcase}"
384
+ end
385
+
386
+ KEYWORDS_BEGIN = {
387
+ 'if' => :kIF, 'unless' => :kUNLESS,
388
+ 'while' => :kWHILE, 'until' => :kUNTIL,
389
+ 'rescue' => :kRESCUE
390
+ }
391
+
392
+ %%{
393
+ # %
394
+
395
+ access @;
396
+ getkey @source[p].ord;
397
+
398
+ # === CHARACTER CLASSES ===
399
+ #
400
+ # Pay close attention to the differences between c_any and any.
401
+ # c_any does not include EOF and so will cause incorrect behavior
402
+ # for machine subtraction (any-except rules) and default transitions
403
+ # for scanners.
404
+
405
+ action do_nl {
406
+ # Record position of a newline for precise line and column reporting.
407
+ #
408
+ # This action is embedded directly into c_nl, as it is idempotent and
409
+ # there are no cases when we need to skip it.
410
+ record_newline(p + 1)
411
+ @newline_s = p
412
+ }
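A minimal sketch (illustrative values, not from the diff) of how the recorded newline positions are later turned into a line/column pair by `#dissect_location`, defined earlier in this file:

    newlines = [0, 6, 12]                             # accumulated by record_newline
    offset   = 8                                      # some token start position
    line     = newlines.rindex { |nl| offset >= nl }  # => 1 (zero-based line number)
    column   = offset - newlines[line]                # => 2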
413
+
414
+ c_nl = '\n' $ do_nl;
415
+ c_space = [ \t\r\f\v];
416
+ c_space_nl = c_space | c_nl;
417
+ c_eof = 0x04 | 0x1a | 0; # ^D, ^Z, EOF
418
+ c_eol = c_nl | c_eof;
419
+ c_any = any - c_eof - zlen;
420
+ c_line = c_any - c_nl;
421
+
422
+ c_unicode = c_any - 0x00..0x7f;
423
+ c_lower = [a-z_] | c_unicode;
424
+ c_upper = [A-Z] | c_unicode;
425
+ c_alpha = c_lower | c_upper;
426
+ c_alnum = c_alpha | [0-9];
427
+
428
+ action do_eof {
429
+ # Sit at EOF indefinitely. #advance would return $eof each time.
430
+ # This allows feeding the lexer more data if needed; this is only used
431
+ # in tests.
432
+ #
433
+ # Note that this action is not embedded into e_eof like e_nl and e_bs
434
+ # below. This is due to the fact that scanner state at EOF is observed
435
+ # by tests, and encapsulating it in a rule would break the introspection.
436
+ fhold; fbreak;
437
+ }
438
+
439
+ #
440
+ # === TOKEN DEFINITIONS ===
441
+ #
442
+
443
+ # All operators are punctuation. There is more to punctuation
444
+ # than just operators. Operators can be overridden by the user;
445
+ # punctuation cannot.
446
+
447
+ # A list of operators which are valid in the function name context, but
448
+ # have different semantics in others.
449
+ operator_fname = '[]' | '[]=' | '`' | '-@' | '+@' | '~@' ;
450
+
451
+ # A list of operators which can occur within an assignment shortcut (+ → +=).
452
+ operator_arithmetic = '&' | '|' | '&&' | '||' | '^' | '+' | '-' |
453
+ '*' | '/' | '**' | '~' | '**' | '<<' | '>>' |
454
+ '%' ;
455
+
456
+ # A list of all user-definable operators not covered by groups above.
457
+ operator_rest = '=~' | '!~' | '==' | '!=' | '!' | '===' |
458
+ '<' | '<=' | '>' | '>=' | '<=>' | '=>' ;
459
+
460
+ # Note that `{` and `}` need to be referred to as e_lbrace and e_rbrace,
461
+ # as they are ambiguous with interpolation `#{}` and should be counted.
462
+ # These braces are not present in punctuation lists.
463
+
464
+ # A list of punctuation which has a different meaning when used at the
465
+ # beginning of an expression.
466
+ punctuation_begin = '-' | '+' | '::' | '(' | '[' | '*' | '&' ;
467
+
468
+ # A list of all punctuation except punctuation_begin.
469
+ punctuation_end = ',' | '=' | '->' | '(' | '[' | ']' |
470
+ '::' | '?' | ':' | '.' | '..' | '...' ;
471
+
472
+ # A list of keywords which have a different meaning at the beginning of an expression.
473
+ keyword_modifier = 'if' | 'unless' | 'while' | 'until' | 'rescue' ;
474
+
475
+ # A list of keywords which accept an argument-like expression, i.e. have the
476
+ # same post-processing as method calls or commands. Example: `yield 1`,
477
+ # `yield (1)`, `yield(1)` are interpreted as if `yield` were a function.
478
+ keyword_with_arg = 'yield' | 'super' | 'not' | 'defined?' ;
479
+
480
+ # A list of keywords which accept a literal function name as an argument.
481
+ keyword_with_fname = 'def' | 'undef' | 'alias' ;
482
+
483
+ # A list of keywords which accept an expression after them.
484
+ keyword_with_value = 'else' | 'case' | 'ensure' | 'module' | 'elsif' | 'then' |
485
+ 'for' | 'in' | 'do' | 'when' | 'begin' | 'class' |
486
+ 'and' | 'or' ;
487
+
488
+ # A list of keywords which accept a value, and treat the keywords from
489
+ # the `keyword_modifier` list as modifiers.
490
+ keyword_with_mid = 'rescue' | 'return' | 'break' | 'next' ;
491
+
492
+ # A list of keywords which do not accept an expression after them.
493
+ keyword_with_end = 'end' | 'self' | 'true' | 'false' | 'retry' |
494
+ 'redo' | 'nil' | 'BEGIN' | 'END' | '__FILE__' |
495
+ '__LINE__' | '__ENCODING__';
496
+
497
+ # All keywords.
498
+ keyword = keyword_with_value | keyword_with_mid |
499
+ keyword_with_end | keyword_with_arg |
500
+ keyword_with_fname | keyword_modifier ;
501
+
502
+ constant = [A-Z] c_alnum*;
503
+ bareword = c_alpha c_alnum*;
504
+
505
+ call_or_var = c_lower c_alnum*;
506
+ class_var = '@@' bareword;
507
+ instance_var = '@' bareword;
508
+ global_var = '$'
509
+ ( bareword | digit+
510
+ | [`'+~*$&?!@/\\;,.=:<>"] # `
511
+ | '-' [A-Za-z0-9_]?
512
+ )
513
+ ;
514
+
515
+ # Ruby accepts (and fails on) variables with a leading digit
516
+ # in literal context, but not in an unquoted symbol body.
517
+ class_var_v = '@@' [0-9]? bareword;
518
+ instance_var_v = '@' [0-9]? bareword;
519
+
520
+ #
521
+ # === ESCAPE SEQUENCE PARSING ===
522
+ #
523
+
524
+ # Escape parsing code is a Ragel pattern, not a scanner, and therefore
525
+ # it shouldn't directly raise errors or perform other actions with side effects.
526
+ # In reality this would probably just mess up error reporting in pathological
527
+ # cases, though.
528
+
529
+ # The amount of code required to parse \M\C stuff correctly is ridiculous.
530
+
531
+ escaped_nl = "\\" c_nl;
532
+
533
+ action unicode_points {
534
+ @escape = ""
535
+
536
+ codepoints = tok(@escape_s + 2, p - 1)
537
+ codepoints.split(/[ \t]/).each do |codepoint_str|
538
+ codepoint = codepoint_str.to_i(16)
539
+
540
+ if codepoint >= 0x110000
541
+ @escape = lambda { error "invalid Unicode codepoint (too large)" }
542
+ break
543
+ end
544
+
545
+ @escape += codepoint.chr(Encoding::UTF_8)
546
+ end
547
+ }
548
+
549
+ action unescape_char {
550
+ @escape = {
551
+ 'a' => "\a", 'b' => "\b", 'e' => "\e", 'f' => "\f",
552
+ 'n' => "\n", 'r' => "\r", 's' => "\s", 't' => "\t",
553
+ 'v' => "\v", '\\' => "\\"
554
+ }.fetch(@source[p - 1], @source[p - 1])
555
+ }
556
+
557
+ action invalid_complex_escape {
558
+ @escape = lambda { error "invalid escape character syntax" }
559
+ }
560
+
561
+ action slash_c_char {
562
+ @escape = (@escape.ord & 0x9f).chr
563
+ }
564
+
565
+ action slash_m_char {
566
+ @escape = (@escape.ord | 0x80).chr
567
+ }
568
+
569
+ maybe_escaped_char = (
570
+ '\\' c_any %unescape_char
571
+ | ( c_any - [\\] ) % { @escape = @source[p - 1] }
572
+ );
573
+
574
+ maybe_escaped_ctrl_char = ( # why?!
575
+ '\\' c_any %unescape_char %slash_c_char
576
+ | '?' % { @escape = "\x7f" }
577
+ | ( c_any - [\\?] ) % { @escape = @source[p - 1] } %slash_c_char
578
+ );
579
+
580
+ escape = (
581
+ # \377
582
+ [0-7]{1,3}
583
+ % { @escape = tok(@escape_s, p).to_i(8).chr }
584
+
585
+ # \xff
586
+ | ( 'x' xdigit{1,2}
587
+ % { @escape = tok(@escape_s + 1, p).to_i(16).chr }
588
+ # \u263a
589
+ | 'u' xdigit{4}
590
+ % { @escape = tok(@escape_s + 1, p).to_i(16).chr(Encoding::UTF_8) }
591
+ )
592
+
593
+ # %q[\x]
594
+ | 'x' ( c_any - xdigit )
595
+ % { @escape = lambda { error "invalid hex escape" } }
596
+
597
+ # %q[\u123] %q[\u{12]
598
+ | 'u' ( c_any{0,4} -
599
+ xdigit{4} - # \u1234 is valid
600
+ ( '{' xdigit{1,3} # \u{1 \u{12 \u{123 are valid
601
+ | '{' xdigit [ \t}] # \u{1. \u{1} are valid
602
+ | '{' xdigit{2} [ \t}] # \u{12. \u{12} are valid
603
+ )
604
+ )
605
+ % { @escape = lambda { error "invalid Unicode escape" } }
606
+
607
+ # \u{123 456}
608
+ | 'u{' ( xdigit{1,6} [ \t] )*
609
+ ( xdigit{1,6} '}'
610
+ %unicode_points
611
+ | ( xdigit* ( c_any - xdigit - '}' )+ '}'
612
+ | ( c_any - '}' )* c_eof
613
+ | xdigit{7,}
614
+ ) % { @escape = lambda { error "unterminated Unicode escape" } }
615
+ )
616
+
617
+ # \C-\a \cx
618
+ | ( 'C-' | 'c' ) escaped_nl?
619
+ maybe_escaped_ctrl_char
620
+
621
+ # \M-a
622
+ | 'M-' escaped_nl?
623
+ maybe_escaped_char
624
+ %slash_m_char
625
+
626
+ # \C-\M-f \M-\cf \c\M-f
627
+ | ( ( 'C-' | 'c' ) escaped_nl? '\\M-'
628
+ | 'M-\\' escaped_nl? ( 'C-' | 'c' ) ) escaped_nl?
629
+ maybe_escaped_ctrl_char
630
+ %slash_m_char
631
+
632
+ | 'C' c_any %invalid_complex_escape
633
+ | 'M' c_any %invalid_complex_escape
634
+ | ( 'M-\\C' | 'C-\\M' | 'cM' ) c_any %invalid_complex_escape
635
+
636
+ | ( c_any - [0-7xuCMc] ) %unescape_char
637
+
638
+ | c_eof % { error "escape sequence meets end of file" }
639
+ );
640
+
641
+ # Use rules in form of `e_bs escape' when you need to parse a sequence.
642
+ e_bs = '\\' % {
643
+ @escape_s = p
644
+ @escape = nil
645
+ };
646
+
647
+ #
648
+ # === STRING AND HEREDOC PARSING ===
649
+ #
650
+
651
+ # Heredoc parsing is quite a complex topic. First, consider that heredocs
652
+ # can be arbitrarily nested. For example:
653
+ #
654
+ # puts <<CODE
655
+ # the result is: #{<<RESULT.inspect
656
+ # i am a heredoc
657
+ # RESULT
658
+ # }
659
+ # CODE
660
+ #
661
+ # which, incidentally, evaluates to:
662
+ #
663
+ # the result is: " i am a heredoc\n"
664
+ #
665
+ # To parse them, the lexer refers to two kinds (remember, nested heredocs)
666
+ # of positions in the input stream, namely @heredoc_e
667
+ # (HEREDOC declaration End) and @herebody_s (HEREdoc BODY line Start).
668
+ #
669
+ # @heredoc_e is simply contained inside the corresponding LexerLiteral, and
670
+ # when the heredoc is closed, the lexing is restarted from that position.
671
+ #
672
+ # @herebody_s is rather more complex. First, @herebody_s changes after each
673
+ # heredoc line is lexed. This way, at '\n' tok(@herebody_s, @te) always
674
+ # contains the current line, and also when a heredoc is started, @herebody_s
675
+ # contains the position from which the heredoc will be lexed.
676
+ #
677
+ # Second, as (insanity) there are nested heredocs, we need to maintain a
678
+ # stack of these positions. Each time #push_literal is called, it saves the current
679
+ # @herebody_s to literal.saved_herebody_s, and after an interpolation (possibly
680
+ # containing other heredocs) is closed, the previous value is restored.
681
+
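A short, runnable illustration (editorial, plain Ruby semantics) of why the lexer has to jump around: the heredoc body lives below the line that starts it, so after the terminator is found, lexing of the rest of that first line resumes from the remembered position.

    greeting = <<-END + "!"
      hello
    END
    # the heredoc body and the trailing + "!" are lexed from different
    # positions in the stream; that is what @heredoc_e and @herebody_s track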
682
+ e_heredoc_nl = c_nl $ {
683
+ # After every heredoc has been parsed, @herebody_s contains the
684
+ # position of the next token after all heredocs.
685
+ if @herebody_s
686
+ p = @herebody_s
687
+ @herebody_s = nil
688
+ end
689
+ };
690
+
691
+ action extend_string {
692
+ if literal.nest_and_try_closing tok, @ts, @te
693
+ fgoto *pop_literal;
694
+ else
695
+ literal.extend_string tok, @ts, @te
696
+ end
697
+ }
698
+
699
+ action extend_string_escaped {
700
+ if literal.nest_and_try_closing('\\', @ts, @ts + 1)
701
+ # If the literal is actually closed by the backslash,
702
+ # rewind the input prior to consuming the escape sequence.
703
+ p = @escape_s - 1
704
+ fgoto *pop_literal;
705
+ else
706
+ # Get the first character after the backslash.
707
+ escaped_char = @source[@escape_s]
708
+
709
+ if literal.munge_escape? escaped_char
710
+ # If this particular literal uses this character as an opening
711
+ # or closing delimiter, it is an escape sequence for that
712
+ # particular character. Write it without the backslash.
713
+
714
+ if literal.regexp?
715
+ # Regular expressions should have every escape sequence in its
716
+ # raw form.
717
+ literal.extend_string(tok, @ts, @te)
718
+ else
719
+ literal.extend_string(escaped_char, @ts, @te)
720
+ end
721
+ else
722
+ # It does not. So this is an actual escape sequence, yay!
723
+ # Two things to consider here.
724
+ #
725
+ # 1. The `escape' rule should be pure and so won't raise any
726
+ # errors by itself. Instead, it stores them in lambdas.
727
+ #
728
+ # 2. Non-interpolated literals do not go through the aforementioned
729
+ # rule. As \\ and \' (and variants) are munged, the full token
730
+ # should always be written for such literals.
731
+
732
+ @escape.call if @escape.respond_to? :call
733
+
734
+ if literal.regexp?
735
+ # Ditto. Also, expand escaped newlines.
736
+ literal.extend_string(tok.gsub("\\\n", ''), @ts, @te)
737
+ else
738
+ literal.extend_string(@escape || tok, @ts, @te)
739
+ end
740
+ end
741
+ end
742
+ }
743
+
744
+ # Extend a string with a newline or an EOF character.
745
+ # As a heredoc closing line can immediately precede EOF, this action
746
+ # has to handle that case specially.
747
+ action extend_string_eol {
748
+ is_eof = eof_char? @source[p]
749
+
750
+ if literal.heredoc?
751
+ # Try ending the heredoc with the complete most recently
752
+ # scanned line. @herebody_s always refers to the start of such line.
753
+ if literal.nest_and_try_closing(tok(@herebody_s, @te - 1),
754
+ @herebody_s, @te - 1)
755
+ # Adjust @herebody_s to point to the next line.
756
+ @herebody_s = @te
757
+
758
+ # Continue regular lexing after the heredoc reference (<<END).
759
+ p = literal.heredoc_e - 1
760
+ fgoto *pop_literal;
761
+ else
762
+ # Ditto.
763
+ @herebody_s = @te
764
+ end
765
+ end
766
+
767
+ if is_eof
768
+ error "unterminated string meets end of file"
769
+ end
770
+
771
+ # A literal newline is appended if the heredoc was _not_ closed
772
+ # this time. See also LexerLiteral#nest_and_try_closing for rationale of
773
+ # calling #flush_string here.
774
+ literal.extend_string tok, @ts, @te
775
+ literal.flush_string
776
+ }
777
+
778
+ #
779
+ # === INTERPOLATION PARSING ===
780
+ #
781
+
782
+ # Interpolations with immediate variable names simply call into
783
+ # the corresponding machine.
784
+
785
+ interp_var =
786
+ '#' ( global_var | class_var_v | instance_var_v );
787
+
788
+ action extend_interp_var {
789
+ literal.flush_string
790
+ emit(:tSTRING_DVAR, nil, @ts, @ts + 1)
791
+
792
+ p = @ts
793
+ fcall expr_variable;
794
+ }
795
+
796
+ # Interpolations with code blocks must match nested curly braces, as
797
+ # interpolation ending is ambiguous with a block ending. So, every
798
+ # opening and closing brace should be matched with e_[lr]brace rules,
799
+ # which automatically perform the counting.
800
+ #
801
+ # Note that interpolations can themselves be nested, so brace balance
802
+ # is tied to the innermost literal.
803
+ #
804
+ # Also note that literals themselves should not use e_[lr]brace rules
805
+ # when matching their opening and closing delimiters, as the number of
806
+ # braces occurring inside a string literal's contents is unrelated.
807
+
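An illustrative one-liner (editorial, not from the diff) of why braces must be counted inside `#{}`: the first `}` below closes the hash, not the interpolation.

    "#{ {:answer => 42}.size } key(s)"   # => "1 key(s)"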
808
+ interp_code = '#{';
809
+
810
+ e_lbrace = '{' % {
811
+ if literal
812
+ literal.start_interp_brace
813
+ end
814
+ };
815
+
816
+ e_rbrace = '}' % {
817
+ if literal
818
+ if literal.end_interp_brace_and_try_closing
819
+ emit(:tRCURLY, '}')
820
+
821
+ if literal.words?
822
+ emit(:tSPACE, nil)
823
+ end
824
+
825
+ if literal.saved_herebody_s
826
+ @herebody_s = literal.saved_herebody_s
827
+ end
828
+
829
+ fhold;
830
+ fnext *@stack.pop;
831
+ fbreak;
832
+ end
833
+ end
834
+ };
835
+
836
+ action extend_interp_code {
837
+ literal.flush_string
838
+ emit(:tSTRING_DBEG, '#{')
839
+
840
+ literal.saved_herebody_s = @herebody_s
841
+ @herebody_s = nil
842
+
843
+ literal.start_interp_brace
844
+ fcall expr_beg;
845
+ }
846
+
847
+ # Actual string parsers are simply combined from the primitives defined
848
+ # above.
849
+
850
+ interp_words := |*
851
+ interp_code => extend_interp_code;
852
+ interp_var => extend_interp_var;
853
+ e_bs escape => extend_string_escaped;
854
+ c_space_nl => { literal.flush_string };
855
+ c_eol => extend_string_eol;
856
+ c_any => extend_string;
857
+ *|;
858
+
859
+ interp_string := |*
860
+ interp_code => extend_interp_code;
861
+ interp_var => extend_interp_var;
862
+ e_bs escape => extend_string_escaped;
863
+ c_eol => extend_string_eol;
864
+ c_any => extend_string;
865
+ *|;
866
+
867
+ plain_words := |*
868
+ e_bs c_any => extend_string_escaped;
869
+ c_space_nl => { literal.flush_string };
870
+ c_eol => extend_string_eol;
871
+ c_any => extend_string;
872
+ *|;
873
+
874
+ plain_string := |*
875
+ e_bs c_any => extend_string_escaped;
876
+ c_eol => extend_string_eol;
877
+ c_any => extend_string;
878
+ *|;
879
+
880
+ regexp_modifiers := |*
881
+ [A-Za-z]+
882
+ => {
883
+ unknown_options = tok.scan(/[^imxouesn]/)
884
+ if unknown_options.any?
885
+ error "unknown regexp options: #{unknown_options.join}"
886
+ end
887
+
888
+ emit(:tREGEXP_OPT)
889
+ fgoto expr_end;
890
+ };
891
+
892
+ any
893
+ => {
894
+ emit(:tREGEXP_OPT, tok(@ts, @te - 1), @ts, @te - 1)
895
+ fhold; fgoto expr_end;
896
+ };
897
+ *|;
898
+
899
+ #
900
+ # === EXPRESSION PARSING ===
901
+ #
902
+
903
+ # These rules implement a form of manually defined lookahead.
904
+ # The default longest-match scanning does not work here due
905
+ # to sheer ambiguity.
906
+
907
+ ambiguous_ident_suffix = # actual parsed
908
+ [?!=] %{ tm = p } | # a? a?
909
+ '==' %{ tm = p - 2 } | # a==b a == b
910
+ '=~' %{ tm = p - 2 } | # a=~b a =~ b
911
+ '=>' %{ tm = p - 2 } | # a=>b a => b
912
+ '===' %{ tm = p - 3 } # a===b a === b
913
+ ;
914
+
915
+ ambiguous_symbol_suffix = # actual parsed
916
+ ambiguous_ident_suffix |
917
+ '==>' %{ tm = p - 2 } # :a==>b :a= => b
918
+ ;
919
+
920
+ # Ambiguous with 1.9 hash labels.
921
+ ambiguous_const_suffix = # actual parsed
922
+ '::' %{ tm = p - 2 } # A::B A :: B
923
+ ;
924
+
925
+ # Ruby 1.9 lambdas require parentheses counting in order to
926
+ # emit correct opening kDO/tLBRACE.
927
+
928
+ e_lparen = '(' % {
929
+ @paren_nest += 1
930
+ };
931
+
932
+ e_rparen = ')' % {
933
+ @paren_nest -= 1
934
+ };
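An editorial sketch of what `@paren_nest`/`@lambda_stack` disambiguate: the `{` that follows a `->(...)` must open the lambda body (tLAMBEG), while an ordinary block brace stays tLCURLY.

    double = ->(x) { x * 2 }       # `{` after ->() opens the lambda body
    [1, 2].map { |x| x * 2 }       # `{` after .map opens a regular block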
935
+
936
+ # Variable lexing code is accessed from both expressions and
937
+ # string interpolation related code.
938
+ #
939
+ expr_variable := |*
940
+ global_var
941
+ => {
942
+ if tok =~ /^\$([1-9][0-9]*)$/
943
+ emit(:tNTH_REF, $1.to_i)
944
+ elsif tok =~ /^\$([&`'+])$/
945
+ emit(:tBACK_REF, $1.to_sym)
946
+ else
947
+ emit(:tGVAR)
948
+ end
949
+
950
+ fnext *@stack.pop; fbreak;
951
+ };
952
+
953
+ class_var_v
954
+ => {
955
+ error "`#{tok}' is not allowed as a class variable name" if tok =~ /^@@[0-9]/
956
+
957
+ emit(:tCVAR)
958
+ fnext *@stack.pop; fbreak;
959
+ };
960
+
961
+ instance_var_v
962
+ => {
963
+ error "`#{tok}' is not allowed as an instance variable name" if tok =~ /^@[0-9]/
964
+
965
+ emit(:tIVAR)
966
+ fnext *@stack.pop; fbreak;
967
+ };
968
+ *|;
969
+
970
+ # Literal function name in definition (e.g. `def class`).
971
+ # Keywords are returned as their respective tokens; this is used
972
+ # to support singleton def `def self.foo`. Global variables are
973
+ # returned as `tGVAR`; this is used in global variable alias
974
+ # statements `alias $a $b`. Symbols are returned verbatim; this
975
+ # is used in `alias :a :"b#{foo}"` and `undef :a`.
976
+ #
977
+ # Transitions to `expr_end` afterwards.
978
+ #
979
+ expr_fname := |*
980
+ keyword
981
+ => { emit(KEYWORDS[tok]);
982
+ fnext expr_end; fbreak; };
983
+
984
+ bareword
985
+ => { emit(:tIDENTIFIER)
986
+ fnext expr_end; fbreak; };
987
+
988
+ bareword ambiguous_ident_suffix
989
+ => { emit(:tIDENTIFIER, tok(@ts, tm), @ts, tm)
990
+ fnext expr_end; p = tm - 1; fbreak; };
991
+
992
+ operator_fname |
993
+ operator_arithmetic |
994
+ operator_rest
995
+ => { emit_table(PUNCTUATION)
996
+ fnext expr_end; fbreak; };
997
+
998
+ ':'
999
+ => { fhold; fgoto expr_end; };
1000
+
1001
+ global_var
1002
+ => { emit(:tGVAR)
1003
+ fbreak; };
1004
+
1005
+ c_space_nl+;
1006
+
1007
+ c_any
1008
+ => { fhold; fgoto expr_end; };
1009
+
1010
+ c_eof => do_eof;
1011
+ *|;
1012
+
1013
+ # Literal function name in method call (e.g. `a.class`).
1014
+ #
1015
+ # Transitions to `expr_arg` afterwards.
1016
+ #
1017
+ expr_dot := |*
1018
+ bareword
1019
+ => { emit(:tIDENTIFIER)
1020
+ fnext expr_arg; fbreak; };
1021
+
1022
+ bareword ambiguous_ident_suffix
1023
+ => { emit(:tIDENTIFIER, tok(@ts, tm), @ts, tm)
1024
+ fnext expr_arg; p = tm - 1; fbreak; };
1025
+
1026
+ operator_fname |
1027
+ operator_arithmetic |
1028
+ operator_rest
1029
+ => { emit_table(PUNCTUATION)
1030
+ fnext expr_arg; fbreak; };
1031
+
1032
+ c_space_nl+;
1033
+
1034
+ c_any
1035
+ => { fhold; fgoto expr_end; };
1036
+
1037
+ c_eof => do_eof;
1038
+ *|;
1039
+
1040
+ # The previous token emitted was a `tIDENTIFIER` or `tFID`; no space
1041
+ # is consumed; the current expression is a command or method call.
1042
+ #
1043
+ expr_arg := |*
1044
+ #
1045
+ # COMMAND MODE SPECIFIC TOKENS
1046
+ #
1047
+
1048
+ # cmd (1 + 2)
1049
+ # See below the rationale about expr_endarg.
1050
+ c_space+ e_lparen
1051
+ => { emit(:tLPAREN_ARG, '(', @te - 1, @te)
1052
+ fnext expr_beg; fbreak; };
1053
+
1054
+ # meth(1 + 2)
1055
+ # Regular method call.
1056
+ e_lparen
1057
+ => { emit(:tLPAREN2)
1058
+ fnext expr_beg; fbreak; };
1059
+
1060
+ # meth [...]
1061
+ # Array argument. Compare with indexing `meth[...]`.
1062
+ c_space+ '['
1063
+ => { emit(:tLBRACK, '[', @te - 1, @te);
1064
+ fnext expr_beg; fbreak; };
1065
+
1066
+ # cmd {}
1067
+ # Command: method call without parentheses.
1068
+ c_space* e_lbrace
1069
+ => {
1070
+ if @lambda_stack.last == @paren_nest
1071
+ p = @ts - 1
1072
+ fgoto expr_end;
1073
+ else
1074
+ emit(:tLCURLY, '{', @te - 1, @te)
1075
+ fnext expr_value; fbreak;
1076
+ end
1077
+ };
1078
+
1079
+ # a.b
1080
+ # Dot-call.
1081
+ '.' | '::'
1082
+ => { emit_table(PUNCTUATION);
1083
+ fnext expr_dot; fbreak; };
1084
+
1085
+ #
1086
+ # AMBIGUOUS TOKENS RESOLVED VIA EXPR_BEG
1087
+ #
1088
+
1089
+ # a ?b
1090
+ # Character literal.
1091
+ c_space+ '?'
1092
+ => { fhold; fgoto expr_beg; };
1093
+
1094
+ # x +1
1095
+ # Ambiguous unary operator or regexp literal.
1096
+ c_space+ [+\-/]
1097
+ => {
1098
+ warning "ambiguous first argument; put parentheses or even spaces", @te - 1, @te
1099
+ fhold; fhold; fgoto expr_beg;
1100
+ };
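A runnable illustration (editorial, assuming standard MRI behaviour) of the ambiguity the warning above refers to: with a space before the sign and none after it, the sign is taken as a unary operator on the argument.

    def x(arg = 10)
      arg
    end

    x +1    # parsed as x(+1)   => 1, with an "ambiguous first argument" warning
    x + 1   # parsed as x() + 1 => 11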
1101
+
1102
+ # x *1
1103
+ # Ambiguous splat or block-pass.
1104
+ c_space+ [*&]
1105
+ => {
1106
+ what = tok(@te - 1, @te)
1107
+ warning "`#{what}' interpreted as argument prefix", @te - 1, @te
1108
+ fhold; fgoto expr_beg;
1109
+ };
1110
+
1111
+ #
1112
+ # AMBIGUOUS TOKENS RESOLVED VIA EXPR_END
1113
+ #
1114
+
1115
+ # a ? b
1116
+ # Ternary operator.
1117
+ c_space+ '?' c_space_nl
1118
+ => { fhold; fhold; fgoto expr_end; };
1119
+
1120
+ # x + 1: Binary operator or operator-assignment.
1121
+ c_space* operator_arithmetic
1122
+ ( '=' | c_space_nl )? |
1123
+ # x rescue y: Modifier keyword.
1124
+ c_space+ keyword_modifier |
1125
+ # Miscellanea.
1126
+ c_space* punctuation_end
1127
+ => {
1128
+ p = @ts - 1
1129
+ fgoto expr_end;
1130
+ };
1131
+
1132
+ c_space* c_nl
1133
+ => { fhold; fgoto expr_end; };
1134
+
1135
+ c_any
1136
+ => { fhold; fgoto expr_beg; };
1137
+
1138
+ c_eof => do_eof;
1139
+ *|;
1140
+
1141
+ # The rationale for this state is pretty complex. Normally, if an argument
1142
+ # is passed to a command and then there is a block (tLCURLY...tRCURLY),
1143
+ # the block is attached to the innermost argument (`f` in `m f {}`), or it
1144
+ # is a parse error (`m 1 {}`). But there is a special case for passing a single
1145
+ # primary expression grouped with parentheses: if you write `m (1) {}` or
1146
+ # (2.0 only) `m () {}`, then the block is attached to `m`.
1147
+ #
1148
+ # Thus, we recognize the opening `(` of a command (remember, a command is
1149
+ # a method call without parens) as a tLPAREN_ARG; then, in parser, we recognize
1150
+ # `tLPAREN_ARG expr rparen` as a `primary_expr` and before rparen, set the
1151
+ # lexer's state to `expr_endarg`, which makes it emit the possibly following
1152
+ # `{` as `tLBRACE_ARG`.
1153
+ #
1154
+ # The default post-`expr_endarg` state is `expr_end`, so this state also handles
1155
+ # `do` (as `kDO_BLOCK` in `expr_beg`). (I have no clue why the parser cannot
1156
+ # just handle `kDO`.)
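A runnable sketch (editorial) of the special case described above: with the argument in parentheses the block binds to `m`, while an unparenthesized literal argument followed by a brace block is a parse error.

    def m(x)
      block_given? ? yield(x) : x
    end

    m (1) { |v| v + 1 }    # => 2; the block is attached to `m`
    # m 1 { |v| v + 1 }    # syntax error, as noted above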
1157
+ expr_endarg := |*
1158
+ e_lbrace
1159
+ => { emit(:tLBRACE_ARG)
1160
+ fnext expr_value; };
1161
+
1162
+ 'do'
1163
+ => { emit(:kDO_BLOCK)
1164
+ fnext expr_value; };
1165
+
1166
+ c_space*;
1167
+
1168
+ c_any
1169
+ => { fhold; fgoto expr_end; };
1170
+
1171
+ c_eof => do_eof;
1172
+ *|;
1173
+
1174
+ # The rationale for this state is that several keywords accept a value
1175
+ # (i.e. should transition to `expr_beg`), do not accept it like a command
1176
+ # (i.e. not an `expr_arg`), and must behave like a statement, that is,
1177
+ # accept a modifier if/while/etc.
1178
+ #
1179
+ expr_mid := |*
1180
+ keyword_modifier
1181
+ => { emit_table(KEYWORDS)
1182
+ fnext expr_beg; fbreak; };
1183
+
1184
+ c_space+;
1185
+
1186
+ c_nl
1187
+ => { fhold; fgoto expr_end; };
1188
+
1189
+ c_any
1190
+ => { fhold; fgoto expr_beg; };
1191
+
1192
+ c_eof => do_eof;
1193
+ *|;
1194
+
1195
+ # Beginning of an expression.
1196
+ #
1197
+ # Don't fall through to this state from `c_any`; make sure to handle
1198
+ # `c_space* c_nl` and let `expr_end` handle the newline.
1199
+ # Otherwise code like `f\ndef x` gets glued together and the parser
1200
+ # explodes.
1201
+ #
1202
+ expr_beg := |*
1203
+ # Numeric processing. Converts:
1204
+ # +5 to [tINTEGER, 5]
1205
+ # -5 to [tUMINUS_NUM] [tINTEGER, 5]
1206
+ [+\-][0-9]
1207
+ => {
1208
+ fhold;
1209
+ if tok.start_with? '-'
1210
+ emit(:tUMINUS_NUM, '-')
1211
+ fnext expr_end; fbreak;
1212
+ end
1213
+ };
1214
+
1215
+ # splat *a
1216
+ '*'
1217
+ => { emit(:tSTAR)
1218
+ fbreak; };
1219
+
1220
+ #
1221
+ # STRING AND REGEXP LITERALS
1222
+ #
1223
+
1224
+ # a / 42
1225
+ # a % 42
1226
+ # a %= 42 (disambiguation with %=string=)
1227
+ [/%] c_space_nl | '%=' # /
1228
+ => {
1229
+ fhold; fhold;
1230
+ fgoto expr_end;
1231
+ };
1232
+
1233
+ # /regexp/oui
1234
+ '/'
1235
+ => {
1236
+ type, delimiter = tok, tok
1237
+ fgoto *push_literal(type, delimiter, @ts);
1238
+ };
1239
+
1240
+ # %<string>
1241
+ '%' ( c_any - [A-Za-z] )
1242
+ => {
1243
+ type, delimiter = tok[0], tok[-1]
1244
+ fgoto *push_literal(type, delimiter, @ts);
1245
+ };
1246
+
1247
+ # %w(we are the people)
1248
+ '%' [A-Za-z]+ c_any
1249
+ => {
1250
+ type, delimiter = tok[0..-2], tok[-1]
1251
+ fgoto *push_literal(type, delimiter, @ts);
1252
+ };
1253
+
1254
+ '%' c_eof
1255
+ => {
1256
+ error "unterminated string meets end of file"
1257
+ };
1258
+
1259
+ # Heredoc start.
1260
+ # <<EOF | <<-END | <<"FOOBAR" | <<-`SMTH`
1261
+ '<<' '-'?
1262
+ ( '"' ( c_any - c_nl - '"' )* '"'
1263
+ | "'" ( c_any - c_nl - "'" )* "'"
1264
+ | "`" ( c_any - c_nl - "`" )* "`"
1265
+ | bareword ) % { @heredoc_e = p }
1266
+ ( c_any - c_nl )* c_nl % { new_herebody_s = p }
1267
+ => {
1268
+ tok(@ts, @heredoc_e) =~ /^<<(-?)(["'`]?)(.*)\2$/
1269
+
1270
+ indent = !$1.empty?
1271
+ type = $2.empty? ? '"' : $2
1272
+ delimiter = $3
1273
+
1274
+ fnext *push_literal(type, delimiter, @ts, @heredoc_e, indent);
1275
+
1276
+ if @herebody_s.nil?
1277
+ @herebody_s = new_herebody_s
1278
+ end
1279
+
1280
+ p = @herebody_s - 1
1281
+ };
1282
+
1283
+ #
1284
+ # AMBIGUOUS TERNARY OPERATOR
1285
+ #
1286
+
1287
+ '?' ( e_bs escape
1288
+ | c_any - c_space_nl - e_bs % { @escape = nil }
1289
+ )
1290
+ => {
1291
+ # Show an error if memorized.
1292
+ @escape.call if @escape.respond_to? :call
1293
+
1294
+ value = @escape || tok(@ts + 1)
1295
+
1296
+ if ruby18?
1297
+ emit(:tINTEGER, value.ord)
1298
+ else
1299
+ emit(:tSTRING, value)
1300
+ end
1301
+
1302
+ fbreak;
1303
+ };
1304
+
1305
+ '?' c_space_nl
1306
+ => {
1307
+ escape = { " " => '\s', "\r" => '\r', "\n" => '\n', "\t" => '\t',
1308
+ "\v" => '\v', "\f" => '\f' }[tok[@ts + 1]]
1309
+ warning "invalid character syntax; use ?#{escape}", @ts
1310
+
1311
+ p = @ts - 1
1312
+ fgoto expr_end;
1313
+ };
1314
+
1315
+ '?' c_eof
1316
+ => {
1317
+ error "incomplete character syntax"
1318
+ };
1319
+
1320
+ # f ?aa : b: Disambiguate with a character literal.
1321
+ '?' [A-Za-z_] bareword
1322
+ => {
1323
+ p = @ts - 1
1324
+ fgoto expr_end;
1325
+ };
1326
+
1327
+ #
1328
+ # KEYWORDS AND PUNCTUATION
1329
+ #
1330
+
1331
+ # a(+b)
1332
+ punctuation_begin |
1333
+ # a({b=>c})
1334
+ e_lbrace |
1335
+ # a()
1336
+ e_lparen
1337
+ => { emit_table(PUNCTUATION_BEGIN)
1338
+ fbreak; };
1339
+
1340
+ # rescue Exception => e: Block rescue.
1341
+ # Special because it should transition to expr_mid.
1342
+ 'rescue'
1343
+ => { emit_table(KEYWORDS_BEGIN)
1344
+ fnext expr_mid; fbreak; };
1345
+
1346
+ # if a: Statement if.
1347
+ keyword_modifier
1348
+ => { emit_table(KEYWORDS_BEGIN)
1349
+ fnext expr_value; fbreak; };
1350
+
1351
+ #
1352
+ # RUBY 1.9 HASH LABELS
1353
+ #
1354
+
1355
+ bareword ':' ( c_any - ':' )
1356
+ => {
1357
+ fhold;
1358
+
1359
+ if ruby18?
1360
+ emit(:tIDENTIFIER, tok(@ts, @te - 2), @ts, @te - 2)
1361
+ fhold; # continue as a symbol
1362
+ else
1363
+ emit(:tLABEL, tok(@ts, @te - 2), @ts, @te - 1)
1364
+ end
1365
+
1366
+ fbreak;
1367
+ };
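An editorial illustration of the two spellings this rule distinguishes:

    h1 = { :answer => 42 }   # `:answer` is a plain tSYMBOL (1.8 and 1.9)
    h2 = { answer: 42 }      # 1.9 only: `answer:` is emitted as tLABEL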
1368
+
1369
+ #
1370
+ # CONTEXT-DEPENDENT VARIABLE LOOKUP OR COMMAND INVOCATION
1371
+ #
1372
+
1373
+ # foo= bar: Disambiguate with bareword rule below.
1374
+ bareword ambiguous_ident_suffix |
1375
+ # def foo: Disambiguate with bareword rule below.
1376
+ keyword
1377
+ => { p = @ts - 1
1378
+ fgoto expr_end; };
1379
+
1380
+ # a = 42; a [42]: Indexing.
1381
+ # def a; end; a [42]: Array argument.
1382
+ call_or_var
1383
+ => {
1384
+ emit(:tIDENTIFIER)
1385
+
1386
+ if @static_env && @static_env.declared?(tok.to_sym)
1387
+ fgoto expr_end;
1388
+ else
1389
+ fgoto expr_arg;
1390
+ end
1391
+ };
1392
+
1393
+ c_space_nl+;
1394
+
1395
+ # The following rules match most binary and all unary operators.
1396
+ # Rules for binary operators provide better error reporting.
1397
+ operator_arithmetic '=' |
1398
+ operator_rest |
1399
+ punctuation_end |
1400
+ c_any
1401
+ => { p = @ts - 1; fgoto expr_end; };
1402
+
1403
+ c_eof => do_eof;
1404
+ *|;
1405
+
1406
+ # Like expr_beg, but no 1.9 label possible.
1407
+ #
1408
+ expr_value := |*
1409
+ # a:b: a(:b), a::B, A::B
1410
+ bareword ':'
1411
+ => { p = @ts - 1
1412
+ fgoto expr_end; };
1413
+
1414
+ c_space_nl+;
1415
+
1416
+ c_any
1417
+ => { fhold; fgoto expr_beg; };
1418
+
1419
+ c_eof => do_eof;
1420
+ *|;
1421
+
1422
+ expr_end := |*
1423
+ #
1424
+ # STABBY LAMBDA
1425
+ #
1426
+
1427
+ '->'
1428
+ => {
1429
+ emit_table(PUNCTUATION)
1430
+
1431
+ @lambda_stack.push @paren_nest
1432
+ fbreak;
1433
+ };
1434
+
1435
+ e_lbrace | 'do'
1436
+ => {
1437
+ if @lambda_stack.last == @paren_nest
1438
+ @lambda_stack.pop
1439
+
1440
+ if tok == '{'
1441
+ emit(:tLAMBEG)
1442
+ else
1443
+ emit(:kDO_LAMBDA)
1444
+ end
1445
+ else
1446
+ emit_table(PUNCTUATION)
1447
+ end
1448
+
1449
+ fnext expr_value; fbreak;
1450
+ };
1451
+
1452
+ #
1453
+ # KEYWORDS
1454
+ #
1455
+
1456
+ keyword_with_fname
1457
+ => { emit_table(KEYWORDS)
1458
+ fnext expr_fname; fbreak; };
1459
+
1460
+ 'class' c_space_nl '<<'
1461
+ => { emit(:kCLASS, 'class', @ts, @ts + 5)
1462
+ emit(:tLSHFT, '<<', @te - 2, @te)
1463
+ fnext expr_beg; fbreak; };
1464
+
1465
+ # a if b:c: Syntax error.
1466
+ keyword_modifier
1467
+ => { emit_table(KEYWORDS)
1468
+ fnext expr_beg; fbreak; };
1469
+
1470
+ # elsif b:c: elsif b(:c)
1471
+ keyword_with_value
1472
+ => { emit_table(KEYWORDS)
1473
+ fnext expr_value; fbreak; };
1474
+
1475
+ keyword_with_mid
1476
+ => { emit_table(KEYWORDS)
1477
+ fnext expr_mid; fbreak; };
1478
+
1479
+ keyword_with_arg
1480
+ => {
1481
+ emit_table(KEYWORDS)
1482
+
1483
+ if ruby18? && tok == 'not'
1484
+ fnext expr_beg; fbreak;
1485
+ else
1486
+ fnext expr_arg; fbreak;
1487
+ end
1488
+ };
1489
+
1490
+ keyword_with_end
1491
+ => { emit_table(KEYWORDS)
1492
+ fbreak; };
1493
+
1494
+ #
1495
+ # NUMERIC LITERALS
1496
+ #
1497
+
1498
+ ( '0' [Xx] %{ @num_base = 16; @num_digits_s = p }
1499
+ ( xdigit+ '_' )* xdigit* '_'?
1500
+ | '0' [Dd] %{ @num_base = 10; @num_digits_s = p }
1501
+ ( digit+ '_' )* digit* '_'?
1502
+ | '0' [Oo] %{ @num_base = 8; @num_digits_s = p }
1503
+ ( digit+ '_' )* digit* '_'?
1504
+ | '0' [Bb] %{ @num_base = 2; @num_digits_s = p }
1505
+ ( [01]+ '_' )* [01]* '_'?
1506
+ | [1-9] %{ @num_base = 10; @num_digits_s = @ts }
1507
+ ( '_' digit+ )* digit* '_'?
1508
+ | '0' %{ @num_base = 8; @num_digits_s = @ts }
1509
+ ( '_' digit+ )* digit* '_'?
1510
+ )
1511
+ => {
1512
+ digits = tok(@num_digits_s)
1513
+
1514
+ if digits.end_with? '_'
1515
+ error "trailing `_' in number"
1516
+ elsif digits.empty? && @num_base == 8 && ruby18?
1517
+ # 1.8 did not raise an error on 0o.
1518
+ digits = "0"
1519
+ elsif digits.empty?
1520
+ error "numeric literal without digits"
1521
+ elsif @num_base == 8 && digits =~ /[89]/
1522
+ error "invalid octal digit"
1523
+ end
1524
+
1525
+ emit(:tINTEGER, digits.to_i(@num_base))
1526
+ fbreak;
1527
+ };
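An editorial illustration of the integer forms accepted by the rule above and the base recorded in `@num_base`:

    0x1f     # => 31  (hexadecimal, @num_base = 16)
    0b10_1   # => 5   (binary; underscores between digits are allowed)
    017      # => 15  (a bare leading zero means octal)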
1528
+
1529
+ # Floating point literals cannot start with 0 except when a dot
1530
+ # follows immediately, probably to avoid confusion with octal literals.
1531
+ ( [1-9] [0-9]* ( '_' digit+ )* |
1532
+ '0'
1533
+ )?
1534
+ (
1535
+ '.' ( digit+ '_' )* digit+ |
1536
+ ( '.' ( digit+ '_' )* digit+ )? [eE] [+\-]? ( digit+ '_' )* digit+
1537
+ )
1538
+ => {
1539
+ if tok.start_with? '.'
1540
+ error "no .<digit> floating literal anymore; put 0 before dot"
1541
+ elsif tok =~ /^[eE]/
1542
+ # The rule above allows specifying floats as just `e10', which is
1543
+ # certainly not a float. Send a patch if you can do this better.
1544
+ emit(:tIDENTIFIER, tok)
1545
+ fbreak;
1546
+ end
1547
+
1548
+ emit(:tFLOAT, tok.to_f)
1549
+ fbreak;
1550
+ };
1551
+
1552
+ #
1553
+ # SYMBOL LITERALS
1554
+ #
1555
+
1556
+ # `echo foo` | :"bar" | :'baz'
1557
+ '`' | ':'? ['"] # '
1558
+ => {
1559
+ type, delimiter = tok, tok[-1]
1560
+ fgoto *push_literal(type, delimiter, @ts);
1561
+ };
1562
+
1563
+ ':' bareword ambiguous_symbol_suffix
1564
+ => { emit(:tSYMBOL, tok(@ts + 1, tm))
1565
+ p = tm - 1; fbreak; };
1566
+
1567
+ ':' ( bareword | global_var | class_var | instance_var |
1568
+ operator_fname | operator_arithmetic | operator_rest )
1569
+ => { emit(:tSYMBOL, tok(@ts + 1))
1570
+ fbreak; };
1571
+
1572
+ #
1573
+ # CONSTANTS AND VARIABLES
1574
+ #
1575
+
1576
+ constant
1577
+ => { emit(:tCONSTANT)
1578
+ fbreak; };
1579
+
1580
+ constant ambiguous_const_suffix
1581
+ => { emit(:tCONSTANT, tok(@ts, tm))
1582
+ p = tm - 1; fbreak; };
1583
+
1584
+ global_var | class_var_v | instance_var_v
1585
+ => { p = @ts - 1; fcall expr_variable; };
1586
+
1587
+ #
1588
+ # METHOD CALLS
1589
+ #
1590
+
1591
+ '.'
1592
+ => { emit_table(PUNCTUATION)
1593
+ fnext expr_dot; fbreak; };
1594
+
1595
+ call_or_var
1596
+ => { emit(:tIDENTIFIER)
1597
+ fnext expr_arg; fbreak; };
1598
+
1599
+ call_or_var [?!]
1600
+ => { emit(:tFID)
1601
+ fnext expr_arg; fbreak; };
1602
+
1603
+ #
1604
+ # OPERATORS
1605
+ #
1606
+
1607
+ ( e_lparen |
1608
+ operator_arithmetic |
1609
+ operator_rest
1610
+ ) %{ tm = p } c_space_nl*
1611
+ => { emit_table(PUNCTUATION, @ts, tm)
1612
+ fnext expr_beg; fbreak; };
1613
+
1614
+ e_rbrace | e_rparen | ']'
1615
+ => { emit_table(PUNCTUATION)
1616
+ fbreak; };
1617
+
1618
+ operator_arithmetic '='
1619
+ => { emit(:tOP_ASGN, tok(@ts, @te - 1))
1620
+ fnext expr_beg; fbreak; };
1621
+
1622
+ '?'
1623
+ => { emit_table(PUNCTUATION)
1624
+ fnext expr_value; fbreak; };
1625
+
1626
+ punctuation_end
1627
+ => { emit_table(PUNCTUATION)
1628
+ fnext expr_beg; fbreak; };
1629
+
1630
+ #
1631
+ # WHITESPACE
1632
+ #
1633
+
1634
+ '\\' e_heredoc_nl;
1635
+ '\\' ( any - c_nl ) {
1636
+ error "bare backslash only allowed before newline"
1637
+ };
1638
+
1639
+ '#' ( c_any - c_nl )*
1640
+ => { @comments << tok(@ts, @te + 1) };
1641
+
1642
+ e_heredoc_nl
1643
+ => { fgoto leading_dot; };
1644
+
1645
+ ';'
1646
+ => { emit_table(PUNCTUATION)
1647
+ fnext expr_value; fbreak; };
1648
+
1649
+ c_space+;
1650
+
1651
+ c_any
1652
+ => {
1653
+ error "unexpected #{tok.inspect}"
1654
+ };
1655
+
1656
+ c_eof => do_eof;
1657
+ *|;
1658
+
1659
+ leading_dot := |*
1660
+ # Insane leading dots:
1661
+ # a #comment
1662
+ # .b: a.b
1663
+ c_space* '.' ( c_any - '.' )
1664
+ => { fhold; fhold;
1665
+ fgoto expr_end; };
1666
+
1667
+ any
1668
+ => { emit(:tNL, nil, @newline_s, @newline_s + 1)
1669
+ fnext line_begin; fhold; fbreak; };
1670
+ *|;
1671
+
1672
+ #
1673
+ # === EMBEDDED DOCUMENT (aka BLOCK COMMENT) PARSING ===
1674
+ #
1675
+
1676
+ line_comment := |*
1677
+ '=end' c_line* c_nl
1678
+ => { @comments << tok
1679
+ fgoto line_begin; };
1680
+
1681
+ c_line* c_nl
1682
+ => { @comments << tok };
1683
+
1684
+ any
1685
+ => {
1686
+ @comments = ""
1687
+ error "embedded document meats end of file (and they embark on a romantic journey)"
1688
+ };
1689
+ *|;
1690
+
1691
+ line_begin := |*
1692
+ c_space_nl+;
1693
+
1694
+ '#' c_line* c_eol
1695
+ => { @comments << tok
1696
+ fhold; };
1697
+
1698
+ '=begin' ( c_space | c_eol )
1699
+ => { @comments << tok(@ts, @te)
1700
+ fgoto line_comment; };
1701
+
1702
+ '__END__' c_eol
1703
+ => { p = pe - 1 };
1704
+
1705
+ c_any
1706
+ => { fhold; fgoto expr_value; };
1707
+
1708
+ c_eof => do_eof;
1709
+ *|;
1710
+
1711
+ }%%
1712
+ # %
1713
+ end