parser 0.9.alpha → 0.9.alpha1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +21 -0
- data/.travis.yml +9 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +25 -0
- data/README.md +29 -0
- data/Rakefile +15 -182
- data/lib/parser.rb +4 -0
- data/lib/parser/lexer.rl +1713 -0
- data/lib/parser/lexer_literal.rb +175 -0
- data/lib/parser/static_environment.rb +38 -0
- data/lib/parser/syntax_error.rb +3 -0
- data/parser.gemspec +25 -0
- data/test/{test_ruby_lexer.rb → test_lexer.rb} +77 -129
- data/test/test_static_environment.rb +46 -0
- metadata +39 -98
- data/.autotest +0 -50
- data/.gemtest +0 -0
- data/History.txt +0 -558
- data/Manifest.txt +0 -18
- data/README.txt +0 -87
- data/bin/ruby_parse +0 -96
- data/bin/ruby_parse_extract_error +0 -130
- data/lib/gauntlet_rubyparser.rb +0 -117
- data/lib/ruby18_parser.rb +0 -5706
- data/lib/ruby18_parser.y +0 -1846
- data/lib/ruby19_parser.rb +0 -6054
- data/lib/ruby19_parser.y +0 -2035
- data/lib/ruby_lexer.rb +0 -6789
- data/lib/ruby_parser.rb +0 -4
- data/lib/ruby_parser_extras.rb +0 -1148
- data/test/test_ruby_parser.rb +0 -1772
- data/test/test_ruby_parser_extras.rb +0 -228
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 384df81635da81957880f54cb589109db642c914
|
4
|
+
data.tar.gz: 101c991d44683e9ba699a3ec3deca74572fb7a09
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0724f1d86bbe49d1aa390c5bea5e3e6c859850b59be9ef0b78623f23cbe97cae94f75de5bc961cbdad1aabe6a1bb166e63dcac4103bb211a41c19ae86d8d3624
|
7
|
+
data.tar.gz: 1c64825e1d3a58b00d1a7038b599e161025b516435cde204e325061b1800416813f8ef2ab154f5b5dc99a1dc9cb1d11a9dfa33adb1e2e35f756c3fa10a9f5960
|
data/.gitignore
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.bundle
|
4
|
+
.config
|
5
|
+
.yardoc
|
6
|
+
Gemfile.lock
|
7
|
+
InstalledFiles
|
8
|
+
_yardoc
|
9
|
+
coverage
|
10
|
+
doc/
|
11
|
+
lib/bundler/man
|
12
|
+
pkg
|
13
|
+
rdoc
|
14
|
+
spec/reports
|
15
|
+
test/tmp
|
16
|
+
test/version_tmp
|
17
|
+
tmp
|
18
|
+
*.output
|
19
|
+
lib/parser/lexer.rb
|
20
|
+
lib/parser/ruby18.rb
|
21
|
+
lib/parser/ruby19.rb
|
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
Copyright (c) 2013 Peter Zotov <whitequark@whitequark.org>
|
2
|
+
|
3
|
+
Parts of the source are derived from ruby_parser:
|
4
|
+
Copyright (c) Ryan Davis, seattle.rb
|
5
|
+
|
6
|
+
MIT License
|
7
|
+
|
8
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
9
|
+
a copy of this software and associated documentation files (the
|
10
|
+
"Software"), to deal in the Software without restriction, including
|
11
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
12
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
13
|
+
permit persons to whom the Software is furnished to do so, subject to
|
14
|
+
the following conditions:
|
15
|
+
|
16
|
+
The above copyright notice and this permission notice shall be
|
17
|
+
included in all copies or substantial portions of the Software.
|
18
|
+
|
19
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
20
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
21
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
22
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
23
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
24
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
25
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Parser
|
2
|
+
|
3
|
+
[![Build Status](https://travis-ci.org/whitequark/parser.png?branch=master)](https://travis-ci.org/whitequark/parser)
|
4
|
+
[![Code Climate](https://codeclimate.com/github/whitequark/parser.png)](https://codeclimate.com/github/whitequark/parser)
|
5
|
+
|
6
|
+
Parser is a Ruby parser written in pure Ruby.
|
7
|
+
|
8
|
+
## Installation
|
9
|
+
|
10
|
+
$ gem install parser
|
11
|
+
|
12
|
+
## Usage
|
13
|
+
|
14
|
+
TODO: Write usage instructions here
|
15
|
+
|
16
|
+
## Acknowledgements
|
17
|
+
|
18
|
+
The lexer testsuite is derived from [ruby_parser](http://github.com/seattlerb/ruby_parser).
|
19
|
+
|
20
|
+
The Bison parser rules are derived from [Ruby MRI](http://github.com/ruby/ruby) parse.y.
|
21
|
+
|
22
|
+
## Contributing
|
23
|
+
|
24
|
+
1. Make sure you have [Ragel 6.8](http://www.complang.org/ragel/) installed
|
25
|
+
2. Fork it
|
26
|
+
3. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
4. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
+
5. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
6. Create new Pull Request
|
data/Rakefile
CHANGED
@@ -1,192 +1,25 @@
|
|
1
|
-
|
1
|
+
require "bundler/gem_tasks"
|
2
2
|
|
3
|
-
|
4
|
-
require 'hoe'
|
3
|
+
task :default => [:generate, :test]
|
5
4
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
Hoe.add_include_dirs "../../sexp_processor/dev/lib"
|
11
|
-
|
12
|
-
Hoe.spec 'parser' do
|
13
|
-
developer 'Peter Zotov', 'whitequark@whitequark.org'
|
14
|
-
|
15
|
-
dependency 'sexp_processor', '~> 4.1'
|
16
|
-
|
17
|
-
self.racc_flags << " -t" if plugin?(:racc) && ENV["DEBUG"]
|
18
|
-
end
|
19
|
-
|
20
|
-
file "lib/ruby18_parser.rb" => "lib/ruby18_parser.y"
|
21
|
-
file "lib/ruby19_parser.rb" => "lib/ruby19_parser.y"
|
22
|
-
|
23
|
-
file "lib/ruby_lexer.rb" => "lib/ruby_lexer.rl" do |t|
|
24
|
-
sh "ragel -R #{t.prerequisites.first} -o #{t.name}"
|
25
|
-
end
|
26
|
-
|
27
|
-
task :clean do
|
28
|
-
rm_rf(Dir["**/*~"] +
|
29
|
-
Dir["**/*.diff"] +
|
30
|
-
Dir["coverage.info"] +
|
31
|
-
Dir["coverage"] +
|
32
|
-
Dir["lib/*.output"])
|
33
|
-
end
|
34
|
-
|
35
|
-
def next_num(glob)
|
36
|
-
num = Dir[glob].max[/\d+/].to_i + 1
|
37
|
-
end
|
38
|
-
|
39
|
-
desc "Compares PT to RP and deletes all files that match"
|
40
|
-
task :compare do
|
41
|
-
files = Dir["unit/**/*.rb"]
|
42
|
-
puts "Parsing #{files.size} files"
|
43
|
-
files.each do |file|
|
44
|
-
puts file
|
45
|
-
system "./cmp.rb -q #{file} && rm #{file}"
|
46
|
-
end
|
47
|
-
system 'find -d unit -type d -empty -exec rmdir {} \;'
|
48
|
-
end
|
49
|
-
|
50
|
-
desc "Compares PT to RP and stops on first failure"
|
51
|
-
task :find_bug do
|
52
|
-
files = Dir["unit/**/*.rb"]
|
53
|
-
puts "Parsing #{files.size} files"
|
54
|
-
files.each do |file|
|
55
|
-
puts file
|
56
|
-
sh "./cmp.rb -q #{file}"
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
task :sort do
|
61
|
-
sh 'grepsort "^ +def" lib/ruby_lexer.rb'
|
62
|
-
sh 'grepsort "^ +def (test|util)" test/test_ruby_lexer.rb'
|
63
|
-
end
|
64
|
-
|
65
|
-
task :loc do
|
66
|
-
loc1 = `wc -l ../1.0.0/lib/ruby_lexer.rb`[/\d+/]
|
67
|
-
flog1 = `flog -s ../1.0.0/lib/ruby_lexer.rb`[/\d+\.\d+/]
|
68
|
-
loc2 = `cat lib/ruby_lexer.rb lib/ruby_parser_extras.rb | wc -l`[/\d+/]
|
69
|
-
flog2 = `flog -s lib/ruby_lexer.rb lib/ruby_parser_extras.rb`[/\d+\.\d+/]
|
70
|
-
|
71
|
-
loc1, loc2, flog1, flog2 = loc1.to_i, loc2.to_i, flog1.to_f, flog2.to_f
|
72
|
-
|
73
|
-
puts "1.0.0: loc = #{loc1} flog = #{flog1}"
|
74
|
-
puts "dev : loc = #{loc2} flog = #{flog2}"
|
75
|
-
puts "delta: loc = #{loc2-loc1} flog = #{flog2-flog1}"
|
76
|
-
end
|
77
|
-
|
78
|
-
desc "Validate against all normal files in unit dir"
|
79
|
-
task :validate do
|
80
|
-
sh "./cmp.rb unit/*.rb"
|
81
|
-
end
|
82
|
-
|
83
|
-
def run_and_log cmd, prefix
|
84
|
-
files = ENV['FILES'] || 'unit/*.rb'
|
85
|
-
p, x = prefix, "txt"
|
86
|
-
n = Dir["#{p}.*.#{x}"].map { |s| s[/\d+/].to_i }.max + 1 rescue 1
|
87
|
-
f = "#{p}.#{n}.#{x}"
|
88
|
-
|
89
|
-
sh "#{cmd} #{Hoe::RUBY_FLAGS} bin/ruby_parse -q -g #{files} &> #{f}"
|
90
|
-
|
91
|
-
puts File.read(f)
|
92
|
-
end
|
93
|
-
|
94
|
-
desc "Benchmark against all normal files in unit dir"
|
95
|
-
task :benchmark do
|
96
|
-
run_and_log "ruby", "benchmark"
|
97
|
-
end
|
98
|
-
|
99
|
-
desc "Profile against all normal files in unit dir"
|
100
|
-
task :profile do
|
101
|
-
run_and_log "zenprofile", "profile"
|
102
|
-
end
|
103
|
-
|
104
|
-
desc "what was that command again?"
|
105
|
-
task :huh? do
|
106
|
-
puts "ruby #{Hoe::RUBY_FLAGS} bin/ruby_parse -q -g ..."
|
107
|
-
end
|
108
|
-
|
109
|
-
task :irb => [:isolate] do
|
110
|
-
sh "GEM_HOME=#{Gem.path.first} irb -rubygems -Ilib -rruby_parser;"
|
111
|
-
end
|
112
|
-
|
113
|
-
def (task(:phony)).timestamp
|
114
|
-
Time.at 0
|
115
|
-
end
|
116
|
-
|
117
|
-
task :isolate => :phony
|
118
|
-
|
119
|
-
file "lib/ruby18_parser.rb" => :isolate
|
120
|
-
file "lib/ruby19_parser.rb" => :isolate
|
121
|
-
|
122
|
-
task :compare18 do
|
123
|
-
sh "./yack.rb lib/ruby18_parser.output > racc18.txt"
|
124
|
-
sh "./yack.rb parse18.output > yacc18.txt"
|
125
|
-
sh "diff -du racc18.txt yacc18.txt || true"
|
126
|
-
puts
|
127
|
-
sh "diff -du racc18.txt yacc18.txt | wc -l"
|
128
|
-
end
|
129
|
-
|
130
|
-
task :compare19 do
|
131
|
-
sh "./yack.rb lib/ruby19_parser.output > racc19.txt"
|
132
|
-
sh "./yack.rb parse19.output > yacc19.txt"
|
133
|
-
sh "diff -du racc19.txt yacc19.txt || true"
|
134
|
-
puts
|
135
|
-
sh "diff -du racc19.txt yacc19.txt | wc -l"
|
136
|
-
end
|
137
|
-
|
138
|
-
task :debug => :isolate do
|
139
|
-
ENV["V"] ||= "19"
|
140
|
-
Rake.application[:parser].invoke # this way we can have DEBUG set
|
141
|
-
|
142
|
-
$: << "lib"
|
143
|
-
require 'ruby_parser'
|
144
|
-
require 'pp'
|
145
|
-
|
146
|
-
parser = if ENV["V"] == "18" then
|
147
|
-
Ruby18Parser.new
|
148
|
-
else
|
149
|
-
Ruby19Parser.new
|
150
|
-
end
|
151
|
-
|
152
|
-
time = (ENV["RP_TIMEOUT"] || 10).to_i
|
153
|
-
|
154
|
-
file = ENV["F"] || ENV["FILE"]
|
155
|
-
|
156
|
-
ruby = if file then
|
157
|
-
File.read(file)
|
158
|
-
else
|
159
|
-
file = "env"
|
160
|
-
ENV["R"] || ENV["RUBY"]
|
161
|
-
end
|
162
|
-
|
163
|
-
begin
|
164
|
-
pp parser.process(ruby, file, time)
|
165
|
-
rescue Racc::ParseError => e
|
166
|
-
p e
|
167
|
-
ss = parser.lexer.src
|
168
|
-
src = ss.string
|
169
|
-
lines = src[0..ss.pos].split(/\n/)
|
170
|
-
abort "on #{file}:#{lines.size}"
|
5
|
+
task :test do
|
6
|
+
$LOAD_PATH << File.expand_path('../lib/', __FILE__)
|
7
|
+
Dir["test/test_*.rb"].each do |file|
|
8
|
+
load file
|
171
9
|
end
|
172
10
|
end
|
173
11
|
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
task :extract => :isolate do
|
180
|
-
ENV["V"] ||= "19"
|
181
|
-
Rake.application[:parser].invoke # this way we can have DEBUG set
|
12
|
+
desc "Generate the Ragel lexer and Bison parser."
|
13
|
+
task :generate => %w(lib/parser/lexer.rb)
|
14
|
+
#lib/parser/ruby18.rb
|
15
|
+
#lib/parser/ruby19.rb)
|
182
16
|
|
183
|
-
|
17
|
+
task :build => :generate
|
184
18
|
|
185
|
-
|
19
|
+
rule '.rb' => '.rl' do |t|
|
20
|
+
sh "ragel -R #{t.source} -o #{t.name}"
|
186
21
|
end
|
187
22
|
|
188
|
-
|
189
|
-
sh "
|
23
|
+
rule '.rb' => '.y' do |t|
|
24
|
+
sh "racc #{t.source} -o #{t.name} -O"
|
190
25
|
end
|
191
|
-
|
192
|
-
# vim: syntax=Ruby
|
data/lib/parser.rb
ADDED
data/lib/parser/lexer.rl
ADDED
@@ -0,0 +1,1713 @@
|
|
1
|
+
%%machine lex; # % fix highlighting
|
2
|
+
|
3
|
+
#
|
4
|
+
# === BEFORE YOU START ===
|
5
|
+
#
|
6
|
+
# Remember two things about Ragel scanners:
|
7
|
+
#
|
8
|
+
# 1) Longest match wins.
|
9
|
+
#
|
10
|
+
# 2) If two matches have the same length, the first
|
11
|
+
# in source code wins.
|
12
|
+
#
|
13
|
+
# General rules of making Ragel and Bison happy:
|
14
|
+
#
|
15
|
+
# * `p` (position) and `@te` contain the index of the character
|
16
|
+
# they're pointing to ("current"), plus one. `@ts` contains the index
|
17
|
+
# of the corresponding character. The code for extracting matched token is:
|
18
|
+
#
|
19
|
+
# @source[@ts...@te]
|
20
|
+
#
|
21
|
+
# * If your input is `foooooooobar` and the rule is:
|
22
|
+
#
|
23
|
+
# 'f' 'o'+
|
24
|
+
#
|
25
|
+
# the result will be:
|
26
|
+
#
|
27
|
+
# foooooooobar
|
28
|
+
# ^ ts=0 ^ p=te=9
|
29
|
+
#
|
30
|
+
# * A Ragel lexer action should not emit more than one token, unless
|
31
|
+
# you know what you are doing.
|
32
|
+
#
|
33
|
+
# * All Ragel commands (fnext, fgoto, ...) end with a semicolon.
|
34
|
+
#
|
35
|
+
# * If an action emits the token and transitions to another state, use
|
36
|
+
# these Ragel commands:
|
37
|
+
#
|
38
|
+
# emit($whatever)
|
39
|
+
# fnext $next_state; fbreak;
|
40
|
+
#
|
41
|
+
# * If an action does not emit a token:
|
42
|
+
#
|
43
|
+
# fgoto $next_state;
|
44
|
+
#
|
45
|
+
# * If an action features lookbehind, i.e. matches characters with the
|
46
|
+
# intent of passing them to another action:
|
47
|
+
#
|
48
|
+
# p = @ts - 1
|
49
|
+
# fgoto $next_state;
|
50
|
+
#
|
51
|
+
# or, if the lookbehind consists of a single character:
|
52
|
+
#
|
53
|
+
# fhold; fgoto $next_state;
|
54
|
+
#
|
55
|
+
# * Ragel merges actions. So, if you have `e_lparen = '(' %act` and
|
56
|
+
# `c_lparen = '('` and a lexer action `e_lparen | c_lparen`, the result
|
57
|
+
# _will_ invoke the action `act`.
|
58
|
+
#
|
59
|
+
# * EOF is explicit and is matched by `c_eof`. If you want to introspect
|
60
|
+
# the state of the lexer, add this rule to the state:
|
61
|
+
#
|
62
|
+
# c_eof => do_eof;
|
63
|
+
#
|
64
|
+
# * If you proceed past EOF, the lexer will complain:
|
65
|
+
#
|
66
|
+
# NoMethodError: undefined method `ord' for nil:NilClass
|
67
|
+
#
|
68
|
+
|
69
|
+
require 'parser/lexer_literal'
|
70
|
+
require 'parser/syntax_error'
|
71
|
+
|
72
|
+
class Parser::Lexer
|
73
|
+
|
74
|
+
%% write data nofinal;
|
75
|
+
# %
|
76
|
+
|
77
|
+
attr_reader :source
|
78
|
+
attr_accessor :static_env
|
79
|
+
|
80
|
+
attr_reader :location, :comments
|
81
|
+
|
82
|
+
def initialize(version)
|
83
|
+
@version = version
|
84
|
+
|
85
|
+
reset
|
86
|
+
end
|
87
|
+
|
88
|
+
def reset(reset_state=true)
|
89
|
+
if reset_state
|
90
|
+
# Unit tests set state prior to resetting lexer.
|
91
|
+
@cs = self.class.lex_en_line_begin
|
92
|
+
end
|
93
|
+
|
94
|
+
# Ragel-internal variables:
|
95
|
+
@p = 0 # stream position (saved manually in #advance)
|
96
|
+
@ts = nil # token start
|
97
|
+
@te = nil # token end
|
98
|
+
@act = 0 # next action
|
99
|
+
|
100
|
+
@stack = [] # state stack
|
101
|
+
@top = 0 # state stack top pointer
|
102
|
+
|
103
|
+
@token_queue = []
|
104
|
+
@literal_stack = []
|
105
|
+
|
106
|
+
@newlines = [0] # sorted set of \n positions
|
107
|
+
@newline_s = nil # location of last encountered newline
|
108
|
+
@location = nil # location of last #advance'd token
|
109
|
+
|
110
|
+
@comments = "" # collected comments
|
111
|
+
|
112
|
+
@num_base = nil # last numeric base
|
113
|
+
@num_digits_s = nil # starting position of numeric digits
|
114
|
+
|
115
|
+
@escape_s = nil # starting position of current sequence
|
116
|
+
@escape = nil # last escaped sequence, as string
|
117
|
+
|
118
|
+
# See below the section on parsing heredocs.
|
119
|
+
@heredoc_e = nil
|
120
|
+
@herebody_s = nil
|
121
|
+
|
122
|
+
# Ruby 1.9 ->() lambdas emit a distinct token if do/{ is
|
123
|
+
# encountered after a matching closing parenthesis.
|
124
|
+
@paren_nest = 0
|
125
|
+
@lambda_stack = []
|
126
|
+
end
|
127
|
+
|
128
|
+
def source=(source)
|
129
|
+
# Heredoc processing coupled with weird newline quirks
|
130
|
+
# require three '\0' (EOF) chars to be appended; after
|
131
|
+
# `p = @heredoc_s`, if `p` points at EOF, the FSM could
|
132
|
+
# not bail out early enough and will crash.
|
133
|
+
#
|
134
|
+
# Patches accepted.
|
135
|
+
#
|
136
|
+
@source = source.gsub(/\r\n/, "\n") + "\0\0\0"
|
137
|
+
end
|
138
|
+
|
139
|
+
LEX_STATES = {
|
140
|
+
:line_begin => lex_en_line_begin,
|
141
|
+
:expr_beg => lex_en_expr_beg,
|
142
|
+
:expr_value => lex_en_expr_value,
|
143
|
+
:expr_mid => lex_en_expr_mid,
|
144
|
+
:expr_dot => lex_en_expr_dot,
|
145
|
+
:expr_fname => lex_en_expr_fname,
|
146
|
+
:expr_end => lex_en_expr_end,
|
147
|
+
:expr_arg => lex_en_expr_arg,
|
148
|
+
:expr_endarg => lex_en_expr_endarg,
|
149
|
+
}
|
150
|
+
|
151
|
+
def state
|
152
|
+
LEX_STATES.invert.fetch(@cs, @cs)
|
153
|
+
end
|
154
|
+
|
155
|
+
def state=(state)
|
156
|
+
@cs = LEX_STATES.fetch(state)
|
157
|
+
end
|
158
|
+
|
159
|
+
# Return next token: [type, value].
|
160
|
+
def advance
|
161
|
+
if @token_queue.any?
|
162
|
+
return with_location(@token_queue.shift)
|
163
|
+
end
|
164
|
+
|
165
|
+
# Ugly, but dependent on Ragel output. Consider refactoring it somehow.
|
166
|
+
_lex_trans_keys = self.class.send :_lex_trans_keys
|
167
|
+
_lex_actions = self.class.send :_lex_actions
|
168
|
+
_lex_key_offsets = self.class.send :_lex_key_offsets
|
169
|
+
_lex_index_offsets = self.class.send :_lex_index_offsets
|
170
|
+
_lex_single_lengths = self.class.send :_lex_single_lengths
|
171
|
+
_lex_range_lengths = self.class.send :_lex_range_lengths
|
172
|
+
_lex_indicies = self.class.send :_lex_indicies
|
173
|
+
_lex_trans_targs = self.class.send :_lex_trans_targs
|
174
|
+
_lex_trans_actions = self.class.send :_lex_trans_actions
|
175
|
+
_lex_to_state_actions = self.class.send :_lex_to_state_actions
|
176
|
+
_lex_from_state_actions = self.class.send :_lex_from_state_actions
|
177
|
+
|
178
|
+
p, pe, eof = @p, @source.length + 1, nil
|
179
|
+
|
180
|
+
%% write exec;
|
181
|
+
# %
|
182
|
+
|
183
|
+
@p = p
|
184
|
+
|
185
|
+
if @token_queue.any?
|
186
|
+
with_location(@token_queue.shift)
|
187
|
+
elsif @cs == self.class.lex_error
|
188
|
+
with_location([ false, '$undefined', p, p + 1 ])
|
189
|
+
else
|
190
|
+
with_location([ false, '$end', p, p + 1 ])
|
191
|
+
end
|
192
|
+
end
|
193
|
+
|
194
|
+
# Like #advance, but also pretty-print the token and its position
|
195
|
+
# in the stream to `stdout`.
|
196
|
+
def advance_and_decorate
|
197
|
+
type, val = advance
|
198
|
+
|
199
|
+
puts decorate(location, "\e[0;32m#{type} #{val.inspect}\e[0m")
|
200
|
+
|
201
|
+
[type, val]
|
202
|
+
end
|
203
|
+
|
204
|
+
# Return the current collected comment block and clear the storage.
|
205
|
+
def clear_comments
|
206
|
+
comments = @comments
|
207
|
+
@comments = ""
|
208
|
+
|
209
|
+
comments
|
210
|
+
end
|
211
|
+
|
212
|
+
# Lex `str` for the Ruby version `version` with initial state `state`.
|
213
|
+
#
|
214
|
+
# The tokens displayed by this function are not the same as tokens
|
215
|
+
# consumed by parser, because the parser manipulates lexer state on
|
216
|
+
# its own.
|
217
|
+
def self.do(source, state=nil, version=19)
|
218
|
+
lex = new(version)
|
219
|
+
lex.source = source
|
220
|
+
lex.state = state if state
|
221
|
+
|
222
|
+
loop do
|
223
|
+
type, val = lex.advance_and_decorate
|
224
|
+
break if !type
|
225
|
+
end
|
226
|
+
|
227
|
+
puts "Lex state: #{lex.state}"
|
228
|
+
end
|
229
|
+
|
230
|
+
# Used by LexerLiteral to emit tokens for string content.
|
231
|
+
def emit(type, value = tok, s = @ts, e = @te)
|
232
|
+
if s.nil? || e.nil?
|
233
|
+
raise "broken #emit invocation in #{caller[0]}"
|
234
|
+
end
|
235
|
+
|
236
|
+
@token_queue << [ type, value, s, e ]
|
237
|
+
end
|
238
|
+
|
239
|
+
def emit_table(table, s = @ts, e = @te)
|
240
|
+
token = tok(s, e)
|
241
|
+
emit(table[token], token, s, e)
|
242
|
+
end
|
243
|
+
|
244
|
+
# shim
|
245
|
+
def lineno
|
246
|
+
@location[0] + 1
|
247
|
+
end
|
248
|
+
|
249
|
+
protected
|
250
|
+
|
251
|
+
def eof_char?(char)
|
252
|
+
[0x04, 0x1a, 0x00].include? char.ord
|
253
|
+
end
|
254
|
+
|
255
|
+
def ruby18?
|
256
|
+
@version == 18
|
257
|
+
end
|
258
|
+
|
259
|
+
def ruby19?
|
260
|
+
@version == 19
|
261
|
+
end
|
262
|
+
|
263
|
+
def tok(s = @ts, e = @te)
|
264
|
+
@source[s...e]
|
265
|
+
end
|
266
|
+
|
267
|
+
def record_newline(p)
|
268
|
+
@newlines = (@newlines + [p]).uniq.sort
|
269
|
+
end
|
270
|
+
|
271
|
+
def dissect_location(start, finish)
|
272
|
+
line_number = @newlines.rindex { |nl| start >= nl }
|
273
|
+
line_first_col = @newlines[line_number]
|
274
|
+
|
275
|
+
start_col = start - line_first_col
|
276
|
+
finish_col = finish - line_first_col
|
277
|
+
|
278
|
+
[ line_number, start_col, finish_col ]
|
279
|
+
end
|
280
|
+
|
281
|
+
def with_location(item)
|
282
|
+
type, value, start, finish = *item
|
283
|
+
|
284
|
+
@location = dissect_location(start, finish)
|
285
|
+
|
286
|
+
[ type, value ]
|
287
|
+
end
|
288
|
+
|
289
|
+
def decorate(location, message="")
|
290
|
+
line_number, from, to = location
|
291
|
+
|
292
|
+
line = @source.lines.drop(line_number).first
|
293
|
+
line[from...to] = "\e[4m#{line[from...to]}\e[0m"
|
294
|
+
|
295
|
+
tail_len = to - from - 1
|
296
|
+
tail = "~" * (tail_len >= 0 ? tail_len : 0)
|
297
|
+
decoration = "#{" " * from}\e[1;31m^#{tail}\e[0m #{message}"
|
298
|
+
|
299
|
+
[ line, decoration ]
|
300
|
+
end
|
301
|
+
|
302
|
+
def warning(message, start = @ts, finish = @te)
|
303
|
+
$stderr.puts "warning: #{message}"
|
304
|
+
$stderr.puts decorate(dissect_location(start, finish))
|
305
|
+
end
|
306
|
+
|
307
|
+
def error(message)
|
308
|
+
raise Parser::SyntaxError, message
|
309
|
+
end
|
310
|
+
|
311
|
+
#
|
312
|
+
# === LITERAL STACK ===
|
313
|
+
#
|
314
|
+
|
315
|
+
def push_literal(*args)
|
316
|
+
new_literal = Parser::LexerLiteral.new(self, *args)
|
317
|
+
@literal_stack.push(new_literal)
|
318
|
+
|
319
|
+
if new_literal.type == :tWORDS_BEG
|
320
|
+
self.class.lex_en_interp_words
|
321
|
+
elsif new_literal.type == :tQWORDS_BEG
|
322
|
+
self.class.lex_en_plain_words
|
323
|
+
elsif new_literal.interpolate?
|
324
|
+
self.class.lex_en_interp_string
|
325
|
+
else
|
326
|
+
self.class.lex_en_plain_string
|
327
|
+
end
|
328
|
+
end
|
329
|
+
|
330
|
+
def literal
|
331
|
+
@literal_stack[-1]
|
332
|
+
end
|
333
|
+
|
334
|
+
def pop_literal
|
335
|
+
old_literal = @literal_stack.pop
|
336
|
+
|
337
|
+
if old_literal.type == :tREGEXP_BEG
|
338
|
+
# Fetch modifiers.
|
339
|
+
self.class.lex_en_regexp_modifiers
|
340
|
+
else
|
341
|
+
self.class.lex_en_expr_end
|
342
|
+
end
|
343
|
+
end
|
344
|
+
|
345
|
+
# Mapping of strings to parser tokens.
|
346
|
+
|
347
|
+
PUNCTUATION = {
|
348
|
+
'=' => :tEQL, '&' => :tAMPER2, '|' => :tPIPE,
|
349
|
+
'!' => :tBANG, '^' => :tCARET, '+' => :tPLUS,
|
350
|
+
'-' => :tMINUS, '*' => :tSTAR2, '/' => :tDIVIDE,
|
351
|
+
'%' => :tPERCENT, '~' => :tTILDE, ',' => :tCOMMA,
|
352
|
+
';' => :tSEMI, '.' => :tDOT, '..' => :tDOT2,
|
353
|
+
'...' => :tDOT3, '[' => :tLBRACK2, ']' => :tRBRACK,
|
354
|
+
'(' => :tLPAREN2, ')' => :tRPAREN, '?' => :tEH,
|
355
|
+
':' => :tCOLON, '&&' => :tANDOP, '||' => :tOROP,
|
356
|
+
'-@' => :tUMINUS, '+@' => :tUPLUS, '~@' => :tTILDE,
|
357
|
+
'**' => :tPOW, '->' => :tLAMBDA, '=~' => :tMATCH,
|
358
|
+
'!~' => :tNMATCH, '==' => :tEQ, '!=' => :tNEQ,
|
359
|
+
'>' => :tGT, '>>' => :tRSHFT, '>=' => :tGEQ,
|
360
|
+
'<' => :tLT, '<<' => :tLSHFT, '<=' => :tLEQ,
|
361
|
+
'=>' => :tASSOC, '::' => :tCOLON2, '===' => :tEQQ,
|
362
|
+
'<=>' => :tCMP, '[]' => :tAREF, '[]=' => :tASET,
|
363
|
+
'{' => :tLCURLY, '}' => :tRCURLY, '`' => :tBACK_REF2,
|
364
|
+
'do' => :kDO
|
365
|
+
}
|
366
|
+
|
367
|
+
PUNCTUATION_BEGIN = {
|
368
|
+
'&' => :tAMPER, '*' => :tSTAR, '+' => :tUPLUS,
|
369
|
+
'-' => :tUMINUS, '::' => :tCOLON3, '(' => :tLPAREN,
|
370
|
+
'{' => :tLBRACE, '[' => :tLBRACK,
|
371
|
+
}
|
372
|
+
|
373
|
+
KEYWORDS = {
|
374
|
+
'if' => :kIF_MOD, 'unless' => :kUNLESS_MOD,
|
375
|
+
'while' => :kWHILE_MOD, 'until' => :kUNTIL_MOD,
|
376
|
+
'rescue' => :kRESCUE_MOD, 'defined?' => :kDEFINED,
|
377
|
+
'BEGIN' => :klBEGIN, 'END' => :klEND,
|
378
|
+
}
|
379
|
+
|
380
|
+
%w(class module def undef begin end then elsif else ensure case when
|
381
|
+
for break next redo retry in do return yield super self nil true
|
382
|
+
false and or not alias __FILE__ __LINE__ __ENCODING__).each do |keyword|
|
383
|
+
KEYWORDS[keyword] = :"k#{keyword.upcase}"
|
384
|
+
end
|
385
|
+
|
386
|
+
KEYWORDS_BEGIN = {
|
387
|
+
'if' => :kIF, 'unless' => :kUNLESS,
|
388
|
+
'while' => :kWHILE, 'until' => :kUNTIL,
|
389
|
+
'rescue' => :kRESCUE
|
390
|
+
}
|
391
|
+
|
392
|
+
%%{
|
393
|
+
# %
|
394
|
+
|
395
|
+
access @;
|
396
|
+
getkey @source[p].ord;
|
397
|
+
|
398
|
+
# === CHARACTER CLASSES ===
|
399
|
+
#
|
400
|
+
# Pay close attention to the differences between c_any and any.
|
401
|
+
# c_any does not include EOF and so will cause incorrect behavior
|
402
|
+
# for machine subtraction (any-except rules) and default transitions
|
403
|
+
# for scanners.
|
404
|
+
|
405
|
+
action do_nl {
|
406
|
+
# Record position of a newline for precise line and column reporting.
|
407
|
+
#
|
408
|
+
# This action is embedded directly into c_nl, as it is idempotent and
|
409
|
+
# there are no cases when we need to skip it.
|
410
|
+
record_newline(p + 1)
|
411
|
+
@newline_s = p
|
412
|
+
}
|
413
|
+
|
414
|
+
c_nl = '\n' $ do_nl;
|
415
|
+
c_space = [ \t\r\f\v];
|
416
|
+
c_space_nl = c_space | c_nl;
|
417
|
+
c_eof = 0x04 | 0x1a | 0; # ^D, ^Z, EOF
|
418
|
+
c_eol = c_nl | c_eof;
|
419
|
+
c_any = any - c_eof - zlen;
|
420
|
+
c_line = c_any - c_nl;
|
421
|
+
|
422
|
+
c_unicode = c_any - 0x00..0x7f;
|
423
|
+
c_lower = [a-z_] | c_unicode;
|
424
|
+
c_upper = [A-Z] | c_unicode;
|
425
|
+
c_alpha = c_lower | c_upper;
|
426
|
+
c_alnum = c_alpha | [0-9];
|
427
|
+
|
428
|
+
action do_eof {
|
429
|
+
# Sit at EOF indefinitely. #advance would return $eof each time.
|
430
|
+
# This allows to feed the lexer more data if needed; this is only used
|
431
|
+
# in tests.
|
432
|
+
#
|
433
|
+
# Note that this action is not embedded into e_eof like e_nl and e_bs
|
434
|
+
# below. This is due to the fact that scanner state at EOF is observed
|
435
|
+
# by tests, and encapsulating it in a rule would break the introspection.
|
436
|
+
fhold; fbreak;
|
437
|
+
}
|
438
|
+
|
439
|
+
#
|
440
|
+
# === TOKEN DEFINITIONS ===
|
441
|
+
#
|
442
|
+
|
443
|
+
# All operators are punctuation. There is more to punctuation
|
444
|
+
# than just operators. Operators can be overridden by user;
|
445
|
+
# punctuation can not.
|
446
|
+
|
447
|
+
# A list of operators which are valid in the function name context, but
|
448
|
+
# have different semantics in others.
|
449
|
+
operator_fname = '[]' | '[]=' | '`' | '-@' | '+@' | '~@' ;
|
450
|
+
|
451
|
+
# A list of operators which can occur within an assignment shortcut (+ → +=).
|
452
|
+
operator_arithmetic = '&' | '|' | '&&' | '||' | '^' | '+' | '-' |
|
453
|
+
'*' | '/' | '**' | '~' | '**' | '<<' | '>>' |
|
454
|
+
'%' ;
|
455
|
+
|
456
|
+
# A list of all user-definable operators not covered by groups above.
|
457
|
+
operator_rest = '=~' | '!~' | '==' | '!=' | '!' | '===' |
|
458
|
+
'<' | '<=' | '>' | '>=' | '<=>' | '=>' ;
|
459
|
+
|
460
|
+
# Note that `{` and `}` need to be referred to as e_lbrace and e_rbrace,
|
461
|
+
# as they are ambiguous with interpolation `#{}` and should be counted.
|
462
|
+
# These braces are not present in punctuation lists.
|
463
|
+
|
464
|
+
# A list of punctuation which has different meaning when used at the
|
465
|
+
# beginning of expression.
|
466
|
+
punctuation_begin = '-' | '+' | '::' | '(' | '[' | '*' | '&' ;
|
467
|
+
|
468
|
+
# A list of all punctuation except punctuation_begin.
|
469
|
+
punctuation_end = ',' | '=' | '->' | '(' | '[' | ']' |
|
470
|
+
'::' | '?' | ':' | '.' | '..' | '...' ;
|
471
|
+
|
472
|
+
# A list of keywords which have different meaning at the beginning of expression.
|
473
|
+
keyword_modifier = 'if' | 'unless' | 'while' | 'until' | 'rescue' ;
|
474
|
+
|
475
|
+
# A list of keywords which accept an argument-like expression, i.e. have the
|
476
|
+
# same post-processing as method calls or commands. Example: `yield 1`,
|
477
|
+
# `yield (1)`, `yield(1)`, are interpreted as if `yield` was a function.
|
478
|
+
keyword_with_arg = 'yield' | 'super' | 'not' | 'defined?' ;
|
479
|
+
|
480
|
+
# A list of keywords which accept a literal function name as an argument.
|
481
|
+
keyword_with_fname = 'def' | 'undef' | 'alias' ;
|
482
|
+
|
483
|
+
# A list of keywords which accept an expression after them.
|
484
|
+
keyword_with_value = 'else' | 'case' | 'ensure' | 'module' | 'elsif' | 'then' |
|
485
|
+
'for' | 'in' | 'do' | 'when' | 'begin' | 'class' |
|
486
|
+
'and' | 'or' ;
|
487
|
+
|
488
|
+
# A list of keywords which accept a value, and treat the keywords from
|
489
|
+
# `keyword_modifier` list as modifiers.
|
490
|
+
keyword_with_mid = 'rescue' | 'return' | 'break' | 'next' ;
|
491
|
+
|
492
|
+
# A list of keywords which do not accept an expression after them.
|
493
|
+
keyword_with_end = 'end' | 'self' | 'true' | 'false' | 'retry' |
|
494
|
+
'redo' | 'nil' | 'BEGIN' | 'END' | '__FILE__' |
|
495
|
+
'__LINE__' | '__ENCODING__';
|
496
|
+
|
497
|
+
# All keywords.
|
498
|
+
keyword = keyword_with_value | keyword_with_mid |
|
499
|
+
keyword_with_end | keyword_with_arg |
|
500
|
+
keyword_with_fname | keyword_modifier ;
|
501
|
+
|
502
|
+
constant = [A-Z] c_alnum*;
|
503
|
+
bareword = c_alpha c_alnum*;
|
504
|
+
|
505
|
+
call_or_var = c_lower c_alnum*;
|
506
|
+
class_var = '@@' bareword;
|
507
|
+
instance_var = '@' bareword;
|
508
|
+
global_var = '$'
|
509
|
+
( bareword | digit+
|
510
|
+
| [`'+~*$&?!@/\\;,.=:<>"] # `
|
511
|
+
| '-' [A-Za-z0-9_]?
|
512
|
+
)
|
513
|
+
;
|
514
|
+
|
515
|
+
# Ruby accepts (and fails on) variables with leading digit
|
516
|
+
# in literal context, but not in unquoted symbol body.
|
517
|
+
class_var_v = '@@' [0-9]? bareword;
|
518
|
+
instance_var_v = '@' [0-9]? bareword;
|
519
|
+
|
520
|
+
#
|
521
|
+
# === ESCAPE SEQUENCE PARSING ===
|
522
|
+
#
|
523
|
+
|
524
|
+
# Escape parsing code is a Ragel pattern, not a scanner, and therefore
|
525
|
+
# it shouldn't directly raise errors or perform other actions with side effects.
|
526
|
+
# In reality this would probably just mess up error reporting in pathological
|
527
|
+
# cases, through.
|
528
|
+
|
529
|
+
# The amount of code required to parse \M\C stuff correctly is ridiculous.
|
530
|
+
|
531
|
+
escaped_nl = "\\" c_nl;
|
532
|
+
|
533
|
+
action unicode_points {
|
534
|
+
@escape = ""
|
535
|
+
|
536
|
+
codepoints = tok(@escape_s + 2, p - 1)
|
537
|
+
codepoints.split(/[ \t]/).each do |codepoint_str|
|
538
|
+
codepoint = codepoint_str.to_i(16)
|
539
|
+
|
540
|
+
if codepoint >= 0x110000
|
541
|
+
@escape = lambda { error "invalid Unicode codepoint (too large)" }
|
542
|
+
break
|
543
|
+
end
|
544
|
+
|
545
|
+
@escape += codepoint.chr(Encoding::UTF_8)
|
546
|
+
end
|
547
|
+
}
|
548
|
+
|
549
|
+
action unescape_char {
|
550
|
+
@escape = {
|
551
|
+
'a' => "\a", 'b' => "\b", 'e' => "\e", 'f' => "\f",
|
552
|
+
'n' => "\n", 'r' => "\r", 's' => "\s", 't' => "\t",
|
553
|
+
'v' => "\v", '\\' => "\\"
|
554
|
+
}.fetch(@source[p - 1], @source[p - 1])
|
555
|
+
}
|
556
|
+
|
557
|
+
action invalid_complex_escape {
|
558
|
+
@escape = lambda { error "invalid escape character syntax" }
|
559
|
+
}
|
560
|
+
|
561
|
+
action slash_c_char {
|
562
|
+
@escape = (@escape.ord & 0x9f).chr
|
563
|
+
}
|
564
|
+
|
565
|
+
action slash_m_char {
|
566
|
+
@escape = (@escape.ord | 0x80).chr
|
567
|
+
}
|
568
|
+
|
569
|
+
maybe_escaped_char = (
|
570
|
+
'\\' c_any %unescape_char
|
571
|
+
| ( c_any - [\\] ) % { @escape = @source[p - 1] }
|
572
|
+
);
|
573
|
+
|
574
|
+
maybe_escaped_ctrl_char = ( # why?!
|
575
|
+
'\\' c_any %unescape_char %slash_c_char
|
576
|
+
| '?' % { @escape = "\x7f" }
|
577
|
+
| ( c_any - [\\?] ) % { @escape = @source[p - 1] } %slash_c_char
|
578
|
+
);
|
579
|
+
|
580
|
+
escape = (
|
581
|
+
# \377
|
582
|
+
[0-7]{1,3}
|
583
|
+
% { @escape = tok(@escape_s, p).to_i(8).chr }
|
584
|
+
|
585
|
+
# \xff
|
586
|
+
| ( 'x' xdigit{1,2}
|
587
|
+
% { @escape = tok(@escape_s + 1, p).to_i(16).chr }
|
588
|
+
# \u263a
|
589
|
+
| 'u' xdigit{4}
|
590
|
+
% { @escape = tok(@escape_s + 1, p).to_i(16).chr(Encoding::UTF_8) }
|
591
|
+
)
|
592
|
+
|
593
|
+
# %q[\x]
|
594
|
+
| 'x' ( c_any - xdigit )
|
595
|
+
% { @escape = lambda { error "invalid hex escape" } }
|
596
|
+
|
597
|
+
# %q[\u123] %q[\u{12]
|
598
|
+
| 'u' ( c_any{0,4} -
|
599
|
+
xdigit{4} - # \u1234 is valid
|
600
|
+
( '{' xdigit{1,3} # \u{1 \u{12 \u{123 are valid
|
601
|
+
| '{' xdigit [ \t}] # \u{1. \u{1} are valid
|
602
|
+
| '{' xdigit{2} [ \t}] # \u{12. \u{12} are valid
|
603
|
+
)
|
604
|
+
)
|
605
|
+
% { @escape = lambda { error "invalid Unicode escape" } }
|
606
|
+
|
607
|
+
# \u{123 456}
|
608
|
+
| 'u{' ( xdigit{1,6} [ \t] )*
|
609
|
+
( xdigit{1,6} '}'
|
610
|
+
%unicode_points
|
611
|
+
| ( xdigit* ( c_any - xdigit - '}' )+ '}'
|
612
|
+
| ( c_any - '}' )* c_eof
|
613
|
+
| xdigit{7,}
|
614
|
+
) % { @escape = lambda { error "unterminated Unicode escape" } }
|
615
|
+
)
|
616
|
+
|
617
|
+
# \C-\a \cx
|
618
|
+
| ( 'C-' | 'c' ) escaped_nl?
|
619
|
+
maybe_escaped_ctrl_char
|
620
|
+
|
621
|
+
# \M-a
|
622
|
+
| 'M-' escaped_nl?
|
623
|
+
maybe_escaped_char
|
624
|
+
%slash_m_char
|
625
|
+
|
626
|
+
# \C-\M-f \M-\cf \c\M-f
|
627
|
+
| ( ( 'C-' | 'c' ) escaped_nl? '\\M-'
|
628
|
+
| 'M-\\' escaped_nl? ( 'C-' | 'c' ) ) escaped_nl?
|
629
|
+
maybe_escaped_ctrl_char
|
630
|
+
%slash_m_char
|
631
|
+
|
632
|
+
| 'C' c_any %invalid_complex_escape
|
633
|
+
| 'M' c_any %invalid_complex_escape
|
634
|
+
| ( 'M-\\C' | 'C-\\M' | 'cM' ) c_any %invalid_complex_escape
|
635
|
+
|
636
|
+
| ( c_any - [0-7xuCMc] ) %unescape_char
|
637
|
+
|
638
|
+
| c_eof % { error "escape sequence meets end of file" }
|
639
|
+
);
|
640
|
+
|
641
|
+
# Use rules in form of `e_bs escape' when you need to parse a sequence.
|
642
|
+
e_bs = '\\' % {
|
643
|
+
@escape_s = p
|
644
|
+
@escape = nil
|
645
|
+
};
|
646
|
+
|
647
|
+
#
|
648
|
+
# === STRING AND HEREDOC PARSING ===
|
649
|
+
#
|
650
|
+
|
651
|
+
# Heredoc parsing is quite a complex topic. First, consider that heredocs
|
652
|
+
# can be arbitrarily nested. For example:
|
653
|
+
#
|
654
|
+
# puts <<CODE
|
655
|
+
# the result is: #{<<RESULT.inspect
|
656
|
+
# i am a heredoc
|
657
|
+
# RESULT
|
658
|
+
# }
|
659
|
+
# CODE
|
660
|
+
#
|
661
|
+
# which, incidentally, evaluates to:
|
662
|
+
#
|
663
|
+
# the result is: " i am a heredoc\n"
|
664
|
+
#
|
665
|
+
# To parse them, lexer refers to two kinds (remember, nested heredocs)
|
666
|
+
# of positions in the input stream, namely @heredoc_e
|
667
|
+
# (HEREDOC declaration End) and @herebody_s (HEREdoc BODY line Start).
|
668
|
+
#
|
669
|
+
# @heredoc_e is simply contained inside the corresponding LexerLiteral, and
|
670
|
+
# when the heredoc is closed, the lexing is restarted from that position.
|
671
|
+
#
|
672
|
+
# @herebody_s is quite more complex. First, @herebody_s changes after each
|
673
|
+
# heredoc line is lexed. This way, at '\n' tok(@herebody_s, @te) always
|
674
|
+
# contains the current line, and also when a heredoc is started, @herebody_s
|
675
|
+
# contains the position from which the heredoc will be lexed.
|
676
|
+
#
|
677
|
+
# Second, as (insanity) there are nested heredocs, we need to maintain a
|
678
|
+
# stack of these positions. Each time #push_literal is called, it saves current
|
679
|
+
# @herebody_s to literal.saved_herebody_s, and after an interpolation (possibly
|
680
|
+
# containing other heredocs) is closed, the previous value is restored.
|
681
|
+
|
682
|
+
e_heredoc_nl = c_nl $ {
|
683
|
+
# After every heredoc was parsed, @herebody_s contains the
|
684
|
+
# position of next token after all heredocs.
|
685
|
+
if @herebody_s
|
686
|
+
p = @herebody_s
|
687
|
+
@herebody_s = nil
|
688
|
+
end
|
689
|
+
};
|
690
|
+
|
691
|
+
action extend_string {
|
692
|
+
if literal.nest_and_try_closing tok, @ts, @te
|
693
|
+
fgoto *pop_literal;
|
694
|
+
else
|
695
|
+
literal.extend_string tok, @ts, @te
|
696
|
+
end
|
697
|
+
}
|
698
|
+
|
699
|
+
action extend_string_escaped {
|
700
|
+
if literal.nest_and_try_closing('\\', @ts, @ts + 1)
|
701
|
+
# If the literal is actually closed by the backslash,
|
702
|
+
# rewind the input prior to consuming the escape sequence.
|
703
|
+
p = @escape_s - 1
|
704
|
+
fgoto *pop_literal;
|
705
|
+
else
|
706
|
+
# Get the first character after the backslash.
|
707
|
+
escaped_char = @source[@escape_s]
|
708
|
+
|
709
|
+
if literal.munge_escape? escaped_char
|
710
|
+
# If this particular literal uses this character as an opening
|
711
|
+
# or closing delimiter, it is an escape sequence for that
|
712
|
+
# particular character. Write it without the backslash.
|
713
|
+
|
714
|
+
if literal.regexp?
|
715
|
+
# Regular expressions should have every escape sequence in its
|
716
|
+
# raw form.
|
717
|
+
literal.extend_string(tok, @ts, @te)
|
718
|
+
else
|
719
|
+
literal.extend_string(escaped_char, @ts, @te)
|
720
|
+
end
|
721
|
+
else
|
722
|
+
# It does not. So this is an actual escape sequence, yay!
|
723
|
+
# Two things to consider here.
|
724
|
+
#
|
725
|
+
# 1. The `escape' rule should be pure and so won't raise any
|
726
|
+
# errors by itself. Instead, it stores them in lambdas.
|
727
|
+
#
|
728
|
+
# 2. Non-interpolated literals do not go through the aforementioned
|
729
|
+
# rule. As \\ and \' (and variants) are munged, the full token
|
730
|
+
# should always be written for such literals.
|
731
|
+
|
732
|
+
@escape.call if @escape.respond_to? :call
|
733
|
+
|
734
|
+
if literal.regexp?
|
735
|
+
# Ditto. Also, expand escaped newlines.
|
736
|
+
literal.extend_string(tok.gsub("\\\n", ''), @ts, @te)
|
737
|
+
else
|
738
|
+
literal.extend_string(@escape || tok, @ts, @te)
|
739
|
+
end
|
740
|
+
end
|
741
|
+
end
|
742
|
+
}
|
743
|
+
|
744
|
+
# Extend a string with a newline or a EOF character.
|
745
|
+
# As heredoc closing line can immediately precede EOF, this action
|
746
|
+
# has to handle such case specially.
|
747
|
+
action extend_string_eol {
|
748
|
+
is_eof = eof_char? @source[p]
|
749
|
+
|
750
|
+
if literal.heredoc?
|
751
|
+
# Try ending the heredoc with the complete most recently
|
752
|
+
# scanned line. @herebody_s always refers to the start of such line.
|
753
|
+
if literal.nest_and_try_closing(tok(@herebody_s, @te - 1),
|
754
|
+
@herebody_s, @te - 1)
|
755
|
+
# Adjust @herebody_s to point to the next line.
|
756
|
+
@herebody_s = @te
|
757
|
+
|
758
|
+
# Continue regular lexing after the heredoc reference (<<END).
|
759
|
+
p = literal.heredoc_e - 1
|
760
|
+
fgoto *pop_literal;
|
761
|
+
else
|
762
|
+
# Ditto.
|
763
|
+
@herebody_s = @te
|
764
|
+
end
|
765
|
+
end
|
766
|
+
|
767
|
+
if is_eof
|
768
|
+
error "unterminated string meets end of file"
|
769
|
+
end
|
770
|
+
|
771
|
+
# A literal newline is appended if the heredoc was _not_ closed
|
772
|
+
# this time. See also LexerLiteral#nest_and_try_closing for rationale of
|
773
|
+
# calling #flush_string here.
|
774
|
+
literal.extend_string tok, @ts, @te
|
775
|
+
literal.flush_string
|
776
|
+
}
|
777
|
+
|
778
|
+
#
|
779
|
+
# === INTERPOLATION PARSING ===
|
780
|
+
#
|
781
|
+
|
782
|
+
# Interpolations with immediate variable names simply call into
|
783
|
+
# the corresponding machine.
|
784
|
+
|
785
|
+
interp_var =
|
786
|
+
'#' ( global_var | class_var_v | instance_var_v );
|
787
|
+
|
788
|
+
action extend_interp_var {
|
789
|
+
literal.flush_string
|
790
|
+
emit(:tSTRING_DVAR, nil, @ts, @ts + 1)
|
791
|
+
|
792
|
+
p = @ts
|
793
|
+
fcall expr_variable;
|
794
|
+
}
|
795
|
+
|
796
|
+
# Interpolations with code blocks must match nested curly braces, as
|
797
|
+
# interpolation ending is ambiguous with a block ending. So, every
|
798
|
+
# opening and closing brace should be matched with e_[lr]brace rules,
|
799
|
+
# which automatically perform the counting.
|
800
|
+
#
|
801
|
+
# Note that interpolations can themselves be nested, so brace balance
|
802
|
+
# is tied to the innermost literal.
|
803
|
+
#
|
804
|
+
# Also note that literals themselves should not use e_[lr]brace rules
|
805
|
+
# when matching their opening and closing delimiters, as the amount of
|
806
|
+
# braces inside the characters of a string literal is independent.
|
807
|
+
|
808
|
+
interp_code = '#{';
|
809
|
+
|
810
|
+
e_lbrace = '{' % {
|
811
|
+
if literal
|
812
|
+
literal.start_interp_brace
|
813
|
+
end
|
814
|
+
};
|
815
|
+
|
816
|
+
e_rbrace = '}' % {
|
817
|
+
if literal
|
818
|
+
if literal.end_interp_brace_and_try_closing
|
819
|
+
emit(:tRCURLY, '}')
|
820
|
+
|
821
|
+
if literal.words?
|
822
|
+
emit(:tSPACE, nil)
|
823
|
+
end
|
824
|
+
|
825
|
+
if literal.saved_herebody_s
|
826
|
+
@herebody_s = literal.saved_herebody_s
|
827
|
+
end
|
828
|
+
|
829
|
+
fhold;
|
830
|
+
fnext *@stack.pop;
|
831
|
+
fbreak;
|
832
|
+
end
|
833
|
+
end
|
834
|
+
};
|
835
|
+
|
836
|
+
action extend_interp_code {
|
837
|
+
literal.flush_string
|
838
|
+
emit(:tSTRING_DBEG, '#{')
|
839
|
+
|
840
|
+
literal.saved_herebody_s = @herebody_s
|
841
|
+
@herebody_s = nil
|
842
|
+
|
843
|
+
literal.start_interp_brace
|
844
|
+
fcall expr_beg;
|
845
|
+
}
|
846
|
+
|
847
|
+
# Actual string parsers are simply combined from the primitives defined
|
848
|
+
# above.
|
849
|
+
|
850
|
+
interp_words := |*
|
851
|
+
interp_code => extend_interp_code;
|
852
|
+
interp_var => extend_interp_var;
|
853
|
+
e_bs escape => extend_string_escaped;
|
854
|
+
c_space_nl => { literal.flush_string };
|
855
|
+
c_eol => extend_string_eol;
|
856
|
+
c_any => extend_string;
|
857
|
+
*|;
|
858
|
+
|
859
|
+
interp_string := |*
|
860
|
+
interp_code => extend_interp_code;
|
861
|
+
interp_var => extend_interp_var;
|
862
|
+
e_bs escape => extend_string_escaped;
|
863
|
+
c_eol => extend_string_eol;
|
864
|
+
c_any => extend_string;
|
865
|
+
*|;
|
866
|
+
|
867
|
+
plain_words := |*
|
868
|
+
e_bs c_any => extend_string_escaped;
|
869
|
+
c_space_nl => { literal.flush_string };
|
870
|
+
c_eol => extend_string_eol;
|
871
|
+
c_any => extend_string;
|
872
|
+
*|;
|
873
|
+
|
874
|
+
plain_string := |*
|
875
|
+
e_bs c_any => extend_string_escaped;
|
876
|
+
c_eol => extend_string_eol;
|
877
|
+
c_any => extend_string;
|
878
|
+
*|;
|
879
|
+
|
880
|
+
regexp_modifiers := |*
|
881
|
+
[A-Za-z]+
|
882
|
+
=> {
|
883
|
+
unknown_options = tok.scan(/[^imxouesn]/)
|
884
|
+
if unknown_options.any?
|
885
|
+
error "unknown regexp options: #{unknown_options.join}"
|
886
|
+
end
|
887
|
+
|
888
|
+
emit(:tREGEXP_OPT)
|
889
|
+
fgoto expr_end;
|
890
|
+
};
|
891
|
+
|
892
|
+
any
|
893
|
+
=> {
|
894
|
+
emit(:tREGEXP_OPT, tok(@ts, @te - 1), @ts, @te - 1)
|
895
|
+
fhold; fgoto expr_end;
|
896
|
+
};
|
897
|
+
*|;
|
898
|
+
|
899
|
+
#
|
900
|
+
# === EXPRESSION PARSING ===
|
901
|
+
#
|
902
|
+
|
903
|
+
# These rules implement a form of manually defined lookahead.
|
904
|
+
# The default longest-match scanning does not work here due
|
905
|
+
# to sheer ambiguity.
|
906
|
+
|
907
|
+
ambiguous_ident_suffix = # actual parsed
|
908
|
+
[?!=] %{ tm = p } | # a? a?
|
909
|
+
'==' %{ tm = p - 2 } | # a==b a == b
|
910
|
+
'=~' %{ tm = p - 2 } | # a=~b a =~ b
|
911
|
+
'=>' %{ tm = p - 2 } | # a=>b a => b
|
912
|
+
'===' %{ tm = p - 3 } # a===b a === b
|
913
|
+
;
|
914
|
+
|
915
|
+
ambiguous_symbol_suffix = # actual parsed
|
916
|
+
ambiguous_ident_suffix |
|
917
|
+
'==>' %{ tm = p - 2 } # :a==>b :a= => b
|
918
|
+
;
|
919
|
+
|
920
|
+
# Ambiguous with 1.9 hash labels.
|
921
|
+
ambiguous_const_suffix = # actual parsed
|
922
|
+
'::' %{ tm = p - 2 } # A::B A :: B
|
923
|
+
;
|
924
|
+
|
925
|
+
# Ruby 1.9 lambdas require parentheses counting in order to
|
926
|
+
# emit correct opening kDO/tLBRACE.
|
927
|
+
|
928
|
+
e_lparen = '(' % {
|
929
|
+
@paren_nest += 1
|
930
|
+
};
|
931
|
+
|
932
|
+
e_rparen = ')' % {
|
933
|
+
@paren_nest -= 1
|
934
|
+
};
|
935
|
+
|
936
|
+
# Variable lexing code is accessed from both expressions and
|
937
|
+
# string interpolation related code.
|
938
|
+
#
|
939
|
+
expr_variable := |*
|
940
|
+
global_var
|
941
|
+
=> {
|
942
|
+
if tok =~ /^\$([1-9][0-9]*)$/
|
943
|
+
emit(:tNTH_REF, $1.to_i)
|
944
|
+
elsif tok =~ /^\$([&`'+])$/
|
945
|
+
emit(:tBACK_REF, $1.to_sym)
|
946
|
+
else
|
947
|
+
emit(:tGVAR)
|
948
|
+
end
|
949
|
+
|
950
|
+
fnext *@stack.pop; fbreak;
|
951
|
+
};
|
952
|
+
|
953
|
+
class_var_v
|
954
|
+
=> {
|
955
|
+
error "`#{tok}' is not allowed as a class variable name" if tok =~ /^@@[0-9]/
|
956
|
+
|
957
|
+
emit(:tCVAR)
|
958
|
+
fnext *@stack.pop; fbreak;
|
959
|
+
};
|
960
|
+
|
961
|
+
instance_var_v
|
962
|
+
=> {
|
963
|
+
error "`#{tok}' is not allowed as an instance variable name" if tok =~ /^@[0-9]/
|
964
|
+
|
965
|
+
emit(:tIVAR)
|
966
|
+
fnext *@stack.pop; fbreak;
|
967
|
+
};
|
968
|
+
*|;
|
969
|
+
|
970
|
+
# Literal function name in definition (e.g. `def class`).
|
971
|
+
# Keywords are returned as their respective tokens; this is used
|
972
|
+
# to support singleton def `def self.foo`. Global variables are
|
973
|
+
# returned as `tGVAR`; this is used in global variable alias
|
974
|
+
# statements `alias $a $b`. Symbols are returned verbatim; this
|
975
|
+
# is used in `alias :a :"b#{foo}"` and `undef :a`.
|
976
|
+
#
|
977
|
+
# Transitions to `expr_end` afterwards.
|
978
|
+
#
|
979
|
+
expr_fname := |*
|
980
|
+
keyword
|
981
|
+
=> { emit(KEYWORDS[tok]);
|
982
|
+
fnext expr_end; fbreak; };
|
983
|
+
|
984
|
+
bareword
|
985
|
+
=> { emit(:tIDENTIFIER)
|
986
|
+
fnext expr_end; fbreak; };
|
987
|
+
|
988
|
+
bareword ambiguous_ident_suffix
|
989
|
+
=> { emit(:tIDENTIFIER, tok(@ts, tm), @ts, tm)
|
990
|
+
fnext expr_end; p = tm - 1; fbreak; };
|
991
|
+
|
992
|
+
operator_fname |
|
993
|
+
operator_arithmetic |
|
994
|
+
operator_rest
|
995
|
+
=> { emit_table(PUNCTUATION)
|
996
|
+
fnext expr_end; fbreak; };
|
997
|
+
|
998
|
+
':'
|
999
|
+
=> { fhold; fgoto expr_end; };
|
1000
|
+
|
1001
|
+
global_var
|
1002
|
+
=> { emit(:tGVAR)
|
1003
|
+
fbreak; };
|
1004
|
+
|
1005
|
+
c_space_nl+;
|
1006
|
+
|
1007
|
+
c_any
|
1008
|
+
=> { fhold; fgoto expr_end; };
|
1009
|
+
|
1010
|
+
c_eof => do_eof;
|
1011
|
+
*|;
|
1012
|
+
|
1013
|
+
# Literal function name in method call (e.g. `a.class`).
|
1014
|
+
#
|
1015
|
+
# Transitions to `expr_arg` afterwards.
|
1016
|
+
#
|
1017
|
+
expr_dot := |*
|
1018
|
+
bareword
|
1019
|
+
=> { emit(:tIDENTIFIER)
|
1020
|
+
fnext expr_arg; fbreak; };
|
1021
|
+
|
1022
|
+
bareword ambiguous_ident_suffix
|
1023
|
+
=> { emit(:tIDENTIFIER, tok(@ts, tm), @ts, tm)
|
1024
|
+
fnext expr_arg; p = tm - 1; fbreak; };
|
1025
|
+
|
1026
|
+
operator_fname |
|
1027
|
+
operator_arithmetic |
|
1028
|
+
operator_rest
|
1029
|
+
=> { emit_table(PUNCTUATION)
|
1030
|
+
fnext expr_arg; fbreak; };
|
1031
|
+
|
1032
|
+
c_space_nl+;
|
1033
|
+
|
1034
|
+
c_any
|
1035
|
+
=> { fhold; fgoto expr_end; };
|
1036
|
+
|
1037
|
+
c_eof => do_eof;
|
1038
|
+
*|;
|
1039
|
+
|
1040
|
+
# The previous token emitted was a `tIDENTIFIER` or `tFID`; no space
|
1041
|
+
# is consumed; the current expression is a command or method call.
|
1042
|
+
#
|
1043
|
+
expr_arg := |*
|
1044
|
+
#
|
1045
|
+
# COMMAND MODE SPECIFIC TOKENS
|
1046
|
+
#
|
1047
|
+
|
1048
|
+
# cmd (1 + 2)
|
1049
|
+
# See below the rationale about expr_endarg.
|
1050
|
+
c_space+ e_lparen
|
1051
|
+
=> { emit(:tLPAREN_ARG, '(', @te - 1, @te)
|
1052
|
+
fnext expr_beg; fbreak; };
|
1053
|
+
|
1054
|
+
# meth(1 + 2)
|
1055
|
+
# Regular method call.
|
1056
|
+
e_lparen
|
1057
|
+
=> { emit(:tLPAREN2)
|
1058
|
+
fnext expr_beg; fbreak; };
|
1059
|
+
|
1060
|
+
# meth [...]
|
1061
|
+
# Array argument. Compare with indexing `meth[...]`.
|
1062
|
+
c_space+ '['
|
1063
|
+
=> { emit(:tLBRACK, '[', @te - 1, @te);
|
1064
|
+
fnext expr_beg; fbreak; };
|
1065
|
+
|
1066
|
+
# cmd {}
|
1067
|
+
# Command: method call without parentheses.
|
1068
|
+
c_space* e_lbrace
|
1069
|
+
=> {
|
1070
|
+
if @lambda_stack.last == @paren_nest
|
1071
|
+
p = @ts - 1
|
1072
|
+
fgoto expr_end;
|
1073
|
+
else
|
1074
|
+
emit(:tLCURLY, '{', @te - 1, @te)
|
1075
|
+
fnext expr_value; fbreak;
|
1076
|
+
end
|
1077
|
+
};
|
1078
|
+
|
1079
|
+
# a.b
|
1080
|
+
# Dot-call.
|
1081
|
+
'.' | '::'
|
1082
|
+
=> { emit_table(PUNCTUATION);
|
1083
|
+
fnext expr_dot; fbreak; };
|
1084
|
+
|
1085
|
+
#
|
1086
|
+
# AMBIGUOUS TOKENS RESOLVED VIA EXPR_BEG
|
1087
|
+
#
|
1088
|
+
|
1089
|
+
# a ?b
|
1090
|
+
# Character literal.
|
1091
|
+
c_space+ '?'
|
1092
|
+
=> { fhold; fgoto expr_beg; };
|
1093
|
+
|
1094
|
+
# x +1
|
1095
|
+
# Ambiguous unary operator or regexp literal.
|
1096
|
+
c_space+ [+\-/]
|
1097
|
+
=> {
|
1098
|
+
warning "ambiguous first argument; put parentheses or even spaces", @te - 1, @te
|
1099
|
+
fhold; fhold; fgoto expr_beg;
|
1100
|
+
};
|
1101
|
+
|
1102
|
+
# x *1
|
1103
|
+
# Ambiguous splat or block-pass.
|
1104
|
+
c_space+ [*&]
|
1105
|
+
=> {
|
1106
|
+
what = tok(@te - 1, @te)
|
1107
|
+
warning "`#{what}' interpreted as argument prefix", @te - 1, @te
|
1108
|
+
fhold; fgoto expr_beg;
|
1109
|
+
};
|
1110
|
+
|
1111
|
+
#
|
1112
|
+
# AMBIGUOUS TOKENS RESOLVED VIA EXPR_END
|
1113
|
+
#
|
1114
|
+
|
1115
|
+
# a ? b
|
1116
|
+
# Ternary operator.
|
1117
|
+
c_space+ '?' c_space_nl
|
1118
|
+
=> { fhold; fhold; fgoto expr_end; };
|
1119
|
+
|
1120
|
+
# x + 1: Binary operator or operator-assignment.
|
1121
|
+
c_space* operator_arithmetic
|
1122
|
+
( '=' | c_space_nl )? |
|
1123
|
+
# x rescue y: Modifier keyword.
|
1124
|
+
c_space+ keyword_modifier |
|
1125
|
+
# Miscellanea.
|
1126
|
+
c_space* punctuation_end
|
1127
|
+
=> {
|
1128
|
+
p = @ts - 1
|
1129
|
+
fgoto expr_end;
|
1130
|
+
};
|
1131
|
+
|
1132
|
+
c_space* c_nl
|
1133
|
+
=> { fhold; fgoto expr_end; };
|
1134
|
+
|
1135
|
+
c_any
|
1136
|
+
=> { fhold; fgoto expr_beg; };
|
1137
|
+
|
1138
|
+
c_eof => do_eof;
|
1139
|
+
*|;
|
1140
|
+
|
1141
|
+
# The rationale for this state is pretty complex. Normally, if an argument
|
1142
|
+
# is passed to a command and then there is a block (tLCURLY...tRCURLY),
|
1143
|
+
# the block is attached to the innermost argument (`f` in `m f {}`), or it
|
1144
|
+
# is a parse error (`m 1 {}`). But there is a special case for passing a single
|
1145
|
+
# primary expression grouped with parentheses: if you write `m (1) {}` or
|
1146
|
+
# (2.0 only) `m () {}`, then the block is attached to `m`.
|
1147
|
+
#
|
1148
|
+
# Thus, we recognize the opening `(` of a command (remember, a command is
|
1149
|
+
# a method call without parens) as a tLPAREN_ARG; then, in parser, we recognize
|
1150
|
+
# `tLPAREN_ARG expr rparen` as a `primary_expr` and before rparen, set the
|
1151
|
+
# lexer's state to `expr_endarg`, which makes it emit the possibly following
|
1152
|
+
# `{` as `tLBRACE_ARG`.
|
1153
|
+
#
|
1154
|
+
# The default post-`expr_endarg` state is `expr_end`, so this state also handles
|
1155
|
+
# `do` (as `kDO_BLOCK` in `expr_beg`). (I have no clue why the parser cannot
|
1156
|
+
# just handle `kDO`.)
|
1157
|
+
expr_endarg := |*
|
1158
|
+
e_lbrace
|
1159
|
+
=> { emit(:tLBRACE_ARG)
|
1160
|
+
fnext expr_value; };
|
1161
|
+
|
1162
|
+
'do'
|
1163
|
+
=> { emit(:kDO_BLOCK)
|
1164
|
+
fnext expr_value; };
|
1165
|
+
|
1166
|
+
c_space*;
|
1167
|
+
|
1168
|
+
c_any
|
1169
|
+
=> { fhold; fgoto expr_end; };
|
1170
|
+
|
1171
|
+
c_eof => do_eof;
|
1172
|
+
*|;
|
1173
|
+
|
1174
|
+
# The rationale for this state is that several keywords accept value
|
1175
|
+
# (i.e. should transition to `expr_beg`), do not accept it like a command
|
1176
|
+
# (i.e. not an `expr_arg`), and must behave like a statement, that is,
|
1177
|
+
# accept a modifier if/while/etc.
|
1178
|
+
#
|
1179
|
+
expr_mid := |*
|
1180
|
+
keyword_modifier
|
1181
|
+
=> { emit_table(KEYWORDS)
|
1182
|
+
fnext expr_beg; fbreak; };
|
1183
|
+
|
1184
|
+
c_space+;
|
1185
|
+
|
1186
|
+
c_nl
|
1187
|
+
=> { fhold; fgoto expr_end; };
|
1188
|
+
|
1189
|
+
c_any
|
1190
|
+
=> { fhold; fgoto expr_beg; };
|
1191
|
+
|
1192
|
+
c_eof => do_eof;
|
1193
|
+
*|;
|
1194
|
+
|
1195
|
+
# Beginning of an expression.
|
1196
|
+
#
|
1197
|
+
# Don't fallthrough to this state from `c_any`; make sure to handle
|
1198
|
+
# `c_space* c_nl` and let `expr_end` handle the newline.
|
1199
|
+
# Otherwise code like `f\ndef x` gets glued together and the parser
|
1200
|
+
# explodes.
|
1201
|
+
#
|
1202
|
+
expr_beg := |*
|
1203
|
+
# Numeric processing. Converts:
|
1204
|
+
# +5 to [tINTEGER, 5]
|
1205
|
+
# -5 to [tUMINUS_NUM] [tINTEGER, 5]
|
1206
|
+
[+\-][0-9]
|
1207
|
+
=> {
|
1208
|
+
fhold;
|
1209
|
+
if tok.start_with? '-'
|
1210
|
+
emit(:tUMINUS_NUM, '-')
|
1211
|
+
fnext expr_end; fbreak;
|
1212
|
+
end
|
1213
|
+
};
|
1214
|
+
|
1215
|
+
# splat *a
|
1216
|
+
'*'
|
1217
|
+
=> { emit(:tSTAR)
|
1218
|
+
fbreak; };
|
1219
|
+
|
1220
|
+
#
|
1221
|
+
# STRING AND REGEXP LITERALS
|
1222
|
+
#
|
1223
|
+
|
1224
|
+
# a / 42
|
1225
|
+
# a % 42
|
1226
|
+
# a %= 42 (disambiguation with %=string=)
|
1227
|
+
[/%] c_space_nl | '%=' # /
|
1228
|
+
=> {
|
1229
|
+
fhold; fhold;
|
1230
|
+
fgoto expr_end;
|
1231
|
+
};
|
1232
|
+
|
1233
|
+
# /regexp/oui
|
1234
|
+
'/'
|
1235
|
+
=> {
|
1236
|
+
type, delimiter = tok, tok
|
1237
|
+
fgoto *push_literal(type, delimiter, @ts);
|
1238
|
+
};
|
1239
|
+
|
1240
|
+
# %<string>
|
1241
|
+
'%' ( c_any - [A-Za-z] )
|
1242
|
+
=> {
|
1243
|
+
type, delimiter = tok[0], tok[-1]
|
1244
|
+
fgoto *push_literal(type, delimiter, @ts);
|
1245
|
+
};
|
1246
|
+
|
1247
|
+
# %w(we are the people)
|
1248
|
+
'%' [A-Za-z]+ c_any
|
1249
|
+
=> {
|
1250
|
+
type, delimiter = tok[0..-2], tok[-1]
|
1251
|
+
fgoto *push_literal(type, delimiter, @ts);
|
1252
|
+
};
|
1253
|
+
|
1254
|
+
'%' c_eof
|
1255
|
+
=> {
|
1256
|
+
error "unterminated string meets end of file"
|
1257
|
+
};
|
1258
|
+
|
1259
|
+
# Heredoc start.
|
1260
|
+
# <<EOF | <<-END | <<"FOOBAR" | <<-`SMTH`
|
1261
|
+
'<<' '-'?
|
1262
|
+
( '"' ( c_any - c_nl - '"' )* '"'
|
1263
|
+
| "'" ( c_any - c_nl - "'" )* "'"
|
1264
|
+
| "`" ( c_any - c_nl - "`" )* "`"
|
1265
|
+
| bareword ) % { @heredoc_e = p }
|
1266
|
+
( c_any - c_nl )* c_nl % { new_herebody_s = p }
|
1267
|
+
=> {
|
1268
|
+
tok(@ts, @heredoc_e) =~ /^<<(-?)(["'`]?)(.*)\2$/
|
1269
|
+
|
1270
|
+
indent = !$1.empty?
|
1271
|
+
type = $2.empty? ? '"' : $2
|
1272
|
+
delimiter = $3
|
1273
|
+
|
1274
|
+
fnext *push_literal(type, delimiter, @ts, @heredoc_e, indent);
|
1275
|
+
|
1276
|
+
if @herebody_s.nil?
|
1277
|
+
@herebody_s = new_herebody_s
|
1278
|
+
end
|
1279
|
+
|
1280
|
+
p = @herebody_s - 1
|
1281
|
+
};
|
1282
|
+
|
1283
|
+
#
|
1284
|
+
# AMBIGUOUS TERNARY OPERATOR
|
1285
|
+
#
|
1286
|
+
|
1287
|
+
'?' ( e_bs escape
|
1288
|
+
| c_any - c_space_nl - e_bs % { @escape = nil }
|
1289
|
+
)
|
1290
|
+
=> {
|
1291
|
+
# Show an error if memorized.
|
1292
|
+
@escape.call if @escape.respond_to? :call
|
1293
|
+
|
1294
|
+
value = @escape || tok(@ts + 1)
|
1295
|
+
|
1296
|
+
if ruby18?
|
1297
|
+
emit(:tINTEGER, value.ord)
|
1298
|
+
else
|
1299
|
+
emit(:tSTRING, value)
|
1300
|
+
end
|
1301
|
+
|
1302
|
+
fbreak;
|
1303
|
+
};
|
1304
|
+
|
1305
|
+
'?' c_space_nl
|
1306
|
+
=> {
|
1307
|
+
escape = { " " => '\s', "\r" => '\r', "\n" => '\n', "\t" => '\t',
|
1308
|
+
"\v" => '\v', "\f" => '\f' }[tok[@ts + 1]]
|
1309
|
+
warning "invalid character syntax; use ?#{escape}", @ts
|
1310
|
+
|
1311
|
+
p = @ts - 1
|
1312
|
+
fgoto expr_end;
|
1313
|
+
};
|
1314
|
+
|
1315
|
+
'?' c_eof
|
1316
|
+
=> {
|
1317
|
+
error "incomplete character syntax"
|
1318
|
+
};
|
1319
|
+
|
1320
|
+
# f ?aa : b: Disambiguate with a character literal.
|
1321
|
+
'?' [A-Za-z_] bareword
|
1322
|
+
=> {
|
1323
|
+
p = @ts - 1
|
1324
|
+
fgoto expr_end;
|
1325
|
+
};
|
1326
|
+
|
1327
|
+
#
|
1328
|
+
# KEYWORDS AND PUNCTUATION
|
1329
|
+
#
|
1330
|
+
|
1331
|
+
# a(+b)
|
1332
|
+
punctuation_begin |
|
1333
|
+
# a({b=>c})
|
1334
|
+
e_lbrace |
|
1335
|
+
# a()
|
1336
|
+
e_lparen
|
1337
|
+
=> { emit_table(PUNCTUATION_BEGIN)
|
1338
|
+
fbreak; };
|
1339
|
+
|
1340
|
+
# rescue Exception => e: Block rescue.
|
1341
|
+
# Special because it should transition to expr_mid.
|
1342
|
+
'rescue'
|
1343
|
+
=> { emit_table(KEYWORDS_BEGIN)
|
1344
|
+
fnext expr_mid; fbreak; };
|
1345
|
+
|
1346
|
+
# if a: Statement if.
|
1347
|
+
keyword_modifier
|
1348
|
+
=> { emit_table(KEYWORDS_BEGIN)
|
1349
|
+
fnext expr_value; fbreak; };
|
1350
|
+
|
1351
|
+
#
|
1352
|
+
# RUBY 1.9 HASH LABELS
|
1353
|
+
#
|
1354
|
+
|
1355
|
+
bareword ':' ( c_any - ':' )
|
1356
|
+
=> {
|
1357
|
+
fhold;
|
1358
|
+
|
1359
|
+
if ruby18?
|
1360
|
+
emit(:tIDENTIFIER, tok(@ts, @te - 2), @ts, @te - 2)
|
1361
|
+
fhold; # continue as a symbol
|
1362
|
+
else
|
1363
|
+
emit(:tLABEL, tok(@ts, @te - 2), @ts, @te - 1)
|
1364
|
+
end
|
1365
|
+
|
1366
|
+
fbreak;
|
1367
|
+
};
|
1368
|
+
|
1369
|
+
#
|
1370
|
+
# CONTEXT-DEPENDENT VARIABLE LOOKUP OR COMMAND INVOCATION
|
1371
|
+
#
|
1372
|
+
|
1373
|
+
# foo= bar: Disambiguate with bareword rule below.
|
1374
|
+
bareword ambiguous_ident_suffix |
|
1375
|
+
# def foo: Disambiguate with bareword rule below.
|
1376
|
+
keyword
|
1377
|
+
=> { p = @ts - 1
|
1378
|
+
fgoto expr_end; };
|
1379
|
+
|
1380
|
+
# a = 42; a [42]: Indexing.
|
1381
|
+
# def a; end; a [42]: Array argument.
|
1382
|
+
call_or_var
|
1383
|
+
=> {
|
1384
|
+
emit(:tIDENTIFIER)
|
1385
|
+
|
1386
|
+
if @static_env && @static_env.declared?(tok.to_sym)
|
1387
|
+
fgoto expr_end;
|
1388
|
+
else
|
1389
|
+
fgoto expr_arg;
|
1390
|
+
end
|
1391
|
+
};
|
1392
|
+
|
1393
|
+
c_space_nl+;
|
1394
|
+
|
1395
|
+
# The following rules match most binary and all unary operators.
|
1396
|
+
# Rules for binary operators provide better error reporting.
|
1397
|
+
operator_arithmetic '=' |
|
1398
|
+
operator_rest |
|
1399
|
+
punctuation_end |
|
1400
|
+
c_any
|
1401
|
+
=> { p = @ts - 1; fgoto expr_end; };
|
1402
|
+
|
1403
|
+
c_eof => do_eof;
|
1404
|
+
*|;
|
1405
|
+
|
1406
|
+
# Like expr_beg, but no 1.9 label possible.
|
1407
|
+
#
|
1408
|
+
expr_value := |*
|
1409
|
+
# a:b: a(:b), a::B, A::B
|
1410
|
+
bareword ':'
|
1411
|
+
=> { p = @ts - 1
|
1412
|
+
fgoto expr_end; };
|
1413
|
+
|
1414
|
+
c_space_nl+;
|
1415
|
+
|
1416
|
+
c_any
|
1417
|
+
=> { fhold; fgoto expr_beg; };
|
1418
|
+
|
1419
|
+
c_eof => do_eof;
|
1420
|
+
*|;
|
1421
|
+
|
1422
|
+
expr_end := |*
|
1423
|
+
#
|
1424
|
+
# STABBY LAMBDA
|
1425
|
+
#
|
1426
|
+
|
1427
|
+
'->'
|
1428
|
+
=> {
|
1429
|
+
emit_table(PUNCTUATION)
|
1430
|
+
|
1431
|
+
@lambda_stack.push @paren_nest
|
1432
|
+
fbreak;
|
1433
|
+
};
|
1434
|
+
|
1435
|
+
e_lbrace | 'do'
|
1436
|
+
=> {
|
1437
|
+
if @lambda_stack.last == @paren_nest
|
1438
|
+
@lambda_stack.pop
|
1439
|
+
|
1440
|
+
if tok == '{'
|
1441
|
+
emit(:tLAMBEG)
|
1442
|
+
else
|
1443
|
+
emit(:kDO_LAMBDA)
|
1444
|
+
end
|
1445
|
+
else
|
1446
|
+
emit_table(PUNCTUATION)
|
1447
|
+
end
|
1448
|
+
|
1449
|
+
fnext expr_value; fbreak;
|
1450
|
+
};
|
1451
|
+
|
1452
|
+
#
|
1453
|
+
# KEYWORDS
|
1454
|
+
#
|
1455
|
+
|
1456
|
+
keyword_with_fname
|
1457
|
+
=> { emit_table(KEYWORDS)
|
1458
|
+
fnext expr_fname; fbreak; };
|
1459
|
+
|
1460
|
+
'class' c_space_nl '<<'
|
1461
|
+
=> { emit(:kCLASS, 'class', @ts, @ts + 5)
|
1462
|
+
emit(:tLSHFT, '<<', @te - 2, @te)
|
1463
|
+
fnext expr_beg; fbreak; };
|
1464
|
+
|
1465
|
+
# a if b:c: Syntax error.
|
1466
|
+
keyword_modifier
|
1467
|
+
=> { emit_table(KEYWORDS)
|
1468
|
+
fnext expr_beg; fbreak; };
|
1469
|
+
|
1470
|
+
# elsif b:c: elsif b(:c)
|
1471
|
+
keyword_with_value
|
1472
|
+
=> { emit_table(KEYWORDS)
|
1473
|
+
fnext expr_value; fbreak; };
|
1474
|
+
|
1475
|
+
keyword_with_mid
|
1476
|
+
=> { emit_table(KEYWORDS)
|
1477
|
+
fnext expr_mid; fbreak; };
|
1478
|
+
|
1479
|
+
keyword_with_arg
|
1480
|
+
=> {
|
1481
|
+
emit_table(KEYWORDS)
|
1482
|
+
|
1483
|
+
if ruby18? && tok == 'not'
|
1484
|
+
fnext expr_beg; fbreak;
|
1485
|
+
else
|
1486
|
+
fnext expr_arg; fbreak;
|
1487
|
+
end
|
1488
|
+
};
|
1489
|
+
|
1490
|
+
keyword_with_end
|
1491
|
+
=> { emit_table(KEYWORDS)
|
1492
|
+
fbreak; };
|
1493
|
+
|
1494
|
+
#
|
1495
|
+
# NUMERIC LITERALS
|
1496
|
+
#
|
1497
|
+
|
1498
|
+
( '0' [Xx] %{ @num_base = 16; @num_digits_s = p }
|
1499
|
+
( xdigit+ '_' )* xdigit* '_'?
|
1500
|
+
| '0' [Dd] %{ @num_base = 10; @num_digits_s = p }
|
1501
|
+
( digit+ '_' )* digit* '_'?
|
1502
|
+
| '0' [Oo] %{ @num_base = 8; @num_digits_s = p }
|
1503
|
+
( digit+ '_' )* digit* '_'?
|
1504
|
+
| '0' [Bb] %{ @num_base = 2; @num_digits_s = p }
|
1505
|
+
( [01]+ '_' )* [01]* '_'?
|
1506
|
+
| [1-9] %{ @num_base = 10; @num_digits_s = @ts }
|
1507
|
+
( '_' digit+ )* digit* '_'?
|
1508
|
+
| '0' %{ @num_base = 8; @num_digits_s = @ts }
|
1509
|
+
( '_' digit+ )* digit* '_'?
|
1510
|
+
)
|
1511
|
+
=> {
|
1512
|
+
digits = tok(@num_digits_s)
|
1513
|
+
|
1514
|
+
if digits.end_with? '_'
|
1515
|
+
error "trailing `_' in number"
|
1516
|
+
elsif digits.empty? && @num_base == 8 && ruby18?
|
1517
|
+
# 1.8 did not raise an error on 0o.
|
1518
|
+
digits = "0"
|
1519
|
+
elsif digits.empty?
|
1520
|
+
error "numeric literal without digits"
|
1521
|
+
elsif @num_base == 8 && digits =~ /[89]/
|
1522
|
+
error "invalid octal digit"
|
1523
|
+
end
|
1524
|
+
|
1525
|
+
emit(:tINTEGER, digits.to_i(@num_base))
|
1526
|
+
fbreak;
|
1527
|
+
};
|
1528
|
+
|
1529
|
+
# Floating point literals cannot start with 0 except when a dot
|
1530
|
+
# follows immediately, probably to avoid confusion with octal literals.
|
1531
|
+
( [1-9] [0-9]* ( '_' digit+ )* |
|
1532
|
+
'0'
|
1533
|
+
)?
|
1534
|
+
(
|
1535
|
+
'.' ( digit+ '_' )* digit+ |
|
1536
|
+
( '.' ( digit+ '_' )* digit+ )? [eE] [+\-]? ( digit+ '_' )* digit+
|
1537
|
+
)
|
1538
|
+
=> {
|
1539
|
+
if tok.start_with? '.'
|
1540
|
+
error "no .<digit> floating literal anymore; put 0 before dot"
|
1541
|
+
elsif tok =~ /^[eE]/
|
1542
|
+
# The rule above allows to specify floats as just `e10', which is
|
1543
|
+
# certainly not a float. Send a patch if you can do this better.
|
1544
|
+
emit(:tIDENTIFIER, tok)
|
1545
|
+
fbreak;
|
1546
|
+
end
|
1547
|
+
|
1548
|
+
emit(:tFLOAT, tok.to_f)
|
1549
|
+
fbreak;
|
1550
|
+
};
|
1551
|
+
|
1552
|
+
#
|
1553
|
+
# SYMBOL LITERALS
|
1554
|
+
#
|
1555
|
+
|
1556
|
+
# `echo foo` | :"bar" | :'baz'
|
1557
|
+
'`' | ':'? ['"] # '
|
1558
|
+
=> {
|
1559
|
+
type, delimiter = tok, tok[-1]
|
1560
|
+
fgoto *push_literal(type, delimiter, @ts);
|
1561
|
+
};
|
1562
|
+
|
1563
|
+
':' bareword ambiguous_symbol_suffix
|
1564
|
+
=> { emit(:tSYMBOL, tok(@ts + 1, tm))
|
1565
|
+
p = tm - 1; fbreak; };
|
1566
|
+
|
1567
|
+
':' ( bareword | global_var | class_var | instance_var |
|
1568
|
+
operator_fname | operator_arithmetic | operator_rest )
|
1569
|
+
=> { emit(:tSYMBOL, tok(@ts + 1))
|
1570
|
+
fbreak; };
|
1571
|
+
|
1572
|
+
#
|
1573
|
+
# CONSTANTS AND VARIABLES
|
1574
|
+
#
|
1575
|
+
|
1576
|
+
constant
|
1577
|
+
=> { emit(:tCONSTANT)
|
1578
|
+
fbreak; };
|
1579
|
+
|
1580
|
+
constant ambiguous_const_suffix
|
1581
|
+
=> { emit(:tCONSTANT, tok(@ts, tm))
|
1582
|
+
p = tm - 1; fbreak; };
|
1583
|
+
|
1584
|
+
global_var | class_var_v | instance_var_v
|
1585
|
+
=> { p = @ts - 1; fcall expr_variable; };
|
1586
|
+
|
1587
|
+
#
|
1588
|
+
# METHOD CALLS
|
1589
|
+
#
|
1590
|
+
|
1591
|
+
'.'
|
1592
|
+
=> { emit_table(PUNCTUATION)
|
1593
|
+
fnext expr_dot; fbreak; };
|
1594
|
+
|
1595
|
+
call_or_var
|
1596
|
+
=> { emit(:tIDENTIFIER)
|
1597
|
+
fnext expr_arg; fbreak; };
|
1598
|
+
|
1599
|
+
call_or_var [?!]
|
1600
|
+
=> { emit(:tFID)
|
1601
|
+
fnext expr_arg; fbreak; };
|
1602
|
+
|
1603
|
+
#
|
1604
|
+
# OPERATORS
|
1605
|
+
#
|
1606
|
+
|
1607
|
+
( e_lparen |
|
1608
|
+
operator_arithmetic |
|
1609
|
+
operator_rest
|
1610
|
+
) %{ tm = p } c_space_nl*
|
1611
|
+
=> { emit_table(PUNCTUATION, @ts, tm)
|
1612
|
+
fnext expr_beg; fbreak; };
|
1613
|
+
|
1614
|
+
e_rbrace | e_rparen | ']'
|
1615
|
+
=> { emit_table(PUNCTUATION)
|
1616
|
+
fbreak; };
|
1617
|
+
|
1618
|
+
operator_arithmetic '='
|
1619
|
+
=> { emit(:tOP_ASGN, tok(@ts, @te - 1))
|
1620
|
+
fnext expr_beg; fbreak; };
|
1621
|
+
|
1622
|
+
'?'
|
1623
|
+
=> { emit_table(PUNCTUATION)
|
1624
|
+
fnext expr_value; fbreak; };
|
1625
|
+
|
1626
|
+
punctuation_end
|
1627
|
+
=> { emit_table(PUNCTUATION)
|
1628
|
+
fnext expr_beg; fbreak; };
|
1629
|
+
|
1630
|
+
#
|
1631
|
+
# WHITESPACE
|
1632
|
+
#
|
1633
|
+
|
1634
|
+
'\\' e_heredoc_nl;
|
1635
|
+
'\\' ( any - c_nl ) {
|
1636
|
+
error "bare backslash only allowed before newline"
|
1637
|
+
};
|
1638
|
+
|
1639
|
+
'#' ( c_any - c_nl )*
|
1640
|
+
=> { @comments << tok(@ts, @te + 1) };
|
1641
|
+
|
1642
|
+
e_heredoc_nl
|
1643
|
+
=> { fgoto leading_dot; };
|
1644
|
+
|
1645
|
+
';'
|
1646
|
+
=> { emit_table(PUNCTUATION)
|
1647
|
+
fnext expr_value; fbreak; };
|
1648
|
+
|
1649
|
+
c_space+;
|
1650
|
+
|
1651
|
+
c_any
|
1652
|
+
=> {
|
1653
|
+
error "unexpected #{tok.inspect}"
|
1654
|
+
};
|
1655
|
+
|
1656
|
+
c_eof => do_eof;
|
1657
|
+
*|;
|
1658
|
+
|
1659
|
+
leading_dot := |*
|
1660
|
+
# Insane leading dots:
|
1661
|
+
# a #comment
|
1662
|
+
# .b: a.b
|
1663
|
+
c_space* '.' ( c_any - '.' )
|
1664
|
+
=> { fhold; fhold;
|
1665
|
+
fgoto expr_end; };
|
1666
|
+
|
1667
|
+
any
|
1668
|
+
=> { emit(:tNL, nil, @newline_s, @newline_s + 1)
|
1669
|
+
fnext line_begin; fhold; fbreak; };
|
1670
|
+
*|;
|
1671
|
+
|
1672
|
+
#
|
1673
|
+
# === EMBEDDED DOCUMENT (aka BLOCK COMMENT) PARSING ===
|
1674
|
+
#
|
1675
|
+
|
1676
|
+
line_comment := |*
|
1677
|
+
'=end' c_line* c_nl
|
1678
|
+
=> { @comments << tok
|
1679
|
+
fgoto line_begin; };
|
1680
|
+
|
1681
|
+
c_line* c_nl
|
1682
|
+
=> { @comments << tok };
|
1683
|
+
|
1684
|
+
any
|
1685
|
+
=> {
|
1686
|
+
@comments = ""
|
1687
|
+
error "embedded document meats end of file (and they embark on a romantic journey)"
|
1688
|
+
};
|
1689
|
+
*|;
|
1690
|
+
|
1691
|
+
line_begin := |*
|
1692
|
+
c_space_nl+;
|
1693
|
+
|
1694
|
+
'#' c_line* c_eol
|
1695
|
+
=> { @comments << tok
|
1696
|
+
fhold; };
|
1697
|
+
|
1698
|
+
'=begin' ( c_space | c_eol )
|
1699
|
+
=> { @comments << tok(@ts, @te)
|
1700
|
+
fgoto line_comment; };
|
1701
|
+
|
1702
|
+
'__END__' c_eol
|
1703
|
+
=> { p = pe - 1 };
|
1704
|
+
|
1705
|
+
c_any
|
1706
|
+
=> { fhold; fgoto expr_value; };
|
1707
|
+
|
1708
|
+
c_eof => do_eof;
|
1709
|
+
*|;
|
1710
|
+
|
1711
|
+
}%%
|
1712
|
+
# %
|
1713
|
+
end
|