parser 0.9.alpha → 0.9.alpha1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +21 -0
- data/.travis.yml +9 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +25 -0
- data/README.md +29 -0
- data/Rakefile +15 -182
- data/lib/parser.rb +4 -0
- data/lib/parser/lexer.rl +1713 -0
- data/lib/parser/lexer_literal.rb +175 -0
- data/lib/parser/static_environment.rb +38 -0
- data/lib/parser/syntax_error.rb +3 -0
- data/parser.gemspec +25 -0
- data/test/{test_ruby_lexer.rb → test_lexer.rb} +77 -129
- data/test/test_static_environment.rb +46 -0
- metadata +39 -98
- data/.autotest +0 -50
- data/.gemtest +0 -0
- data/History.txt +0 -558
- data/Manifest.txt +0 -18
- data/README.txt +0 -87
- data/bin/ruby_parse +0 -96
- data/bin/ruby_parse_extract_error +0 -130
- data/lib/gauntlet_rubyparser.rb +0 -117
- data/lib/ruby18_parser.rb +0 -5706
- data/lib/ruby18_parser.y +0 -1846
- data/lib/ruby19_parser.rb +0 -6054
- data/lib/ruby19_parser.y +0 -2035
- data/lib/ruby_lexer.rb +0 -6789
- data/lib/ruby_parser.rb +0 -4
- data/lib/ruby_parser_extras.rb +0 -1148
- data/test/test_ruby_parser.rb +0 -1772
- data/test/test_ruby_parser_extras.rb +0 -228
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 384df81635da81957880f54cb589109db642c914
|
4
|
+
data.tar.gz: 101c991d44683e9ba699a3ec3deca74572fb7a09
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0724f1d86bbe49d1aa390c5bea5e3e6c859850b59be9ef0b78623f23cbe97cae94f75de5bc961cbdad1aabe6a1bb166e63dcac4103bb211a41c19ae86d8d3624
|
7
|
+
data.tar.gz: 1c64825e1d3a58b00d1a7038b599e161025b516435cde204e325061b1800416813f8ef2ab154f5b5dc99a1dc9cb1d11a9dfa33adb1e2e35f756c3fa10a9f5960
|
data/.gitignore
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.bundle
|
4
|
+
.config
|
5
|
+
.yardoc
|
6
|
+
Gemfile.lock
|
7
|
+
InstalledFiles
|
8
|
+
_yardoc
|
9
|
+
coverage
|
10
|
+
doc/
|
11
|
+
lib/bundler/man
|
12
|
+
pkg
|
13
|
+
rdoc
|
14
|
+
spec/reports
|
15
|
+
test/tmp
|
16
|
+
test/version_tmp
|
17
|
+
tmp
|
18
|
+
*.output
|
19
|
+
lib/parser/lexer.rb
|
20
|
+
lib/parser/ruby18.rb
|
21
|
+
lib/parser/ruby19.rb
|
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
Copyright (c) 2013 Peter Zotov <whitequark@whitequark.org>
|
2
|
+
|
3
|
+
Parts of the source are derived from ruby_parser:
|
4
|
+
Copyright (c) Ryan Davis, seattle.rb
|
5
|
+
|
6
|
+
MIT License
|
7
|
+
|
8
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
9
|
+
a copy of this software and associated documentation files (the
|
10
|
+
"Software"), to deal in the Software without restriction, including
|
11
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
12
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
13
|
+
permit persons to whom the Software is furnished to do so, subject to
|
14
|
+
the following conditions:
|
15
|
+
|
16
|
+
The above copyright notice and this permission notice shall be
|
17
|
+
included in all copies or substantial portions of the Software.
|
18
|
+
|
19
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
20
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
21
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
22
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
23
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
24
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
25
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Parser
|
2
|
+
|
3
|
+
[Build Status](https://travis-ci.org/whitequark/parser)
|
4
|
+
[Code Climate](https://codeclimate.com/github/whitequark/parser)
|
5
|
+
|
6
|
+
Parser is a Ruby parser written in pure Ruby.
|
7
|
+
|
8
|
+
## Installation
|
9
|
+
|
10
|
+
$ gem install parser
|
11
|
+
|
12
|
+
## Usage
|
13
|
+
|
14
|
+
TODO: Write usage instructions here
|
15
|
+
|
16
|
+
## Acknowledgements
|
17
|
+
|
18
|
+
The lexer testsuite is derived from [ruby_parser](http://github.com/seattlerb/ruby_parser).
|
19
|
+
|
20
|
+
The Bison parser rules are derived from [Ruby MRI](http://github.com/ruby/ruby) parse.y.
|
21
|
+
|
22
|
+
## Contributing
|
23
|
+
|
24
|
+
1. Make sure you have [Ragel 6.8](http://www.complang.org/ragel/) installed
|
25
|
+
2. Fork it
|
26
|
+
3. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
4. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
+
5. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
6. Create new Pull Request
|
data/Rakefile
CHANGED
@@ -1,192 +1,25 @@
|
|
1
|
-
|
1
|
+
require "bundler/gem_tasks"
|
2
2
|
|
3
|
-
|
4
|
-
require 'hoe'
|
3
|
+
task :default => [:generate, :test]
|
5
4
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
Hoe.add_include_dirs "../../sexp_processor/dev/lib"
|
11
|
-
|
12
|
-
Hoe.spec 'parser' do
|
13
|
-
developer 'Peter Zotov', 'whitequark@whitequark.org'
|
14
|
-
|
15
|
-
dependency 'sexp_processor', '~> 4.1'
|
16
|
-
|
17
|
-
self.racc_flags << " -t" if plugin?(:racc) && ENV["DEBUG"]
|
18
|
-
end
|
19
|
-
|
20
|
-
file "lib/ruby18_parser.rb" => "lib/ruby18_parser.y"
|
21
|
-
file "lib/ruby19_parser.rb" => "lib/ruby19_parser.y"
|
22
|
-
|
23
|
-
file "lib/ruby_lexer.rb" => "lib/ruby_lexer.rl" do |t|
|
24
|
-
sh "ragel -R #{t.prerequisites.first} -o #{t.name}"
|
25
|
-
end
|
26
|
-
|
27
|
-
task :clean do
|
28
|
-
rm_rf(Dir["**/*~"] +
|
29
|
-
Dir["**/*.diff"] +
|
30
|
-
Dir["coverage.info"] +
|
31
|
-
Dir["coverage"] +
|
32
|
-
Dir["lib/*.output"])
|
33
|
-
end
|
34
|
-
|
35
|
-
def next_num(glob)
|
36
|
-
num = Dir[glob].max[/\d+/].to_i + 1
|
37
|
-
end
|
38
|
-
|
39
|
-
desc "Compares PT to RP and deletes all files that match"
|
40
|
-
task :compare do
|
41
|
-
files = Dir["unit/**/*.rb"]
|
42
|
-
puts "Parsing #{files.size} files"
|
43
|
-
files.each do |file|
|
44
|
-
puts file
|
45
|
-
system "./cmp.rb -q #{file} && rm #{file}"
|
46
|
-
end
|
47
|
-
system 'find -d unit -type d -empty -exec rmdir {} \;'
|
48
|
-
end
|
49
|
-
|
50
|
-
desc "Compares PT to RP and stops on first failure"
|
51
|
-
task :find_bug do
|
52
|
-
files = Dir["unit/**/*.rb"]
|
53
|
-
puts "Parsing #{files.size} files"
|
54
|
-
files.each do |file|
|
55
|
-
puts file
|
56
|
-
sh "./cmp.rb -q #{file}"
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
task :sort do
|
61
|
-
sh 'grepsort "^ +def" lib/ruby_lexer.rb'
|
62
|
-
sh 'grepsort "^ +def (test|util)" test/test_ruby_lexer.rb'
|
63
|
-
end
|
64
|
-
|
65
|
-
task :loc do
|
66
|
-
loc1 = `wc -l ../1.0.0/lib/ruby_lexer.rb`[/\d+/]
|
67
|
-
flog1 = `flog -s ../1.0.0/lib/ruby_lexer.rb`[/\d+\.\d+/]
|
68
|
-
loc2 = `cat lib/ruby_lexer.rb lib/ruby_parser_extras.rb | wc -l`[/\d+/]
|
69
|
-
flog2 = `flog -s lib/ruby_lexer.rb lib/ruby_parser_extras.rb`[/\d+\.\d+/]
|
70
|
-
|
71
|
-
loc1, loc2, flog1, flog2 = loc1.to_i, loc2.to_i, flog1.to_f, flog2.to_f
|
72
|
-
|
73
|
-
puts "1.0.0: loc = #{loc1} flog = #{flog1}"
|
74
|
-
puts "dev : loc = #{loc2} flog = #{flog2}"
|
75
|
-
puts "delta: loc = #{loc2-loc1} flog = #{flog2-flog1}"
|
76
|
-
end
|
77
|
-
|
78
|
-
desc "Validate against all normal files in unit dir"
|
79
|
-
task :validate do
|
80
|
-
sh "./cmp.rb unit/*.rb"
|
81
|
-
end
|
82
|
-
|
83
|
-
def run_and_log cmd, prefix
|
84
|
-
files = ENV['FILES'] || 'unit/*.rb'
|
85
|
-
p, x = prefix, "txt"
|
86
|
-
n = Dir["#{p}.*.#{x}"].map { |s| s[/\d+/].to_i }.max + 1 rescue 1
|
87
|
-
f = "#{p}.#{n}.#{x}"
|
88
|
-
|
89
|
-
sh "#{cmd} #{Hoe::RUBY_FLAGS} bin/ruby_parse -q -g #{files} &> #{f}"
|
90
|
-
|
91
|
-
puts File.read(f)
|
92
|
-
end
|
93
|
-
|
94
|
-
desc "Benchmark against all normal files in unit dir"
|
95
|
-
task :benchmark do
|
96
|
-
run_and_log "ruby", "benchmark"
|
97
|
-
end
|
98
|
-
|
99
|
-
desc "Profile against all normal files in unit dir"
|
100
|
-
task :profile do
|
101
|
-
run_and_log "zenprofile", "profile"
|
102
|
-
end
|
103
|
-
|
104
|
-
desc "what was that command again?"
|
105
|
-
task :huh? do
|
106
|
-
puts "ruby #{Hoe::RUBY_FLAGS} bin/ruby_parse -q -g ..."
|
107
|
-
end
|
108
|
-
|
109
|
-
task :irb => [:isolate] do
|
110
|
-
sh "GEM_HOME=#{Gem.path.first} irb -rubygems -Ilib -rruby_parser;"
|
111
|
-
end
|
112
|
-
|
113
|
-
def (task(:phony)).timestamp
|
114
|
-
Time.at 0
|
115
|
-
end
|
116
|
-
|
117
|
-
task :isolate => :phony
|
118
|
-
|
119
|
-
file "lib/ruby18_parser.rb" => :isolate
|
120
|
-
file "lib/ruby19_parser.rb" => :isolate
|
121
|
-
|
122
|
-
task :compare18 do
|
123
|
-
sh "./yack.rb lib/ruby18_parser.output > racc18.txt"
|
124
|
-
sh "./yack.rb parse18.output > yacc18.txt"
|
125
|
-
sh "diff -du racc18.txt yacc18.txt || true"
|
126
|
-
puts
|
127
|
-
sh "diff -du racc18.txt yacc18.txt | wc -l"
|
128
|
-
end
|
129
|
-
|
130
|
-
task :compare19 do
|
131
|
-
sh "./yack.rb lib/ruby19_parser.output > racc19.txt"
|
132
|
-
sh "./yack.rb parse19.output > yacc19.txt"
|
133
|
-
sh "diff -du racc19.txt yacc19.txt || true"
|
134
|
-
puts
|
135
|
-
sh "diff -du racc19.txt yacc19.txt | wc -l"
|
136
|
-
end
|
137
|
-
|
138
|
-
task :debug => :isolate do
|
139
|
-
ENV["V"] ||= "19"
|
140
|
-
Rake.application[:parser].invoke # this way we can have DEBUG set
|
141
|
-
|
142
|
-
$: << "lib"
|
143
|
-
require 'ruby_parser'
|
144
|
-
require 'pp'
|
145
|
-
|
146
|
-
parser = if ENV["V"] == "18" then
|
147
|
-
Ruby18Parser.new
|
148
|
-
else
|
149
|
-
Ruby19Parser.new
|
150
|
-
end
|
151
|
-
|
152
|
-
time = (ENV["RP_TIMEOUT"] || 10).to_i
|
153
|
-
|
154
|
-
file = ENV["F"] || ENV["FILE"]
|
155
|
-
|
156
|
-
ruby = if file then
|
157
|
-
File.read(file)
|
158
|
-
else
|
159
|
-
file = "env"
|
160
|
-
ENV["R"] || ENV["RUBY"]
|
161
|
-
end
|
162
|
-
|
163
|
-
begin
|
164
|
-
pp parser.process(ruby, file, time)
|
165
|
-
rescue Racc::ParseError => e
|
166
|
-
p e
|
167
|
-
ss = parser.lexer.src
|
168
|
-
src = ss.string
|
169
|
-
lines = src[0..ss.pos].split(/\n/)
|
170
|
-
abort "on #{file}:#{lines.size}"
|
5
|
+
task :test do
|
6
|
+
$LOAD_PATH << File.expand_path('../lib/', __FILE__)
|
7
|
+
Dir["test/test_*.rb"].each do |file|
|
8
|
+
load file
|
171
9
|
end
|
172
10
|
end
|
173
11
|
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
task :extract => :isolate do
|
180
|
-
ENV["V"] ||= "19"
|
181
|
-
Rake.application[:parser].invoke # this way we can have DEBUG set
|
12
|
+
desc "Generate the Ragel lexer and Bison parser."
|
13
|
+
task :generate => %w(lib/parser/lexer.rb)
|
14
|
+
#lib/parser/ruby18.rb
|
15
|
+
#lib/parser/ruby19.rb)
|
182
16
|
|
183
|
-
|
17
|
+
task :build => :generate
|
184
18
|
|
185
|
-
|
19
|
+
rule '.rb' => '.rl' do |t|
|
20
|
+
sh "ragel -R #{t.source} -o #{t.name}"
|
186
21
|
end
|
187
22
|
|
188
|
-
|
189
|
-
sh "
|
23
|
+
rule '.rb' => '.y' do |t|
|
24
|
+
sh "racc #{t.source} -o #{t.name} -O"
|
190
25
|
end
|
191
|
-
|
192
|
-
# vim: syntax=Ruby
|
data/lib/parser.rb
ADDED
data/lib/parser/lexer.rl
ADDED
@@ -0,0 +1,1713 @@
|
|
1
|
+
%%machine lex; # % fix highlighting
|
2
|
+
|
3
|
+
#
|
4
|
+
# === BEFORE YOU START ===
|
5
|
+
#
|
6
|
+
# Remember two things about Ragel scanners:
|
7
|
+
#
|
8
|
+
# 1) Longest match wins.
|
9
|
+
#
|
10
|
+
# 2) If two matches have the same length, the first
|
11
|
+
# in source code wins.
|
12
|
+
#
|
13
|
+
# General rules of making Ragel and Bison happy:
|
14
|
+
#
|
15
|
+
# * `p` (position) and `@te` contain the index of the character
|
16
|
+
# they're pointing to ("current"), plus one. `@ts` contains the index
|
17
|
+
# of the corresponding character. The code for extracting matched token is:
|
18
|
+
#
|
19
|
+
# @source[@ts...@te]
|
20
|
+
#
|
21
|
+
# * If your input is `foooooooobar` and the rule is:
|
22
|
+
#
|
23
|
+
# 'f' 'o'+
|
24
|
+
#
|
25
|
+
# the result will be:
|
26
|
+
#
|
27
|
+
# foooooooobar
|
28
|
+
# ^ ts=0 ^ p=te=9
|
29
|
+
#
|
30
|
+
# * A Ragel lexer action should not emit more than one token, unless
|
31
|
+
# you know what you are doing.
|
32
|
+
#
|
33
|
+
# * All Ragel commands (fnext, fgoto, ...) end with a semicolon.
|
34
|
+
#
|
35
|
+
# * If an action emits the token and transitions to another state, use
|
36
|
+
# these Ragel commands:
|
37
|
+
#
|
38
|
+
# emit($whatever)
|
39
|
+
# fnext $next_state; fbreak;
|
40
|
+
#
|
41
|
+
# * If an action does not emit a token:
|
42
|
+
#
|
43
|
+
# fgoto $next_state;
|
44
|
+
#
|
45
|
+
# * If an action features lookbehind, i.e. matches characters with the
|
46
|
+
# intent of passing them to another action:
|
47
|
+
#
|
48
|
+
# p = @ts - 1
|
49
|
+
# fgoto $next_state;
|
50
|
+
#
|
51
|
+
# or, if the lookbehind consists of a single character:
|
52
|
+
#
|
53
|
+
# fhold; fgoto $next_state;
|
54
|
+
#
|
55
|
+
# * Ragel merges actions. So, if you have `e_lparen = '(' %act` and
|
56
|
+
# `c_lparen = '('` and a lexer action `e_lparen | c_lparen`, the result
|
57
|
+
# _will_ invoke the action `act`.
|
58
|
+
#
|
59
|
+
# * EOF is explicit and is matched by `c_eof`. If you want to introspect
|
60
|
+
# the state of the lexer, add this rule to the state:
|
61
|
+
#
|
62
|
+
# c_eof => do_eof;
|
63
|
+
#
|
64
|
+
# * If you proceed past EOF, the lexer will complain:
|
65
|
+
#
|
66
|
+
# NoMethodError: undefined method `ord' for nil:NilClass
|
67
|
+
#
|
68
|
+
|
69
|
+
require 'parser/lexer_literal'
|
70
|
+
require 'parser/syntax_error'
|
71
|
+
|
72
|
+
class Parser::Lexer
|
73
|
+
|
74
|
+
%% write data nofinal;
|
75
|
+
# %
|
76
|
+
|
77
|
+
attr_reader :source
|
78
|
+
attr_accessor :static_env
|
79
|
+
|
80
|
+
attr_reader :location, :comments
|
81
|
+
|
82
|
+
def initialize(version)
|
83
|
+
@version = version
|
84
|
+
|
85
|
+
reset
|
86
|
+
end
|
87
|
+
|
88
|
+
def reset(reset_state=true)
|
89
|
+
if reset_state
|
90
|
+
# Unit tests set state prior to resetting lexer.
|
91
|
+
@cs = self.class.lex_en_line_begin
|
92
|
+
end
|
93
|
+
|
94
|
+
# Ragel-internal variables:
|
95
|
+
@p = 0 # stream position (saved manually in #advance)
|
96
|
+
@ts = nil # token start
|
97
|
+
@te = nil # token end
|
98
|
+
@act = 0 # next action
|
99
|
+
|
100
|
+
@stack = [] # state stack
|
101
|
+
@top = 0 # state stack top pointer
|
102
|
+
|
103
|
+
@token_queue = []
|
104
|
+
@literal_stack = []
|
105
|
+
|
106
|
+
@newlines = [0] # sorted set of \n positions
|
107
|
+
@newline_s = nil # location of last encountered newline
|
108
|
+
@location = nil # location of last #advance'd token
|
109
|
+
|
110
|
+
@comments = "" # collected comments
|
111
|
+
|
112
|
+
@num_base = nil # last numeric base
|
113
|
+
@num_digits_s = nil # starting position of numeric digits
|
114
|
+
|
115
|
+
@escape_s = nil # starting position of current sequence
|
116
|
+
@escape = nil # last escaped sequence, as string
|
117
|
+
|
118
|
+
# See below the section on parsing heredocs.
|
119
|
+
@heredoc_e = nil
|
120
|
+
@herebody_s = nil
|
121
|
+
|
122
|
+
# Ruby 1.9 ->() lambdas emit a distinct token if do/{ is
|
123
|
+
# encountered after a matching closing parenthesis.
|
124
|
+
@paren_nest = 0
|
125
|
+
@lambda_stack = []
|
126
|
+
end
|
127
|
+
|
128
|
+
def source=(source)
|
129
|
+
# Heredoc processing coupled with weird newline quirks
|
130
|
+
# require three '\0' (EOF) chars to be appended; after
|
131
|
+
# `p = @herebody_s`, if `p` points at EOF, the FSM could
|
132
|
+
# not bail out early enough and will crash.
|
133
|
+
#
|
134
|
+
# Patches accepted.
|
135
|
+
#
|
136
|
+
@source = source.gsub(/\r\n/, "\n") + "\0\0\0"
|
137
|
+
end
|
138
|
+
|
139
|
+
LEX_STATES = {
|
140
|
+
:line_begin => lex_en_line_begin,
|
141
|
+
:expr_beg => lex_en_expr_beg,
|
142
|
+
:expr_value => lex_en_expr_value,
|
143
|
+
:expr_mid => lex_en_expr_mid,
|
144
|
+
:expr_dot => lex_en_expr_dot,
|
145
|
+
:expr_fname => lex_en_expr_fname,
|
146
|
+
:expr_end => lex_en_expr_end,
|
147
|
+
:expr_arg => lex_en_expr_arg,
|
148
|
+
:expr_endarg => lex_en_expr_endarg,
|
149
|
+
}
|
150
|
+
|
151
|
+
def state
|
152
|
+
LEX_STATES.invert.fetch(@cs, @cs)
|
153
|
+
end
|
154
|
+
|
155
|
+
def state=(state)
|
156
|
+
@cs = LEX_STATES.fetch(state)
|
157
|
+
end
|
158
|
+
|
159
|
+
# Return next token: [type, value].
|
160
|
+
def advance
|
161
|
+
if @token_queue.any?
|
162
|
+
return with_location(@token_queue.shift)
|
163
|
+
end
|
164
|
+
|
165
|
+
# Ugly, but dependent on Ragel output. Consider refactoring it somehow.
|
166
|
+
_lex_trans_keys = self.class.send :_lex_trans_keys
|
167
|
+
_lex_actions = self.class.send :_lex_actions
|
168
|
+
_lex_key_offsets = self.class.send :_lex_key_offsets
|
169
|
+
_lex_index_offsets = self.class.send :_lex_index_offsets
|
170
|
+
_lex_single_lengths = self.class.send :_lex_single_lengths
|
171
|
+
_lex_range_lengths = self.class.send :_lex_range_lengths
|
172
|
+
_lex_indicies = self.class.send :_lex_indicies
|
173
|
+
_lex_trans_targs = self.class.send :_lex_trans_targs
|
174
|
+
_lex_trans_actions = self.class.send :_lex_trans_actions
|
175
|
+
_lex_to_state_actions = self.class.send :_lex_to_state_actions
|
176
|
+
_lex_from_state_actions = self.class.send :_lex_from_state_actions
|
177
|
+
|
178
|
+
p, pe, eof = @p, @source.length + 1, nil
|
179
|
+
|
180
|
+
%% write exec;
|
181
|
+
# %
|
182
|
+
|
183
|
+
@p = p
|
184
|
+
|
185
|
+
if @token_queue.any?
|
186
|
+
with_location(@token_queue.shift)
|
187
|
+
elsif @cs == self.class.lex_error
|
188
|
+
with_location([ false, '$undefined', p, p + 1 ])
|
189
|
+
else
|
190
|
+
with_location([ false, '$end', p, p + 1 ])
|
191
|
+
end
|
192
|
+
end
|
193
|
+
|
194
|
+
# Like #advance, but also pretty-print the token and its position
|
195
|
+
# in the stream to `stdout`.
|
196
|
+
def advance_and_decorate
|
197
|
+
type, val = advance
|
198
|
+
|
199
|
+
puts decorate(location, "\e[0;32m#{type} #{val.inspect}\e[0m")
|
200
|
+
|
201
|
+
[type, val]
|
202
|
+
end
|
203
|
+
|
204
|
+
# Return the current collected comment block and clear the storage.
|
205
|
+
def clear_comments
|
206
|
+
comments = @comments
|
207
|
+
@comments = ""
|
208
|
+
|
209
|
+
comments
|
210
|
+
end
|
211
|
+
|
212
|
+
# Lex `source` for the Ruby version `version` with initial state `state`.
|
213
|
+
#
|
214
|
+
# The tokens displayed by this function are not the same as tokens
|
215
|
+
# consumed by parser, because the parser manipulates lexer state on
|
216
|
+
# its own.
|
217
|
+
def self.do(source, state=nil, version=19)
|
218
|
+
lex = new(version)
|
219
|
+
lex.source = source
|
220
|
+
lex.state = state if state
|
221
|
+
|
222
|
+
loop do
|
223
|
+
type, val = lex.advance_and_decorate
|
224
|
+
break if !type
|
225
|
+
end
|
226
|
+
|
227
|
+
puts "Lex state: #{lex.state}"
|
228
|
+
end
|
229
|
+
|
230
|
+
# Used by LexerLiteral to emit tokens for string content.
|
231
|
+
def emit(type, value = tok, s = @ts, e = @te)
|
232
|
+
if s.nil? || e.nil?
|
233
|
+
raise "broken #emit invocation in #{caller[0]}"
|
234
|
+
end
|
235
|
+
|
236
|
+
@token_queue << [ type, value, s, e ]
|
237
|
+
end
|
238
|
+
|
239
|
+
def emit_table(table, s = @ts, e = @te)
|
240
|
+
token = tok(s, e)
|
241
|
+
emit(table[token], token, s, e)
|
242
|
+
end
|
243
|
+
|
244
|
+
# shim
|
245
|
+
def lineno
|
246
|
+
@location[0] + 1
|
247
|
+
end
|
248
|
+
|
249
|
+
protected
|
250
|
+
|
251
|
+
def eof_char?(char)
|
252
|
+
[0x04, 0x1a, 0x00].include? char.ord
|
253
|
+
end
|
254
|
+
|
255
|
+
def ruby18?
|
256
|
+
@version == 18
|
257
|
+
end
|
258
|
+
|
259
|
+
def ruby19?
|
260
|
+
@version == 19
|
261
|
+
end
|
262
|
+
|
263
|
+
def tok(s = @ts, e = @te)
|
264
|
+
@source[s...e]
|
265
|
+
end
|
266
|
+
|
267
|
+
def record_newline(p)
|
268
|
+
@newlines = (@newlines + [p]).uniq.sort
|
269
|
+
end
|
270
|
+
|
271
|
+
def dissect_location(start, finish)
|
272
|
+
line_number = @newlines.rindex { |nl| start >= nl }
|
273
|
+
line_first_col = @newlines[line_number]
|
274
|
+
|
275
|
+
start_col = start - line_first_col
|
276
|
+
finish_col = finish - line_first_col
|
277
|
+
|
278
|
+
[ line_number, start_col, finish_col ]
|
279
|
+
end
|
280
|
+
|
281
|
+
def with_location(item)
|
282
|
+
type, value, start, finish = *item
|
283
|
+
|
284
|
+
@location = dissect_location(start, finish)
|
285
|
+
|
286
|
+
[ type, value ]
|
287
|
+
end
|
288
|
+
|
289
|
+
def decorate(location, message="")
|
290
|
+
line_number, from, to = location
|
291
|
+
|
292
|
+
line = @source.lines.drop(line_number).first
|
293
|
+
line[from...to] = "\e[4m#{line[from...to]}\e[0m"
|
294
|
+
|
295
|
+
tail_len = to - from - 1
|
296
|
+
tail = "~" * (tail_len >= 0 ? tail_len : 0)
|
297
|
+
decoration = "#{" " * from}\e[1;31m^#{tail}\e[0m #{message}"
|
298
|
+
|
299
|
+
[ line, decoration ]
|
300
|
+
end
|
301
|
+
|
302
|
+
def warning(message, start = @ts, finish = @te)
|
303
|
+
$stderr.puts "warning: #{message}"
|
304
|
+
$stderr.puts decorate(dissect_location(start, finish))
|
305
|
+
end
|
306
|
+
|
307
|
+
def error(message)
|
308
|
+
raise Parser::SyntaxError, message
|
309
|
+
end
|
310
|
+
|
311
|
+
#
|
312
|
+
# === LITERAL STACK ===
|
313
|
+
#
|
314
|
+
|
315
|
+
def push_literal(*args)
|
316
|
+
new_literal = Parser::LexerLiteral.new(self, *args)
|
317
|
+
@literal_stack.push(new_literal)
|
318
|
+
|
319
|
+
if new_literal.type == :tWORDS_BEG
|
320
|
+
self.class.lex_en_interp_words
|
321
|
+
elsif new_literal.type == :tQWORDS_BEG
|
322
|
+
self.class.lex_en_plain_words
|
323
|
+
elsif new_literal.interpolate?
|
324
|
+
self.class.lex_en_interp_string
|
325
|
+
else
|
326
|
+
self.class.lex_en_plain_string
|
327
|
+
end
|
328
|
+
end
|
329
|
+
|
330
|
+
def literal
|
331
|
+
@literal_stack[-1]
|
332
|
+
end
|
333
|
+
|
334
|
+
def pop_literal
|
335
|
+
old_literal = @literal_stack.pop
|
336
|
+
|
337
|
+
if old_literal.type == :tREGEXP_BEG
|
338
|
+
# Fetch modifiers.
|
339
|
+
self.class.lex_en_regexp_modifiers
|
340
|
+
else
|
341
|
+
self.class.lex_en_expr_end
|
342
|
+
end
|
343
|
+
end
|
344
|
+
|
345
|
+
# Mapping of strings to parser tokens.
|
346
|
+
|
347
|
+
PUNCTUATION = {
|
348
|
+
'=' => :tEQL, '&' => :tAMPER2, '|' => :tPIPE,
|
349
|
+
'!' => :tBANG, '^' => :tCARET, '+' => :tPLUS,
|
350
|
+
'-' => :tMINUS, '*' => :tSTAR2, '/' => :tDIVIDE,
|
351
|
+
'%' => :tPERCENT, '~' => :tTILDE, ',' => :tCOMMA,
|
352
|
+
';' => :tSEMI, '.' => :tDOT, '..' => :tDOT2,
|
353
|
+
'...' => :tDOT3, '[' => :tLBRACK2, ']' => :tRBRACK,
|
354
|
+
'(' => :tLPAREN2, ')' => :tRPAREN, '?' => :tEH,
|
355
|
+
':' => :tCOLON, '&&' => :tANDOP, '||' => :tOROP,
|
356
|
+
'-@' => :tUMINUS, '+@' => :tUPLUS, '~@' => :tTILDE,
|
357
|
+
'**' => :tPOW, '->' => :tLAMBDA, '=~' => :tMATCH,
|
358
|
+
'!~' => :tNMATCH, '==' => :tEQ, '!=' => :tNEQ,
|
359
|
+
'>' => :tGT, '>>' => :tRSHFT, '>=' => :tGEQ,
|
360
|
+
'<' => :tLT, '<<' => :tLSHFT, '<=' => :tLEQ,
|
361
|
+
'=>' => :tASSOC, '::' => :tCOLON2, '===' => :tEQQ,
|
362
|
+
'<=>' => :tCMP, '[]' => :tAREF, '[]=' => :tASET,
|
363
|
+
'{' => :tLCURLY, '}' => :tRCURLY, '`' => :tBACK_REF2,
|
364
|
+
'do' => :kDO
|
365
|
+
}
|
366
|
+
|
367
|
+
PUNCTUATION_BEGIN = {
|
368
|
+
'&' => :tAMPER, '*' => :tSTAR, '+' => :tUPLUS,
|
369
|
+
'-' => :tUMINUS, '::' => :tCOLON3, '(' => :tLPAREN,
|
370
|
+
'{' => :tLBRACE, '[' => :tLBRACK,
|
371
|
+
}
|
372
|
+
|
373
|
+
KEYWORDS = {
|
374
|
+
'if' => :kIF_MOD, 'unless' => :kUNLESS_MOD,
|
375
|
+
'while' => :kWHILE_MOD, 'until' => :kUNTIL_MOD,
|
376
|
+
'rescue' => :kRESCUE_MOD, 'defined?' => :kDEFINED,
|
377
|
+
'BEGIN' => :klBEGIN, 'END' => :klEND,
|
378
|
+
}
|
379
|
+
|
380
|
+
%w(class module def undef begin end then elsif else ensure case when
|
381
|
+
for break next redo retry in do return yield super self nil true
|
382
|
+
false and or not alias __FILE__ __LINE__ __ENCODING__).each do |keyword|
|
383
|
+
KEYWORDS[keyword] = :"k#{keyword.upcase}"
|
384
|
+
end
|
385
|
+
|
386
|
+
KEYWORDS_BEGIN = {
|
387
|
+
'if' => :kIF, 'unless' => :kUNLESS,
|
388
|
+
'while' => :kWHILE, 'until' => :kUNTIL,
|
389
|
+
'rescue' => :kRESCUE
|
390
|
+
}
|
391
|
+
|
392
|
+
%%{
|
393
|
+
# %
|
394
|
+
|
395
|
+
access @;
|
396
|
+
getkey @source[p].ord;
|
397
|
+
|
398
|
+
# === CHARACTER CLASSES ===
|
399
|
+
#
|
400
|
+
# Pay close attention to the differences between c_any and any.
|
401
|
+
# c_any does not include EOF and so will cause incorrect behavior
|
402
|
+
# for machine subtraction (any-except rules) and default transitions
|
403
|
+
# for scanners.
|
404
|
+
|
405
|
+
action do_nl {
|
406
|
+
# Record position of a newline for precise line and column reporting.
|
407
|
+
#
|
408
|
+
# This action is embedded directly into c_nl, as it is idempotent and
|
409
|
+
# there are no cases when we need to skip it.
|
410
|
+
record_newline(p + 1)
|
411
|
+
@newline_s = p
|
412
|
+
}
|
413
|
+
|
414
|
+
c_nl = '\n' $ do_nl;
|
415
|
+
c_space = [ \t\r\f\v];
|
416
|
+
c_space_nl = c_space | c_nl;
|
417
|
+
c_eof = 0x04 | 0x1a | 0; # ^D, ^Z, EOF
|
418
|
+
c_eol = c_nl | c_eof;
|
419
|
+
c_any = any - c_eof - zlen;
|
420
|
+
c_line = c_any - c_nl;
|
421
|
+
|
422
|
+
c_unicode = c_any - 0x00..0x7f;
|
423
|
+
c_lower = [a-z_] | c_unicode;
|
424
|
+
c_upper = [A-Z] | c_unicode;
|
425
|
+
c_alpha = c_lower | c_upper;
|
426
|
+
c_alnum = c_alpha | [0-9];
|
427
|
+
|
428
|
+
action do_eof {
|
429
|
+
# Sit at EOF indefinitely. #advance would return $end each time.
|
430
|
+
# This allows feeding the lexer more data if needed; this is only used
|
431
|
+
# in tests.
|
432
|
+
#
|
433
|
+
# Note that this action is not embedded into e_eof like e_nl and e_bs
|
434
|
+
# below. This is due to the fact that scanner state at EOF is observed
|
435
|
+
# by tests, and encapsulating it in a rule would break the introspection.
|
436
|
+
fhold; fbreak;
|
437
|
+
}
|
438
|
+
|
439
|
+
#
|
440
|
+
# === TOKEN DEFINITIONS ===
|
441
|
+
#
|
442
|
+
|
443
|
+
# All operators are punctuation. There is more to punctuation
|
444
|
+
# than just operators. Operators can be overridden by user;
|
445
|
+
# punctuation can not.
|
446
|
+
|
447
|
+
# A list of operators which are valid in the function name context, but
|
448
|
+
# have different semantics in others.
|
449
|
+
operator_fname = '[]' | '[]=' | '`' | '-@' | '+@' | '~@' ;
|
450
|
+
|
451
|
+
# A list of operators which can occur within an assignment shortcut (+ → +=).
|
452
|
+
operator_arithmetic = '&' | '|' | '&&' | '||' | '^' | '+' | '-' |
|
453
|
+
'*' | '/' | '**' | '~' | '**' | '<<' | '>>' |
|
454
|
+
'%' ;
|
455
|
+
|
456
|
+
# A list of all user-definable operators not covered by groups above.
|
457
|
+
operator_rest = '=~' | '!~' | '==' | '!=' | '!' | '===' |
|
458
|
+
'<' | '<=' | '>' | '>=' | '<=>' | '=>' ;
|
459
|
+
|
460
|
+
# Note that `{` and `}` need to be referred to as e_lbrace and e_rbrace,
|
461
|
+
# as they are ambiguous with interpolation `#{}` and should be counted.
|
462
|
+
# These braces are not present in punctuation lists.
|
463
|
+
|
464
|
+
# A list of punctuation which has different meaning when used at the
|
465
|
+
# beginning of expression.
|
466
|
+
punctuation_begin = '-' | '+' | '::' | '(' | '[' | '*' | '&' ;
|
467
|
+
|
468
|
+
# A list of all punctuation except punctuation_begin.
|
469
|
+
punctuation_end = ',' | '=' | '->' | '(' | '[' | ']' |
|
470
|
+
'::' | '?' | ':' | '.' | '..' | '...' ;
|
471
|
+
|
472
|
+
# A list of keywords which have different meaning at the beginning of expression.
|
473
|
+
keyword_modifier = 'if' | 'unless' | 'while' | 'until' | 'rescue' ;
|
474
|
+
|
475
|
+
# A list of keywords which accept an argument-like expression, i.e. have the
|
476
|
+
# same post-processing as method calls or commands. Example: `yield 1`,
|
477
|
+
# `yield (1)`, `yield(1)`, are interpreted as if `yield` was a function.
|
478
|
+
keyword_with_arg = 'yield' | 'super' | 'not' | 'defined?' ;
|
479
|
+
|
480
|
+
# A list of keywords which accept a literal function name as an argument.
|
481
|
+
keyword_with_fname = 'def' | 'undef' | 'alias' ;
|
482
|
+
|
483
|
+
# A list of keywords which accept an expression after them.
|
484
|
+
keyword_with_value = 'else' | 'case' | 'ensure' | 'module' | 'elsif' | 'then' |
|
485
|
+
'for' | 'in' | 'do' | 'when' | 'begin' | 'class' |
|
486
|
+
'and' | 'or' ;
|
487
|
+
|
488
|
+
# A list of keywords which accept a value, and treat the keywords from
|
489
|
+
# `keyword_modifier` list as modifiers.
|
490
|
+
keyword_with_mid = 'rescue' | 'return' | 'break' | 'next' ;
|
491
|
+
|
492
|
+
# A list of keywords which do not accept an expression after them.
|
493
|
+
keyword_with_end = 'end' | 'self' | 'true' | 'false' | 'retry' |
|
494
|
+
'redo' | 'nil' | 'BEGIN' | 'END' | '__FILE__' |
|
495
|
+
'__LINE__' | '__ENCODING__';
|
496
|
+
|
497
|
+
# All keywords.
|
498
|
+
keyword = keyword_with_value | keyword_with_mid |
|
499
|
+
keyword_with_end | keyword_with_arg |
|
500
|
+
keyword_with_fname | keyword_modifier ;
|
501
|
+
|
502
|
+
constant = [A-Z] c_alnum*;
|
503
|
+
bareword = c_alpha c_alnum*;
|
504
|
+
|
505
|
+
call_or_var = c_lower c_alnum*;
|
506
|
+
class_var = '@@' bareword;
|
507
|
+
instance_var = '@' bareword;
|
508
|
+
global_var = '$'
|
509
|
+
( bareword | digit+
|
510
|
+
| [`'+~*$&?!@/\\;,.=:<>"] # `
|
511
|
+
| '-' [A-Za-z0-9_]?
|
512
|
+
)
|
513
|
+
;
|
514
|
+
|
515
|
+
# Ruby accepts (and fails on) variables with leading digit
|
516
|
+
# in literal context, but not in unquoted symbol body.
|
517
|
+
class_var_v = '@@' [0-9]? bareword;
|
518
|
+
instance_var_v = '@' [0-9]? bareword;
|
519
|
+
|
520
|
+
#
|
521
|
+
# === ESCAPE SEQUENCE PARSING ===
|
522
|
+
#
|
523
|
+
|
524
|
+
# Escape parsing code is a Ragel pattern, not a scanner, and therefore
|
525
|
+
# it shouldn't directly raise errors or perform other actions with side effects.
|
526
|
+
# In reality this would probably just mess up error reporting in pathological
|
527
|
+
# cases, though.
|
528
|
+
|
529
|
+
# The amount of code required to parse \M\C stuff correctly is ridiculous.
|
530
|
+
|
531
|
+
escaped_nl = "\\" c_nl;
|
532
|
+
|
533
|
+
action unicode_points {
|
534
|
+
@escape = ""
|
535
|
+
|
536
|
+
codepoints = tok(@escape_s + 2, p - 1)
|
537
|
+
codepoints.split(/[ \t]/).each do |codepoint_str|
|
538
|
+
codepoint = codepoint_str.to_i(16)
|
539
|
+
|
540
|
+
if codepoint >= 0x110000
|
541
|
+
@escape = lambda { error "invalid Unicode codepoint (too large)" }
|
542
|
+
break
|
543
|
+
end
|
544
|
+
|
545
|
+
@escape += codepoint.chr(Encoding::UTF_8)
|
546
|
+
end
|
547
|
+
}
|
548
|
+
|
549
|
+
action unescape_char {
|
550
|
+
@escape = {
|
551
|
+
'a' => "\a", 'b' => "\b", 'e' => "\e", 'f' => "\f",
|
552
|
+
'n' => "\n", 'r' => "\r", 's' => "\s", 't' => "\t",
|
553
|
+
'v' => "\v", '\\' => "\\"
|
554
|
+
}.fetch(@source[p - 1], @source[p - 1])
|
555
|
+
}
|
556
|
+
|
557
|
+
action invalid_complex_escape {
|
558
|
+
@escape = lambda { error "invalid escape character syntax" }
|
559
|
+
}
|
560
|
+
|
561
|
+
action slash_c_char {
|
562
|
+
@escape = (@escape.ord & 0x9f).chr
|
563
|
+
}
|
564
|
+
|
565
|
+
action slash_m_char {
|
566
|
+
@escape = (@escape.ord | 0x80).chr
|
567
|
+
}
|
568
|
+
|
569
|
+
maybe_escaped_char = (
|
570
|
+
'\\' c_any %unescape_char
|
571
|
+
| ( c_any - [\\] ) % { @escape = @source[p - 1] }
|
572
|
+
);
|
573
|
+
|
574
|
+
maybe_escaped_ctrl_char = ( # why?!
|
575
|
+
'\\' c_any %unescape_char %slash_c_char
|
576
|
+
| '?' % { @escape = "\x7f" }
|
577
|
+
| ( c_any - [\\?] ) % { @escape = @source[p - 1] } %slash_c_char
|
578
|
+
);
|
579
|
+
|
580
|
+
escape = (
|
581
|
+
# \377 -- up to three octal digits. Integer#chr raises RangeError above 255,
|
581
|
+
[0-7]{1,3}
|
582
|
+
% { @escape = (tok(@escape_s, p).to_i(8) % 0x100).chr } # so \400..\777 keep only the low byte
|
584
|
+
|
585
|
+
# \xff
|
586
|
+
| ( 'x' xdigit{1,2}
|
587
|
+
% { @escape = tok(@escape_s + 1, p).to_i(16).chr }
|
588
|
+
# \u263a
|
589
|
+
| 'u' xdigit{4}
|
590
|
+
% { @escape = tok(@escape_s + 1, p).to_i(16).chr(Encoding::UTF_8) }
|
591
|
+
)
|
592
|
+
|
593
|
+
# %q[\x]
|
594
|
+
| 'x' ( c_any - xdigit )
|
595
|
+
% { @escape = lambda { error "invalid hex escape" } }
|
596
|
+
|
597
|
+
# %q[\u123] %q[\u{12]
|
598
|
+
| 'u' ( c_any{0,4} -
|
599
|
+
xdigit{4} - # \u1234 is valid
|
600
|
+
( '{' xdigit{1,3} # \u{1 \u{12 \u{123 are valid
|
601
|
+
| '{' xdigit [ \t}] # \u{1. \u{1} are valid
|
602
|
+
| '{' xdigit{2} [ \t}] # \u{12. \u{12} are valid
|
603
|
+
)
|
604
|
+
)
|
605
|
+
% { @escape = lambda { error "invalid Unicode escape" } }
|
606
|
+
|
607
|
+
# \u{123 456}
|
608
|
+
| 'u{' ( xdigit{1,6} [ \t] )*
|
609
|
+
( xdigit{1,6} '}'
|
610
|
+
%unicode_points
|
611
|
+
| ( xdigit* ( c_any - xdigit - '}' )+ '}'
|
612
|
+
| ( c_any - '}' )* c_eof
|
613
|
+
| xdigit{7,}
|
614
|
+
) % { @escape = lambda { error "unterminated Unicode escape" } }
|
615
|
+
)
|
616
|
+
|
617
|
+
# \C-\a \cx
|
618
|
+
| ( 'C-' | 'c' ) escaped_nl?
|
619
|
+
maybe_escaped_ctrl_char
|
620
|
+
|
621
|
+
# \M-a
|
622
|
+
| 'M-' escaped_nl?
|
623
|
+
maybe_escaped_char
|
624
|
+
%slash_m_char
|
625
|
+
|
626
|
+
# \C-\M-f \M-\cf \c\M-f
|
627
|
+
| ( ( 'C-' | 'c' ) escaped_nl? '\\M-'
|
628
|
+
| 'M-\\' escaped_nl? ( 'C-' | 'c' ) ) escaped_nl?
|
629
|
+
maybe_escaped_ctrl_char
|
630
|
+
%slash_m_char
|
631
|
+
|
632
|
+
| 'C' c_any %invalid_complex_escape
|
633
|
+
| 'M' c_any %invalid_complex_escape
|
634
|
+
| ( 'M-\\C' | 'C-\\M' | 'cM' ) c_any %invalid_complex_escape
|
635
|
+
|
636
|
+
| ( c_any - [0-7xuCMc] ) %unescape_char
|
637
|
+
|
638
|
+
| c_eof % { error "escape sequence meets end of file" }
|
639
|
+
);
|
640
|
+
|
641
|
+
# Use rules in form of `e_bs escape' when you need to parse a sequence.
|
642
|
+
e_bs = '\\' % {
|
643
|
+
@escape_s = p
|
644
|
+
@escape = nil
|
645
|
+
};
|
646
|
+
|
647
|
+
#
|
648
|
+
# === STRING AND HEREDOC PARSING ===
|
649
|
+
#
|
650
|
+
|
651
|
+
# Heredoc parsing is quite a complex topic. First, consider that heredocs
|
652
|
+
# can be arbitrarily nested. For example:
|
653
|
+
#
|
654
|
+
# puts <<CODE
|
655
|
+
# the result is: #{<<RESULT.inspect
|
656
|
+
# i am a heredoc
|
657
|
+
# RESULT
|
658
|
+
# }
|
659
|
+
# CODE
|
660
|
+
#
|
661
|
+
# which, incidentally, evaluates to:
|
662
|
+
#
|
663
|
+
# the result is: " i am a heredoc\n"
|
664
|
+
#
|
665
|
+
# To parse them, lexer refers to two kinds (remember, nested heredocs)
|
666
|
+
# of positions in the input stream, namely @heredoc_e
|
667
|
+
# (HEREDOC declaration End) and @herebody_s (HEREdoc BODY line Start).
|
668
|
+
#
|
669
|
+
# @heredoc_e is simply contained inside the corresponding LexerLiteral, and
|
670
|
+
# when the heredoc is closed, the lexing is restarted from that position.
|
671
|
+
#
|
672
|
+
# @herebody_s is considerably more complex. First, @herebody_s changes after each
|
673
|
+
# heredoc line is lexed. This way, at '\n' tok(@herebody_s, @te) always
|
674
|
+
# contains the current line, and also when a heredoc is started, @herebody_s
|
675
|
+
# contains the position from which the heredoc will be lexed.
|
676
|
+
#
|
677
|
+
# Second, as (insanity) there are nested heredocs, we need to maintain a
|
678
|
+
# stack of these positions. Each time #push_literal is called, it saves current
|
679
|
+
# @herebody_s to literal.saved_herebody_s, and after an interpolation (possibly
|
680
|
+
# containing other heredocs) is closed, the previous value is restored.
|
681
|
+
|
682
|
+
e_heredoc_nl = c_nl $ {
|
683
|
+
# After every heredoc was parsed, @herebody_s contains the
|
684
|
+
# position of next token after all heredocs.
|
685
|
+
if @herebody_s
|
686
|
+
p = @herebody_s
|
687
|
+
@herebody_s = nil
|
688
|
+
end
|
689
|
+
};
|
690
|
+
|
691
|
+
action extend_string {
|
692
|
+
if literal.nest_and_try_closing tok, @ts, @te
|
693
|
+
fgoto *pop_literal;
|
694
|
+
else
|
695
|
+
literal.extend_string tok, @ts, @te
|
696
|
+
end
|
697
|
+
}
|
698
|
+
|
699
|
+
action extend_string_escaped {
|
700
|
+
if literal.nest_and_try_closing('\\', @ts, @ts + 1)
|
701
|
+
# If the literal is actually closed by the backslash,
|
702
|
+
# rewind the input prior to consuming the escape sequence.
|
703
|
+
p = @escape_s - 1
|
704
|
+
fgoto *pop_literal;
|
705
|
+
else
|
706
|
+
# Get the first character after the backslash.
|
707
|
+
escaped_char = @source[@escape_s]
|
708
|
+
|
709
|
+
if literal.munge_escape? escaped_char
|
710
|
+
# If this particular literal uses this character as an opening
|
711
|
+
# or closing delimiter, it is an escape sequence for that
|
712
|
+
# particular character. Write it without the backslash.
|
713
|
+
|
714
|
+
if literal.regexp?
|
715
|
+
# Regular expressions should have every escape sequence in its
|
716
|
+
# raw form.
|
717
|
+
literal.extend_string(tok, @ts, @te)
|
718
|
+
else
|
719
|
+
literal.extend_string(escaped_char, @ts, @te)
|
720
|
+
end
|
721
|
+
else
|
722
|
+
# It does not. So this is an actual escape sequence, yay!
|
723
|
+
# Two things to consider here.
|
724
|
+
#
|
725
|
+
# 1. The `escape' rule should be pure and so won't raise any
|
726
|
+
# errors by itself. Instead, it stores them in lambdas.
|
727
|
+
#
|
728
|
+
# 2. Non-interpolated literals do not go through the aforementioned
|
729
|
+
# rule. As \\ and \' (and variants) are munged, the full token
|
730
|
+
# should always be written for such literals.
|
731
|
+
|
732
|
+
@escape.call if @escape.respond_to? :call
|
733
|
+
|
734
|
+
if literal.regexp?
|
735
|
+
# Ditto. Also, expand escaped newlines.
|
736
|
+
literal.extend_string(tok.gsub("\\\n", ''), @ts, @te)
|
737
|
+
else
|
738
|
+
literal.extend_string(@escape || tok, @ts, @te)
|
739
|
+
end
|
740
|
+
end
|
741
|
+
end
|
742
|
+
}
|
743
|
+
|
744
|
+
# Extend a string with a newline or an EOF character.
|
745
|
+
# As heredoc closing line can immediately precede EOF, this action
|
746
|
+
# has to handle such case specially.
|
747
|
+
action extend_string_eol {
|
748
|
+
is_eof = eof_char? @source[p]
|
749
|
+
|
750
|
+
if literal.heredoc?
|
751
|
+
# Try ending the heredoc with the complete most recently
|
752
|
+
# scanned line. @herebody_s always refers to the start of such line.
|
753
|
+
if literal.nest_and_try_closing(tok(@herebody_s, @te - 1),
|
754
|
+
@herebody_s, @te - 1)
|
755
|
+
# Adjust @herebody_s to point to the next line.
|
756
|
+
@herebody_s = @te
|
757
|
+
|
758
|
+
# Continue regular lexing after the heredoc reference (<<END).
|
759
|
+
p = literal.heredoc_e - 1
|
760
|
+
fgoto *pop_literal;
|
761
|
+
else
|
762
|
+
# Ditto.
|
763
|
+
@herebody_s = @te
|
764
|
+
end
|
765
|
+
end
|
766
|
+
|
767
|
+
if is_eof
|
768
|
+
error "unterminated string meets end of file"
|
769
|
+
end
|
770
|
+
|
771
|
+
# A literal newline is appended if the heredoc was _not_ closed
|
772
|
+
# this time. See also LexerLiteral#nest_and_try_closing for rationale of
|
773
|
+
# calling #flush_string here.
|
774
|
+
literal.extend_string tok, @ts, @te
|
775
|
+
literal.flush_string
|
776
|
+
}
|
777
|
+
|
778
|
+
#
|
779
|
+
# === INTERPOLATION PARSING ===
|
780
|
+
#
|
781
|
+
|
782
|
+
# Interpolations with immediate variable names simply call into
|
783
|
+
# the corresponding machine.
|
784
|
+
|
785
|
+
interp_var =
|
786
|
+
'#' ( global_var | class_var_v | instance_var_v );
|
787
|
+
|
788
|
+
action extend_interp_var {
|
789
|
+
literal.flush_string
|
790
|
+
emit(:tSTRING_DVAR, nil, @ts, @ts + 1)
|
791
|
+
|
792
|
+
p = @ts
|
793
|
+
fcall expr_variable;
|
794
|
+
}
|
795
|
+
|
796
|
+
# Interpolations with code blocks must match nested curly braces, as
|
797
|
+
# interpolation ending is ambiguous with a block ending. So, every
|
798
|
+
# opening and closing brace should be matched with e_[lr]brace rules,
|
799
|
+
# which automatically perform the counting.
|
800
|
+
#
|
801
|
+
# Note that interpolations can themselves be nested, so brace balance
|
802
|
+
# is tied to the innermost literal.
|
803
|
+
#
|
804
|
+
# Also note that literals themselves should not use e_[lr]brace rules
|
805
|
+
# when matching their opening and closing delimiters, as the amount of
|
806
|
+
# braces inside the characters of a string literal is independent.
|
807
|
+
|
808
|
+
interp_code = '#{';
|
809
|
+
|
810
|
+
e_lbrace = '{' % {
|
811
|
+
if literal
|
812
|
+
literal.start_interp_brace
|
813
|
+
end
|
814
|
+
};
|
815
|
+
|
816
|
+
e_rbrace = '}' % {
|
817
|
+
if literal
|
818
|
+
if literal.end_interp_brace_and_try_closing
|
819
|
+
emit(:tRCURLY, '}')
|
820
|
+
|
821
|
+
if literal.words?
|
822
|
+
emit(:tSPACE, nil)
|
823
|
+
end
|
824
|
+
|
825
|
+
if literal.saved_herebody_s
|
826
|
+
@herebody_s = literal.saved_herebody_s
|
827
|
+
end
|
828
|
+
|
829
|
+
fhold;
|
830
|
+
fnext *@stack.pop;
|
831
|
+
fbreak;
|
832
|
+
end
|
833
|
+
end
|
834
|
+
};
|
835
|
+
|
836
|
+
action extend_interp_code {
|
837
|
+
literal.flush_string
|
838
|
+
emit(:tSTRING_DBEG, '#{')
|
839
|
+
|
840
|
+
literal.saved_herebody_s = @herebody_s
|
841
|
+
@herebody_s = nil
|
842
|
+
|
843
|
+
literal.start_interp_brace
|
844
|
+
fcall expr_beg;
|
845
|
+
}
|
846
|
+
|
847
|
+
# Actual string parsers are simply combined from the primitives defined
|
848
|
+
# above.
|
849
|
+
|
850
|
+
interp_words := |*
|
851
|
+
interp_code => extend_interp_code;
|
852
|
+
interp_var => extend_interp_var;
|
853
|
+
e_bs escape => extend_string_escaped;
|
854
|
+
c_space_nl => { literal.flush_string };
|
855
|
+
c_eol => extend_string_eol;
|
856
|
+
c_any => extend_string;
|
857
|
+
*|;
|
858
|
+
|
859
|
+
interp_string := |*
|
860
|
+
interp_code => extend_interp_code;
|
861
|
+
interp_var => extend_interp_var;
|
862
|
+
e_bs escape => extend_string_escaped;
|
863
|
+
c_eol => extend_string_eol;
|
864
|
+
c_any => extend_string;
|
865
|
+
*|;
|
866
|
+
|
867
|
+
plain_words := |*
|
868
|
+
e_bs c_any => extend_string_escaped;
|
869
|
+
c_space_nl => { literal.flush_string };
|
870
|
+
c_eol => extend_string_eol;
|
871
|
+
c_any => extend_string;
|
872
|
+
*|;
|
873
|
+
|
874
|
+
plain_string := |*
|
875
|
+
e_bs c_any => extend_string_escaped;
|
876
|
+
c_eol => extend_string_eol;
|
877
|
+
c_any => extend_string;
|
878
|
+
*|;
|
879
|
+
|
880
|
+
regexp_modifiers := |*
|
881
|
+
[A-Za-z]+
|
882
|
+
=> {
|
883
|
+
unknown_options = tok.scan(/[^imxouesn]/)
|
884
|
+
if unknown_options.any?
|
885
|
+
error "unknown regexp options: #{unknown_options.join}"
|
886
|
+
end
|
887
|
+
|
888
|
+
emit(:tREGEXP_OPT)
|
889
|
+
fgoto expr_end;
|
890
|
+
};
|
891
|
+
|
892
|
+
any
|
893
|
+
=> {
|
894
|
+
emit(:tREGEXP_OPT, tok(@ts, @te - 1), @ts, @te - 1)
|
895
|
+
fhold; fgoto expr_end;
|
896
|
+
};
|
897
|
+
*|;
|
898
|
+
|
899
|
+
#
|
900
|
+
# === EXPRESSION PARSING ===
|
901
|
+
#
|
902
|
+
|
903
|
+
# These rules implement a form of manually defined lookahead.
|
904
|
+
# The default longest-match scanning does not work here due
|
905
|
+
# to sheer ambiguity.
|
906
|
+
|
907
|
+
ambiguous_ident_suffix = # actual parsed
|
908
|
+
[?!=] %{ tm = p } | # a? a?
|
909
|
+
'==' %{ tm = p - 2 } | # a==b a == b
|
910
|
+
'=~' %{ tm = p - 2 } | # a=~b a =~ b
|
911
|
+
'=>' %{ tm = p - 2 } | # a=>b a => b
|
912
|
+
'===' %{ tm = p - 3 } # a===b a === b
|
913
|
+
;
|
914
|
+
|
915
|
+
ambiguous_symbol_suffix = # actual parsed
|
916
|
+
ambiguous_ident_suffix |
|
917
|
+
'==>' %{ tm = p - 2 } # :a==>b :a= => b
|
918
|
+
;
|
919
|
+
|
920
|
+
# Ambiguous with 1.9 hash labels.
|
921
|
+
ambiguous_const_suffix = # actual parsed
|
922
|
+
'::' %{ tm = p - 2 } # A::B A :: B
|
923
|
+
;
|
924
|
+
|
925
|
+
# Ruby 1.9 lambdas require parentheses counting in order to
|
926
|
+
# emit correct opening kDO/tLBRACE.
|
927
|
+
|
928
|
+
e_lparen = '(' % {
|
929
|
+
@paren_nest += 1
|
930
|
+
};
|
931
|
+
|
932
|
+
e_rparen = ')' % {
|
933
|
+
@paren_nest -= 1
|
934
|
+
};
|
935
|
+
|
936
|
+
# Variable lexing code is accessed from both expressions and
|
937
|
+
# string interpolation related code.
|
938
|
+
#
|
939
|
+
expr_variable := |*
|
940
|
+
global_var
|
941
|
+
=> {
|
942
|
+
if tok =~ /^\$([1-9][0-9]*)$/
|
943
|
+
emit(:tNTH_REF, $1.to_i)
|
944
|
+
elsif tok =~ /^\$([&`'+])$/
|
945
|
+
emit(:tBACK_REF, $1.to_sym)
|
946
|
+
else
|
947
|
+
emit(:tGVAR)
|
948
|
+
end
|
949
|
+
|
950
|
+
fnext *@stack.pop; fbreak;
|
951
|
+
};
|
952
|
+
|
953
|
+
class_var_v
|
954
|
+
=> {
|
955
|
+
error "`#{tok}' is not allowed as a class variable name" if tok =~ /^@@[0-9]/
|
956
|
+
|
957
|
+
emit(:tCVAR)
|
958
|
+
fnext *@stack.pop; fbreak;
|
959
|
+
};
|
960
|
+
|
961
|
+
instance_var_v
|
962
|
+
=> {
|
963
|
+
error "`#{tok}' is not allowed as an instance variable name" if tok =~ /^@[0-9]/
|
964
|
+
|
965
|
+
emit(:tIVAR)
|
966
|
+
fnext *@stack.pop; fbreak;
|
967
|
+
};
|
968
|
+
*|;
|
969
|
+
|
970
|
+
# Literal function name in definition (e.g. `def class`).
|
971
|
+
# Keywords are returned as their respective tokens; this is used
|
972
|
+
# to support singleton def `def self.foo`. Global variables are
|
973
|
+
# returned as `tGVAR`; this is used in global variable alias
|
974
|
+
# statements `alias $a $b`. Symbols are returned verbatim; this
|
975
|
+
# is used in `alias :a :"b#{foo}"` and `undef :a`.
|
976
|
+
#
|
977
|
+
# Transitions to `expr_end` afterwards.
|
978
|
+
#
|
979
|
+
expr_fname := |*
|
980
|
+
keyword
|
981
|
+
=> { emit(KEYWORDS[tok]);
|
982
|
+
fnext expr_end; fbreak; };
|
983
|
+
|
984
|
+
bareword
|
985
|
+
=> { emit(:tIDENTIFIER)
|
986
|
+
fnext expr_end; fbreak; };
|
987
|
+
|
988
|
+
bareword ambiguous_ident_suffix
|
989
|
+
=> { emit(:tIDENTIFIER, tok(@ts, tm), @ts, tm)
|
990
|
+
fnext expr_end; p = tm - 1; fbreak; };
|
991
|
+
|
992
|
+
operator_fname |
|
993
|
+
operator_arithmetic |
|
994
|
+
operator_rest
|
995
|
+
=> { emit_table(PUNCTUATION)
|
996
|
+
fnext expr_end; fbreak; };
|
997
|
+
|
998
|
+
':'
|
999
|
+
=> { fhold; fgoto expr_end; };
|
1000
|
+
|
1001
|
+
global_var
|
1002
|
+
=> { emit(:tGVAR)
|
1003
|
+
fbreak; };
|
1004
|
+
|
1005
|
+
c_space_nl+;
|
1006
|
+
|
1007
|
+
c_any
|
1008
|
+
=> { fhold; fgoto expr_end; };
|
1009
|
+
|
1010
|
+
c_eof => do_eof;
|
1011
|
+
*|;
|
1012
|
+
|
1013
|
+
# Literal function name in method call (e.g. `a.class`).
|
1014
|
+
#
|
1015
|
+
# Transitions to `expr_arg` afterwards.
|
1016
|
+
#
|
1017
|
+
expr_dot := |*
|
1018
|
+
bareword
|
1019
|
+
=> { emit(:tIDENTIFIER)
|
1020
|
+
fnext expr_arg; fbreak; };
|
1021
|
+
|
1022
|
+
bareword ambiguous_ident_suffix
|
1023
|
+
=> { emit(:tIDENTIFIER, tok(@ts, tm), @ts, tm)
|
1024
|
+
fnext expr_arg; p = tm - 1; fbreak; };
|
1025
|
+
|
1026
|
+
operator_fname |
|
1027
|
+
operator_arithmetic |
|
1028
|
+
operator_rest
|
1029
|
+
=> { emit_table(PUNCTUATION)
|
1030
|
+
fnext expr_arg; fbreak; };
|
1031
|
+
|
1032
|
+
c_space_nl+;
|
1033
|
+
|
1034
|
+
c_any
|
1035
|
+
=> { fhold; fgoto expr_end; };
|
1036
|
+
|
1037
|
+
c_eof => do_eof;
|
1038
|
+
*|;
|
1039
|
+
|
1040
|
+
# The previous token emitted was a `tIDENTIFIER` or `tFID`; no space
|
1041
|
+
# is consumed; the current expression is a command or method call.
|
1042
|
+
#
|
1043
|
+
expr_arg := |*
|
1044
|
+
#
|
1045
|
+
# COMMAND MODE SPECIFIC TOKENS
|
1046
|
+
#
|
1047
|
+
|
1048
|
+
# cmd (1 + 2)
|
1049
|
+
# See below the rationale about expr_endarg.
|
1050
|
+
c_space+ e_lparen
|
1051
|
+
=> { emit(:tLPAREN_ARG, '(', @te - 1, @te)
|
1052
|
+
fnext expr_beg; fbreak; };
|
1053
|
+
|
1054
|
+
# meth(1 + 2)
|
1055
|
+
# Regular method call.
|
1056
|
+
e_lparen
|
1057
|
+
=> { emit(:tLPAREN2)
|
1058
|
+
fnext expr_beg; fbreak; };
|
1059
|
+
|
1060
|
+
# meth [...]
|
1061
|
+
# Array argument. Compare with indexing `meth[...]`.
|
1062
|
+
c_space+ '['
|
1063
|
+
=> { emit(:tLBRACK, '[', @te - 1, @te);
|
1064
|
+
fnext expr_beg; fbreak; };
|
1065
|
+
|
1066
|
+
# cmd {}
|
1067
|
+
# Command: method call without parentheses.
|
1068
|
+
c_space* e_lbrace
|
1069
|
+
=> {
|
1070
|
+
if @lambda_stack.last == @paren_nest
|
1071
|
+
p = @ts - 1
|
1072
|
+
fgoto expr_end;
|
1073
|
+
else
|
1074
|
+
emit(:tLCURLY, '{', @te - 1, @te)
|
1075
|
+
fnext expr_value; fbreak;
|
1076
|
+
end
|
1077
|
+
};
|
1078
|
+
|
1079
|
+
# a.b
|
1080
|
+
# Dot-call.
|
1081
|
+
'.' | '::'
|
1082
|
+
=> { emit_table(PUNCTUATION);
|
1083
|
+
fnext expr_dot; fbreak; };
|
1084
|
+
|
1085
|
+
#
|
1086
|
+
# AMBIGUOUS TOKENS RESOLVED VIA EXPR_BEG
|
1087
|
+
#
|
1088
|
+
|
1089
|
+
# a ?b
|
1090
|
+
# Character literal.
|
1091
|
+
c_space+ '?'
|
1092
|
+
=> { fhold; fgoto expr_beg; };
|
1093
|
+
|
1094
|
+
# x +1
|
1095
|
+
# Ambiguous unary operator or regexp literal.
|
1096
|
+
c_space+ [+\-/]
|
1097
|
+
=> {
|
1098
|
+
warning "ambiguous first argument; put parentheses or even spaces", @te - 1, @te
|
1099
|
+
fhold; fhold; fgoto expr_beg;
|
1100
|
+
};
|
1101
|
+
|
1102
|
+
# x *1
|
1103
|
+
# Ambiguous splat or block-pass.
|
1104
|
+
c_space+ [*&]
|
1105
|
+
=> {
|
1106
|
+
what = tok(@te - 1, @te)
|
1107
|
+
warning "`#{what}' interpreted as argument prefix", @te - 1, @te
|
1108
|
+
fhold; fgoto expr_beg;
|
1109
|
+
};
|
1110
|
+
|
1111
|
+
#
|
1112
|
+
# AMBIGUOUS TOKENS RESOLVED VIA EXPR_END
|
1113
|
+
#
|
1114
|
+
|
1115
|
+
# a ? b
|
1116
|
+
# Ternary operator.
|
1117
|
+
c_space+ '?' c_space_nl
|
1118
|
+
=> { fhold; fhold; fgoto expr_end; };
|
1119
|
+
|
1120
|
+
# x + 1: Binary operator or operator-assignment.
|
1121
|
+
c_space* operator_arithmetic
|
1122
|
+
( '=' | c_space_nl )? |
|
1123
|
+
# x rescue y: Modifier keyword.
|
1124
|
+
c_space+ keyword_modifier |
|
1125
|
+
# Miscellanea.
|
1126
|
+
c_space* punctuation_end
|
1127
|
+
=> {
|
1128
|
+
p = @ts - 1
|
1129
|
+
fgoto expr_end;
|
1130
|
+
};
|
1131
|
+
|
1132
|
+
c_space* c_nl
|
1133
|
+
=> { fhold; fgoto expr_end; };
|
1134
|
+
|
1135
|
+
c_any
|
1136
|
+
=> { fhold; fgoto expr_beg; };
|
1137
|
+
|
1138
|
+
c_eof => do_eof;
|
1139
|
+
*|;
|
1140
|
+
|
1141
|
+
# The rationale for this state is pretty complex. Normally, if an argument
|
1142
|
+
# is passed to a command and then there is a block (tLCURLY...tRCURLY),
|
1143
|
+
# the block is attached to the innermost argument (`f` in `m f {}`), or it
|
1144
|
+
# is a parse error (`m 1 {}`). But there is a special case for passing a single
|
1145
|
+
# primary expression grouped with parentheses: if you write `m (1) {}` or
|
1146
|
+
# (2.0 only) `m () {}`, then the block is attached to `m`.
|
1147
|
+
#
|
1148
|
+
# Thus, we recognize the opening `(` of a command (remember, a command is
|
1149
|
+
# a method call without parens) as a tLPAREN_ARG; then, in parser, we recognize
|
1150
|
+
# `tLPAREN_ARG expr rparen` as a `primary_expr` and before rparen, set the
|
1151
|
+
# lexer's state to `expr_endarg`, which makes it emit the possibly following
|
1152
|
+
# `{` as `tLBRACE_ARG`.
|
1153
|
+
#
|
1154
|
+
# The default post-`expr_endarg` state is `expr_end`, so this state also handles
|
1155
|
+
# `do` (as `kDO_BLOCK` in `expr_beg`). (I have no clue why the parser cannot
|
1156
|
+
# just handle `kDO`.)
|
1157
|
+
expr_endarg := |*
|
1158
|
+
e_lbrace
|
1159
|
+
=> { emit(:tLBRACE_ARG)
|
1160
|
+
fnext expr_value; };
|
1161
|
+
|
1162
|
+
'do'
|
1163
|
+
=> { emit(:kDO_BLOCK)
|
1164
|
+
fnext expr_value; };
|
1165
|
+
|
1166
|
+
c_space*;
|
1167
|
+
|
1168
|
+
c_any
|
1169
|
+
=> { fhold; fgoto expr_end; };
|
1170
|
+
|
1171
|
+
c_eof => do_eof;
|
1172
|
+
*|;
|
1173
|
+
|
1174
|
+
# The rationale for this state is that several keywords accept value
|
1175
|
+
# (i.e. should transition to `expr_beg`), do not accept it like a command
|
1176
|
+
# (i.e. not an `expr_arg`), and must behave like a statement, that is,
|
1177
|
+
# accept a modifier if/while/etc.
|
1178
|
+
#
|
1179
|
+
expr_mid := |*
|
1180
|
+
keyword_modifier
|
1181
|
+
=> { emit_table(KEYWORDS)
|
1182
|
+
fnext expr_beg; fbreak; };
|
1183
|
+
|
1184
|
+
c_space+;
|
1185
|
+
|
1186
|
+
c_nl
|
1187
|
+
=> { fhold; fgoto expr_end; };
|
1188
|
+
|
1189
|
+
c_any
|
1190
|
+
=> { fhold; fgoto expr_beg; };
|
1191
|
+
|
1192
|
+
c_eof => do_eof;
|
1193
|
+
*|;
|
1194
|
+
|
1195
|
+
# Beginning of an expression.
|
1196
|
+
#
|
1197
|
+
# Don't fallthrough to this state from `c_any`; make sure to handle
|
1198
|
+
# `c_space* c_nl` and let `expr_end` handle the newline.
|
1199
|
+
# Otherwise code like `f\ndef x` gets glued together and the parser
|
1200
|
+
# explodes.
|
1201
|
+
#
|
1202
|
+
expr_beg := |*
|
1203
|
+
# Numeric processing. Converts:
|
1204
|
+
# +5 to [tINTEGER, 5]
|
1205
|
+
# -5 to [tUMINUS_NUM] [tINTEGER, 5]
|
1206
|
+
[+\-][0-9]
|
1207
|
+
=> {
|
1208
|
+
fhold;
|
1209
|
+
if tok.start_with? '-'
|
1210
|
+
emit(:tUMINUS_NUM, '-')
|
1211
|
+
fnext expr_end; fbreak;
|
1212
|
+
end
|
1213
|
+
};
|
1214
|
+
|
1215
|
+
# splat *a
|
1216
|
+
'*'
|
1217
|
+
=> { emit(:tSTAR)
|
1218
|
+
fbreak; };
|
1219
|
+
|
1220
|
+
#
|
1221
|
+
# STRING AND REGEXP LITERALS
|
1222
|
+
#
|
1223
|
+
|
1224
|
+
# a / 42
|
1225
|
+
# a % 42
|
1226
|
+
# a %= 42 (disambiguation with %=string=)
|
1227
|
+
[/%] c_space_nl | '%=' # /
|
1228
|
+
=> {
|
1229
|
+
fhold; fhold;
|
1230
|
+
fgoto expr_end;
|
1231
|
+
};
|
1232
|
+
|
1233
|
+
# /regexp/oui
|
1234
|
+
'/'
|
1235
|
+
=> {
|
1236
|
+
type, delimiter = tok, tok
|
1237
|
+
fgoto *push_literal(type, delimiter, @ts);
|
1238
|
+
};
|
1239
|
+
|
1240
|
+
# %<string>
|
1241
|
+
'%' ( c_any - [A-Za-z] )
|
1242
|
+
=> {
|
1243
|
+
type, delimiter = tok[0], tok[-1]
|
1244
|
+
fgoto *push_literal(type, delimiter, @ts);
|
1245
|
+
};
|
1246
|
+
|
1247
|
+
# %w(we are the people)
|
1248
|
+
'%' [A-Za-z]+ c_any
|
1249
|
+
=> {
|
1250
|
+
type, delimiter = tok[0..-2], tok[-1]
|
1251
|
+
fgoto *push_literal(type, delimiter, @ts);
|
1252
|
+
};
|
1253
|
+
|
1254
|
+
'%' c_eof
|
1255
|
+
=> {
|
1256
|
+
error "unterminated string meets end of file"
|
1257
|
+
};
|
1258
|
+
|
1259
|
+
# Heredoc start.
|
1260
|
+
# <<EOF | <<-END | <<"FOOBAR" | <<-`SMTH`
|
1261
|
+
'<<' '-'?
|
1262
|
+
( '"' ( c_any - c_nl - '"' )* '"'
|
1263
|
+
| "'" ( c_any - c_nl - "'" )* "'"
|
1264
|
+
| "`" ( c_any - c_nl - "`" )* "`"
|
1265
|
+
| bareword ) % { @heredoc_e = p }
|
1266
|
+
( c_any - c_nl )* c_nl % { new_herebody_s = p }
|
1267
|
+
=> {
|
1268
|
+
tok(@ts, @heredoc_e) =~ /^<<(-?)(["'`]?)(.*)\2$/
|
1269
|
+
|
1270
|
+
indent = !$1.empty?
|
1271
|
+
type = $2.empty? ? '"' : $2
|
1272
|
+
delimiter = $3
|
1273
|
+
|
1274
|
+
fnext *push_literal(type, delimiter, @ts, @heredoc_e, indent);
|
1275
|
+
|
1276
|
+
if @herebody_s.nil?
|
1277
|
+
@herebody_s = new_herebody_s
|
1278
|
+
end
|
1279
|
+
|
1280
|
+
p = @herebody_s - 1
|
1281
|
+
};
|
1282
|
+
|
1283
|
+
#
|
1284
|
+
# AMBIGUOUS TERNARY OPERATOR
|
1285
|
+
#
|
1286
|
+
|
1287
|
+
'?' ( e_bs escape
|
1288
|
+
| c_any - c_space_nl - e_bs % { @escape = nil }
|
1289
|
+
)
|
1290
|
+
=> {
|
1291
|
+
# Show an error if memorized.
|
1292
|
+
@escape.call if @escape.respond_to? :call
|
1293
|
+
|
1294
|
+
value = @escape || tok(@ts + 1)
|
1295
|
+
|
1296
|
+
if ruby18?
|
1297
|
+
emit(:tINTEGER, value.ord)
|
1298
|
+
else
|
1299
|
+
emit(:tSTRING, value)
|
1300
|
+
end
|
1301
|
+
|
1302
|
+
fbreak;
|
1303
|
+
};
|
1304
|
+
|
1305
|
+
'?' c_space_nl
|
1306
|
+
=> {
|
1307
|
+
escape = { " " => '\s', "\r" => '\r', "\n" => '\n', "\t" => '\t',
|
1307
|
+
"\v" => '\v', "\f" => '\f' }[tok[1]] # index into the token text, not the source buffer
|
1309
|
+
warning "invalid character syntax; use ?#{escape}", @ts
|
1310
|
+
|
1311
|
+
p = @ts - 1
|
1312
|
+
fgoto expr_end;
|
1313
|
+
};
|
1314
|
+
|
1315
|
+
'?' c_eof
|
1316
|
+
=> {
|
1317
|
+
error "incomplete character syntax"
|
1318
|
+
};
|
1319
|
+
|
1320
|
+
# f ?aa : b: Disambiguate with a character literal.
|
1321
|
+
'?' [A-Za-z_] bareword
|
1322
|
+
=> {
|
1323
|
+
p = @ts - 1
|
1324
|
+
fgoto expr_end;
|
1325
|
+
};
|
1326
|
+
|
1327
|
+
#
|
1328
|
+
# KEYWORDS AND PUNCTUATION
|
1329
|
+
#
|
1330
|
+
|
1331
|
+
# a(+b)
|
1332
|
+
punctuation_begin |
|
1333
|
+
# a({b=>c})
|
1334
|
+
e_lbrace |
|
1335
|
+
# a()
|
1336
|
+
e_lparen
|
1337
|
+
=> { emit_table(PUNCTUATION_BEGIN)
|
1338
|
+
fbreak; };
|
1339
|
+
|
1340
|
+
# rescue Exception => e: Block rescue.
|
1341
|
+
# Special because it should transition to expr_mid.
|
1342
|
+
'rescue'
|
1343
|
+
=> { emit_table(KEYWORDS_BEGIN)
|
1344
|
+
fnext expr_mid; fbreak; };
|
1345
|
+
|
1346
|
+
# if a: Statement if.
|
1347
|
+
keyword_modifier
|
1348
|
+
=> { emit_table(KEYWORDS_BEGIN)
|
1349
|
+
fnext expr_value; fbreak; };
|
1350
|
+
|
1351
|
+
#
|
1352
|
+
# RUBY 1.9 HASH LABELS
|
1353
|
+
#
|
1354
|
+
|
1355
|
+
bareword ':' ( c_any - ':' )
|
1356
|
+
=> {
|
1357
|
+
fhold;
|
1358
|
+
|
1359
|
+
if ruby18?
|
1360
|
+
emit(:tIDENTIFIER, tok(@ts, @te - 2), @ts, @te - 2)
|
1361
|
+
fhold; # continue as a symbol
|
1362
|
+
else
|
1363
|
+
emit(:tLABEL, tok(@ts, @te - 2), @ts, @te - 1)
|
1364
|
+
end
|
1365
|
+
|
1366
|
+
fbreak;
|
1367
|
+
};
|
1368
|
+
|
1369
|
+
#
|
1370
|
+
# CONTEXT-DEPENDENT VARIABLE LOOKUP OR COMMAND INVOCATION
|
1371
|
+
#
|
1372
|
+
|
1373
|
+
# foo= bar: Disambiguate with bareword rule below.
|
1374
|
+
bareword ambiguous_ident_suffix |
|
1375
|
+
# def foo: Disambiguate with bareword rule below.
|
1376
|
+
keyword
|
1377
|
+
=> { p = @ts - 1
|
1378
|
+
fgoto expr_end; };
|
1379
|
+
|
1380
|
+
# a = 42; a [42]: Indexing.
|
1381
|
+
# def a; end; a [42]: Array argument.
|
1382
|
+
call_or_var
|
1383
|
+
=> {
|
1384
|
+
emit(:tIDENTIFIER)
|
1385
|
+
|
1386
|
+
if @static_env && @static_env.declared?(tok.to_sym)
|
1387
|
+
fgoto expr_end;
|
1388
|
+
else
|
1389
|
+
fgoto expr_arg;
|
1390
|
+
end
|
1391
|
+
};
|
1392
|
+
|
1393
|
+
c_space_nl+;
|
1394
|
+
|
1395
|
+
# The following rules match most binary and all unary operators.
|
1396
|
+
# Rules for binary operators provide better error reporting.
|
1397
|
+
operator_arithmetic '=' |
|
1398
|
+
operator_rest |
|
1399
|
+
punctuation_end |
|
1400
|
+
c_any
|
1401
|
+
=> { p = @ts - 1; fgoto expr_end; };
|
1402
|
+
|
1403
|
+
c_eof => do_eof;
|
1404
|
+
*|;
|
1405
|
+
|
1406
|
+
# Like expr_beg, but no 1.9 label possible.
|
1407
|
+
#
|
1408
|
+
expr_value := |*
|
1409
|
+
# a:b: a(:b), a::B, A::B
|
1410
|
+
bareword ':'
|
1411
|
+
=> { p = @ts - 1
|
1412
|
+
fgoto expr_end; };
|
1413
|
+
|
1414
|
+
c_space_nl+;
|
1415
|
+
|
1416
|
+
c_any
|
1417
|
+
=> { fhold; fgoto expr_beg; };
|
1418
|
+
|
1419
|
+
c_eof => do_eof;
|
1420
|
+
*|;
|
1421
|
+
|
1422
|
+
expr_end := |*
|
1423
|
+
#
|
1424
|
+
# STABBY LAMBDA
|
1425
|
+
#
|
1426
|
+
|
1427
|
+
'->'
|
1428
|
+
=> {
|
1429
|
+
emit_table(PUNCTUATION)
|
1430
|
+
|
1431
|
+
@lambda_stack.push @paren_nest
|
1432
|
+
fbreak;
|
1433
|
+
};
|
1434
|
+
|
1435
|
+
e_lbrace | 'do'
|
1436
|
+
=> {
|
1437
|
+
if @lambda_stack.last == @paren_nest
|
1438
|
+
@lambda_stack.pop
|
1439
|
+
|
1440
|
+
if tok == '{'
|
1441
|
+
emit(:tLAMBEG)
|
1442
|
+
else
|
1443
|
+
emit(:kDO_LAMBDA)
|
1444
|
+
end
|
1445
|
+
else
|
1446
|
+
emit_table(PUNCTUATION)
|
1447
|
+
end
|
1448
|
+
|
1449
|
+
fnext expr_value; fbreak;
|
1450
|
+
};
|
1451
|
+
|
1452
|
+
#
|
1453
|
+
# KEYWORDS
|
1454
|
+
#
|
1455
|
+
|
1456
|
+
keyword_with_fname
|
1457
|
+
=> { emit_table(KEYWORDS)
|
1458
|
+
fnext expr_fname; fbreak; };
|
1459
|
+
|
1460
|
+
'class' c_space_nl '<<'
|
1461
|
+
=> { emit(:kCLASS, 'class', @ts, @ts + 5)
|
1462
|
+
emit(:tLSHFT, '<<', @te - 2, @te)
|
1463
|
+
fnext expr_beg; fbreak; };
|
1464
|
+
|
1465
|
+
# a if b:c: Syntax error.
|
1466
|
+
keyword_modifier
|
1467
|
+
=> { emit_table(KEYWORDS)
|
1468
|
+
fnext expr_beg; fbreak; };
|
1469
|
+
|
1470
|
+
# elsif b:c: elsif b(:c)
|
1471
|
+
keyword_with_value
|
1472
|
+
=> { emit_table(KEYWORDS)
|
1473
|
+
fnext expr_value; fbreak; };
|
1474
|
+
|
1475
|
+
keyword_with_mid
|
1476
|
+
=> { emit_table(KEYWORDS)
|
1477
|
+
fnext expr_mid; fbreak; };
|
1478
|
+
|
1479
|
+
keyword_with_arg
|
1480
|
+
=> {
|
1481
|
+
emit_table(KEYWORDS)
|
1482
|
+
|
1483
|
+
if ruby18? && tok == 'not'
|
1484
|
+
fnext expr_beg; fbreak;
|
1485
|
+
else
|
1486
|
+
fnext expr_arg; fbreak;
|
1487
|
+
end
|
1488
|
+
};
|
1489
|
+
|
1490
|
+
keyword_with_end
|
1491
|
+
=> { emit_table(KEYWORDS)
|
1492
|
+
fbreak; };
|
1493
|
+
|
1494
|
+
#
|
1495
|
+
# NUMERIC LITERALS
|
1496
|
+
#
|
1497
|
+
|
1498
|
+
( '0' [Xx] %{ @num_base = 16; @num_digits_s = p }
|
1499
|
+
( xdigit+ '_' )* xdigit* '_'?
|
1500
|
+
| '0' [Dd] %{ @num_base = 10; @num_digits_s = p }
|
1501
|
+
( digit+ '_' )* digit* '_'?
|
1502
|
+
| '0' [Oo] %{ @num_base = 8; @num_digits_s = p }
|
1503
|
+
( digit+ '_' )* digit* '_'?
|
1504
|
+
| '0' [Bb] %{ @num_base = 2; @num_digits_s = p }
|
1505
|
+
( [01]+ '_' )* [01]* '_'?
|
1506
|
+
| [1-9] %{ @num_base = 10; @num_digits_s = @ts }
|
1506
|
+
digit* ( '_' digit+ )* '_'? # 1, 123, 1_000, 123_456; a trailing _ is rejected in the action
|
1507
|
+
| '0' %{ @num_base = 8; @num_digits_s = @ts }
|
1508
|
+
digit* ( '_' digit+ )* '_'? # 0, 0777, 0_7, 017_7; digits 8 and 9 are rejected in the action
|
1510
|
+
)
|
1511
|
+
=> {
|
1512
|
+
digits = tok(@num_digits_s)
|
1513
|
+
|
1514
|
+
if digits.end_with? '_'
|
1515
|
+
error "trailing `_' in number"
|
1516
|
+
elsif digits.empty? && @num_base == 8 && ruby18?
|
1517
|
+
# 1.8 did not raise an error on 0o.
|
1518
|
+
digits = "0"
|
1519
|
+
elsif digits.empty?
|
1520
|
+
error "numeric literal without digits"
|
1521
|
+
elsif @num_base == 8 && digits =~ /[89]/
|
1522
|
+
error "invalid octal digit"
|
1523
|
+
end
|
1524
|
+
|
1525
|
+
emit(:tINTEGER, digits.to_i(@num_base))
|
1526
|
+
fbreak;
|
1527
|
+
};
|
1528
|
+
|
1529
|
+
# Floating point literals cannot start with 0 except when a dot
|
1530
|
+
# follows immediately, probably to avoid confusion with octal literals.
|
1531
|
+
( [1-9] [0-9]* ( '_' digit+ )* |
|
1532
|
+
'0'
|
1533
|
+
)?
|
1534
|
+
(
|
1535
|
+
'.' ( digit+ '_' )* digit+ |
|
1536
|
+
( '.' ( digit+ '_' )* digit+ )? [eE] [+\-]? ( digit+ '_' )* digit+
|
1537
|
+
)
|
1538
|
+
=> {
|
1539
|
+
if tok.start_with? '.'
|
1540
|
+
error "no .<digit> floating literal anymore; put 0 before dot"
|
1541
|
+
elsif tok =~ /^[eE]/
|
1542
|
+
# The rule above allows floats to be specified as just `e10', which is
|
1543
|
+
# certainly not a float. Send a patch if you can do this better.
|
1544
|
+
emit(:tIDENTIFIER, tok)
|
1545
|
+
fbreak;
|
1546
|
+
end
|
1547
|
+
|
1548
|
+
emit(:tFLOAT, tok.to_f)
|
1549
|
+
fbreak;
|
1550
|
+
};
|
1551
|
+
|
1552
|
+
#
|
1553
|
+
# SYMBOL LITERALS
|
1554
|
+
#
|
1555
|
+
|
1556
|
+
# `echo foo` | :"bar" | :'baz'
|
1557
|
+
'`' | ':'? ['"] # '
|
1558
|
+
=> {
|
1559
|
+
type, delimiter = tok, tok[-1]
|
1560
|
+
fgoto *push_literal(type, delimiter, @ts);
|
1561
|
+
};
|
1562
|
+
|
1563
|
+
':' bareword ambiguous_symbol_suffix
|
1564
|
+
=> { emit(:tSYMBOL, tok(@ts + 1, tm))
|
1565
|
+
p = tm - 1; fbreak; };
|
1566
|
+
|
1567
|
+
':' ( bareword | global_var | class_var | instance_var |
|
1568
|
+
operator_fname | operator_arithmetic | operator_rest )
|
1569
|
+
=> { emit(:tSYMBOL, tok(@ts + 1))
|
1570
|
+
fbreak; };
|
1571
|
+
|
1572
|
+
#
|
1573
|
+
# CONSTANTS AND VARIABLES
|
1574
|
+
#
|
1575
|
+
|
1576
|
+
constant
|
1577
|
+
=> { emit(:tCONSTANT)
|
1578
|
+
fbreak; };
|
1579
|
+
|
1580
|
+
constant ambiguous_const_suffix
|
1581
|
+
=> { emit(:tCONSTANT, tok(@ts, tm))
|
1582
|
+
p = tm - 1; fbreak; };
|
1583
|
+
|
1584
|
+
global_var | class_var_v | instance_var_v
|
1585
|
+
=> { p = @ts - 1; fcall expr_variable; };
|
1586
|
+
|
1587
|
+
#
|
1588
|
+
# METHOD CALLS
|
1589
|
+
#
|
1590
|
+
|
1591
|
+
'.'
|
1592
|
+
=> { emit_table(PUNCTUATION)
|
1593
|
+
fnext expr_dot; fbreak; };
|
1594
|
+
|
1595
|
+
call_or_var
|
1596
|
+
=> { emit(:tIDENTIFIER)
|
1597
|
+
fnext expr_arg; fbreak; };
|
1598
|
+
|
1599
|
+
call_or_var [?!]
|
1600
|
+
=> { emit(:tFID)
|
1601
|
+
fnext expr_arg; fbreak; };
|
1602
|
+
|
1603
|
+
#
|
1604
|
+
# OPERATORS
|
1605
|
+
#
|
1606
|
+
|
1607
|
+
( e_lparen |
|
1608
|
+
operator_arithmetic |
|
1609
|
+
operator_rest
|
1610
|
+
) %{ tm = p } c_space_nl*
|
1611
|
+
=> { emit_table(PUNCTUATION, @ts, tm)
|
1612
|
+
fnext expr_beg; fbreak; };
|
1613
|
+
|
1614
|
+
e_rbrace | e_rparen | ']'
|
1615
|
+
=> { emit_table(PUNCTUATION)
|
1616
|
+
fbreak; };
|
1617
|
+
|
1618
|
+
operator_arithmetic '='
|
1619
|
+
=> { emit(:tOP_ASGN, tok(@ts, @te - 1))
|
1620
|
+
fnext expr_beg; fbreak; };
|
1621
|
+
|
1622
|
+
'?'
|
1623
|
+
=> { emit_table(PUNCTUATION)
|
1624
|
+
fnext expr_value; fbreak; };
|
1625
|
+
|
1626
|
+
punctuation_end
|
1627
|
+
=> { emit_table(PUNCTUATION)
|
1628
|
+
fnext expr_beg; fbreak; };
|
1629
|
+
|
1630
|
+
#
|
1631
|
+
# WHITESPACE
|
1632
|
+
#
|
1633
|
+
|
1634
|
+
'\\' e_heredoc_nl;
|
1635
|
+
'\\' ( any - c_nl ) {
|
1636
|
+
error "bare backslash only allowed before newline"
|
1637
|
+
};
|
1638
|
+
|
1639
|
+
'#' ( c_any - c_nl )*
|
1640
|
+
=> { @comments << tok(@ts, @te + 1) };
|
1641
|
+
|
1642
|
+
e_heredoc_nl
|
1643
|
+
=> { fgoto leading_dot; };
|
1644
|
+
|
1645
|
+
';'
|
1646
|
+
=> { emit_table(PUNCTUATION)
|
1647
|
+
fnext expr_value; fbreak; };
|
1648
|
+
|
1649
|
+
c_space+;
|
1650
|
+
|
1651
|
+
c_any
|
1652
|
+
=> {
|
1653
|
+
error "unexpected #{tok.inspect}"
|
1654
|
+
};
|
1655
|
+
|
1656
|
+
c_eof => do_eof;
|
1657
|
+
*|;
|
1658
|
+
|
1659
|
+
leading_dot := |*
|
1660
|
+
# Insane leading dots:
|
1661
|
+
# a #comment
|
1662
|
+
# .b: a.b
|
1663
|
+
c_space* '.' ( c_any - '.' )
|
1664
|
+
=> { fhold; fhold;
|
1665
|
+
fgoto expr_end; };
|
1666
|
+
|
1667
|
+
any
|
1668
|
+
=> { emit(:tNL, nil, @newline_s, @newline_s + 1)
|
1669
|
+
fnext line_begin; fhold; fbreak; };
|
1670
|
+
*|;
|
1671
|
+
|
1672
|
+
#
|
1673
|
+
# === EMBEDDED DOCUMENT (aka BLOCK COMMENT) PARSING ===
|
1674
|
+
#
|
1675
|
+
|
1676
|
+
line_comment := |*
|
1677
|
+
'=end' c_line* c_nl
|
1678
|
+
=> { @comments << tok
|
1679
|
+
fgoto line_begin; };
|
1680
|
+
|
1681
|
+
c_line* c_nl
|
1682
|
+
=> { @comments << tok };
|
1683
|
+
|
1684
|
+
any
|
1685
|
+
=> {
|
1686
|
+
@comments = ""
|
1687
|
+
error "embedded document meats end of file (and they embark on a romantic journey)"
|
1688
|
+
};
|
1689
|
+
*|;
|
1690
|
+
|
1691
|
+
line_begin := |*
|
1692
|
+
c_space_nl+;
|
1693
|
+
|
1694
|
+
'#' c_line* c_eol
|
1695
|
+
=> { @comments << tok
|
1696
|
+
fhold; };
|
1697
|
+
|
1698
|
+
'=begin' ( c_space | c_eol )
|
1699
|
+
=> { @comments << tok(@ts, @te)
|
1700
|
+
fgoto line_comment; };
|
1701
|
+
|
1702
|
+
'__END__' c_eol
|
1703
|
+
=> { p = pe - 1 };
|
1704
|
+
|
1705
|
+
c_any
|
1706
|
+
=> { fhold; fgoto expr_value; };
|
1707
|
+
|
1708
|
+
c_eof => do_eof;
|
1709
|
+
*|;
|
1710
|
+
|
1711
|
+
}%%
|
1712
|
+
# %
|
1713
|
+
end
|