parser 2.5.1.0 → 3.0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (103) hide show
  1. checksums.yaml +4 -4
  2. data/lib/parser.rb +4 -0
  3. data/lib/parser/all.rb +3 -0
  4. data/lib/parser/ast/processor.rb +49 -1
  5. data/lib/parser/base.rb +30 -6
  6. data/lib/parser/builders/default.rb +586 -29
  7. data/lib/parser/context.rb +17 -0
  8. data/lib/parser/current.rb +34 -7
  9. data/lib/parser/current_arg_stack.rb +46 -0
  10. data/lib/parser/diagnostic.rb +1 -1
  11. data/lib/parser/diagnostic/engine.rb +1 -2
  12. data/lib/parser/lexer.rb +23780 -0
  13. data/lib/parser/lexer/dedenter.rb +52 -49
  14. data/lib/parser/lexer/literal.rb +4 -0
  15. data/lib/parser/lexer/stack_state.rb +4 -0
  16. data/lib/parser/macruby.rb +6149 -0
  17. data/lib/parser/max_numparam_stack.rb +56 -0
  18. data/lib/parser/messages.rb +74 -44
  19. data/lib/parser/meta.rb +13 -3
  20. data/lib/parser/ruby18.rb +5667 -0
  21. data/lib/parser/ruby19.rb +6092 -0
  22. data/lib/parser/ruby20.rb +6527 -0
  23. data/lib/parser/ruby21.rb +6578 -0
  24. data/lib/parser/ruby22.rb +6613 -0
  25. data/lib/parser/ruby23.rb +6624 -0
  26. data/lib/parser/ruby24.rb +6694 -0
  27. data/lib/parser/ruby25.rb +6662 -0
  28. data/lib/parser/ruby26.rb +6676 -0
  29. data/lib/parser/ruby27.rb +7862 -0
  30. data/lib/parser/ruby28.rb +8047 -0
  31. data/lib/parser/ruby30.rb +8060 -0
  32. data/lib/parser/ruby31.rb +8075 -0
  33. data/lib/parser/rubymotion.rb +6086 -0
  34. data/lib/parser/runner.rb +36 -2
  35. data/lib/parser/runner/ruby_parse.rb +2 -2
  36. data/lib/parser/runner/ruby_rewrite.rb +2 -2
  37. data/lib/parser/source/buffer.rb +54 -29
  38. data/lib/parser/source/comment.rb +18 -5
  39. data/lib/parser/source/comment/associator.rb +34 -11
  40. data/lib/parser/source/map.rb +1 -1
  41. data/lib/parser/source/map/method_definition.rb +25 -0
  42. data/lib/parser/source/range.rb +20 -4
  43. data/lib/parser/source/tree_rewriter.rb +146 -16
  44. data/lib/parser/source/tree_rewriter/action.rb +137 -28
  45. data/lib/parser/static_environment.rb +14 -0
  46. data/lib/parser/tree_rewriter.rb +3 -3
  47. data/lib/parser/variables_stack.rb +36 -0
  48. data/lib/parser/version.rb +1 -1
  49. data/parser.gemspec +13 -21
  50. metadata +34 -98
  51. data/.gitignore +0 -32
  52. data/.travis.yml +0 -21
  53. data/.yardopts +0 -21
  54. data/CHANGELOG.md +0 -909
  55. data/CONTRIBUTING.md +0 -17
  56. data/Gemfile +0 -10
  57. data/README.md +0 -301
  58. data/Rakefile +0 -165
  59. data/doc/AST_FORMAT.md +0 -1718
  60. data/doc/CUSTOMIZATION.md +0 -37
  61. data/doc/INTERNALS.md +0 -21
  62. data/doc/css/.gitkeep +0 -0
  63. data/doc/css/common.css +0 -68
  64. data/lib/parser/lexer.rl +0 -2376
  65. data/lib/parser/macruby.y +0 -2198
  66. data/lib/parser/ruby18.y +0 -1934
  67. data/lib/parser/ruby19.y +0 -2175
  68. data/lib/parser/ruby20.y +0 -2353
  69. data/lib/parser/ruby21.y +0 -2357
  70. data/lib/parser/ruby22.y +0 -2364
  71. data/lib/parser/ruby23.y +0 -2370
  72. data/lib/parser/ruby24.y +0 -2395
  73. data/lib/parser/ruby25.y +0 -2392
  74. data/lib/parser/ruby26.y +0 -2392
  75. data/lib/parser/rubymotion.y +0 -2182
  76. data/test/bug_163/fixtures/input.rb +0 -5
  77. data/test/bug_163/fixtures/output.rb +0 -5
  78. data/test/bug_163/rewriter.rb +0 -20
  79. data/test/helper.rb +0 -52
  80. data/test/parse_helper.rb +0 -315
  81. data/test/racc_coverage_helper.rb +0 -133
  82. data/test/test_base.rb +0 -31
  83. data/test/test_current.rb +0 -27
  84. data/test/test_diagnostic.rb +0 -96
  85. data/test/test_diagnostic_engine.rb +0 -62
  86. data/test/test_encoding.rb +0 -99
  87. data/test/test_lexer.rb +0 -3537
  88. data/test/test_lexer_stack_state.rb +0 -78
  89. data/test/test_parse_helper.rb +0 -80
  90. data/test/test_parser.rb +0 -6968
  91. data/test/test_runner_rewrite.rb +0 -47
  92. data/test/test_source_buffer.rb +0 -162
  93. data/test/test_source_comment.rb +0 -36
  94. data/test/test_source_comment_associator.rb +0 -367
  95. data/test/test_source_map.rb +0 -15
  96. data/test/test_source_range.rb +0 -172
  97. data/test/test_source_rewriter.rb +0 -541
  98. data/test/test_source_rewriter_action.rb +0 -46
  99. data/test/test_source_tree_rewriter.rb +0 -173
  100. data/test/test_static_environment.rb +0 -45
  101. data/test/using_tree_rewriter/fixtures/input.rb +0 -3
  102. data/test/using_tree_rewriter/fixtures/output.rb +0 -3
  103. data/test/using_tree_rewriter/using_tree_rewriter.rb +0 -9
data/doc/CUSTOMIZATION.md DELETED
@@ -1,37 +0,0 @@
1
- # Customizing Parsers
2
-
3
- While the default setup of the parsers provided by this Gem should be suitable
4
- for most, some developers might want to change parts of it. An example would be
5
- the use of a custom class for nodes instead of `Parser::AST::Node`.
6
-
7
- Customizing the AST is done by creating a custom builder class and passing it
8
- to the constructor method of a parser. The default setup comes down to the
9
- following:
10
-
11
- builder = Parser::Builders::Default.new
12
- parser = Parser::Ruby19.new(builder)
13
-
14
- When creating your own builder class it's best to subclass the default one so
15
- that you don't have to redefine every used method again:
16
-
17
- class MyBuilder < Parser::Builders::Default
18
-
19
- end
20
-
21
- builder = MyBuilder.new
22
- parser = Parser::Ruby19.new(builder)
23
-
24
- ## Custom Node Classes
25
-
26
- To use a custom node class you have to override the method
27
- `Parser::Builders::Default#n`:
28
-
29
- class MyBuilder < Parser::Builders::Default
30
- def n(type, children, location)
31
- return MyNodeClass.new(type, children, :location => location)
32
- end
33
- end
34
-
35
- Note that the used class (and corresponding instance) must be compatible with
36
- `Parser::AST::Node` so it's best to subclass it and override/add code where
37
- needed.
data/doc/INTERNALS.md DELETED
@@ -1,21 +0,0 @@
1
- Entry points
2
- ------------
3
-
4
- Parser should be kept as slim as possible. This includes not loading
5
- any potentially large files when they are likely to be unused in practice.
6
-
7
- Parser has five main (classes of) `require` entry points:
8
-
9
- * `require 'parser'`. Main entry point, requires all classes which
10
- are used across the entire library.
11
- * `require 'parser/rubyXX'`. Version-specific entry point. Can raise
12
- a NotImplementedError if current Ruby runtime is unable to parse the
13
- requested Ruby version.
14
- * `require 'parser/all'`. Requires all available parsers for released
15
- versions of Ruby. Can raise NotImplementedError.
16
- * `require 'parser/runner'`. Requires all the stuff which is useful for
17
- command-line tools but not otherwise.
18
- * `require 'parser/runner/X'`. Runner-specific entry point.
19
-
20
- All non-main entry points internally `require 'parser'`. Additionally, all
21
- runner-specific entry points internally `require 'parser/runner'`.
data/doc/css/.gitkeep DELETED
File without changes
data/doc/css/common.css DELETED
@@ -1,68 +0,0 @@
1
- body
2
- {
3
- font-size: 14px;
4
- line-height: 1.6;
5
- margin: 0 auto;
6
- max-width: 960px;
7
- }
8
-
9
- p code
10
- {
11
- background: #f2f2f2;
12
- padding-left: 3px;
13
- padding-right: 3px;
14
- }
15
-
16
- pre.code
17
- {
18
- font-size: 13px;
19
- line-height: 1.4;
20
- }
21
-
22
- /**
23
- * YARD uses generic table styles, using a special class means those tables
24
- * don't get messed up.
25
- */
26
- .table
27
- {
28
- border: 1px solid #ccc;
29
- border-right: none;
30
- border-collapse: separate;
31
- border-spacing: 0;
32
- text-align: left;
33
- }
34
-
35
- .table.full
36
- {
37
- width: 100%;
38
- }
39
-
40
- .table .field_name
41
- {
42
- min-width: 160px;
43
- }
44
-
45
- .table thead tr th.no_sort:first-child
46
- {
47
- width: 25px;
48
- }
49
-
50
- .table thead tr th, .table tbody tr td
51
- {
52
- border-bottom: 1px solid #ccc;
53
- border-right: 1px solid #ccc;
54
- min-width: 20px;
55
- padding: 8px 5px;
56
- text-align: left;
57
- vertical-align: top;
58
- }
59
-
60
- .table tbody tr:last-child td
61
- {
62
- border-bottom: none;
63
- }
64
-
65
- .table tr:nth-child(odd) td
66
- {
67
- background: #f9f9f9;
68
- }
data/lib/parser/lexer.rl DELETED
@@ -1,2376 +0,0 @@
1
- %%machine lex; # % fix highlighting
2
-
3
- #
4
- # === BEFORE YOU START ===
5
- #
6
- # Read the Ruby Hacking Guide chapter 11, available in English at
7
- # http://whitequark.org/blog/2013/04/01/ruby-hacking-guide-ch-11-finite-state-lexer/
8
- #
9
- # Remember two things about Ragel scanners:
10
- #
11
- # 1) Longest match wins.
12
- #
13
- # 2) If two matches have the same length, the first
14
- # in source code wins.
15
- #
16
- # General rules of making Ragel and Bison happy:
17
- #
18
- # * `p` (position) and `@te` contain the index of the character
19
- # they're pointing to ("current"), plus one. `@ts` contains the index
20
- # of the corresponding character. The code for extracting matched token is:
21
- #
22
- # @source_buffer.slice(@ts...@te)
23
- #
24
- # * If your input is `foooooooobar` and the rule is:
25
- #
26
- # 'f' 'o'+
27
- #
28
- # the result will be:
29
- #
30
- # foooooooobar
31
- # ^ ts=0 ^ p=te=9
32
- #
33
- # * A Ragel lexer action should not emit more than one token, unless
34
- # you know what you are doing.
35
- #
36
- # * All Ragel commands (fnext, fgoto, ...) end with a semicolon.
37
- #
38
- # * If an action emits the token and transitions to another state, use
39
- # these Ragel commands:
40
- #
41
- # emit($whatever)
42
- # fnext $next_state; fbreak;
43
- #
44
- # If you perform `fgoto` in an action which neither emits a token nor
45
- # rewinds the stream pointer, the parser's side-effectful,
46
- # context-sensitive lookahead actions will break in a hard to detect
47
- # and debug way.
48
- #
49
- # * If an action does not emit a token:
50
- #
51
- # fgoto $next_state;
52
- #
53
- # * If an action features lookbehind, i.e. matches characters with the
54
- # intent of passing them to another action:
55
- #
56
- # p = @ts - 1
57
- # fgoto $next_state;
58
- #
59
- # or, if the lookbehind consists of a single character:
60
- #
61
- # fhold; fgoto $next_state;
62
- #
63
- # * Ragel merges actions. So, if you have `e_lparen = '(' %act` and
64
- # `c_lparen = '('` and a lexer action `e_lparen | c_lparen`, the result
65
- # _will_ invoke the action `act`.
66
- #
67
- # e_something stands for "something with **e**mbedded action".
68
- #
69
- # * EOF is explicit and is matched by `c_eof`. If you want to introspect
70
- # the state of the lexer, add this rule to the state:
71
- #
72
- # c_eof => do_eof;
73
- #
74
- # * If you proceed past EOF, the lexer will complain:
75
- #
76
- # NoMethodError: undefined method `ord' for nil:NilClass
77
- #
78
-
79
- class Parser::Lexer
80
-
81
- %% write data nofinal;
82
- # %
83
-
84
- ESCAPES = {
85
- ?a.ord => "\a", ?b.ord => "\b", ?e.ord => "\e", ?f.ord => "\f",
86
- ?n.ord => "\n", ?r.ord => "\r", ?s.ord => "\s", ?t.ord => "\t",
87
- ?v.ord => "\v", ?\\.ord => "\\"
88
- }.freeze
89
-
90
- REGEXP_META_CHARACTERS = Regexp.union(*"\\$()*+.<>?[]^{|}".chars).freeze
91
-
92
- attr_reader :source_buffer
93
-
94
- attr_accessor :diagnostics
95
- attr_accessor :static_env
96
- attr_accessor :force_utf32
97
-
98
- attr_accessor :cond, :cmdarg, :in_kwarg
99
-
100
- attr_accessor :tokens, :comments
101
-
102
- def initialize(version)
103
- @version = version
104
- @static_env = nil
105
-
106
- @tokens = nil
107
- @comments = nil
108
-
109
- reset
110
- end
111
-
112
- def reset(reset_state=true)
113
- # Ragel state:
114
- if reset_state
115
- # Unit tests set state prior to resetting lexer.
116
- @cs = self.class.lex_en_line_begin
117
-
118
- @cond = StackState.new('cond')
119
- @cmdarg = StackState.new('cmdarg')
120
- @cond_stack = []
121
- @cmdarg_stack = []
122
- end
123
-
124
- @force_utf32 = false # Set to true by some tests
125
-
126
- @source_pts = nil # @source as a codepoint array
127
-
128
- @p = 0 # stream position (saved manually in #advance)
129
- @ts = nil # token start
130
- @te = nil # token end
131
- @act = 0 # next action
132
-
133
- @stack = [] # state stack
134
- @top = 0 # state stack top pointer
135
-
136
- # Lexer state:
137
- @token_queue = []
138
- @literal_stack = []
139
-
140
- @eq_begin_s = nil # location of last encountered =begin
141
- @sharp_s = nil # location of last encountered #
142
-
143
- @newline_s = nil # location of last encountered newline
144
-
145
- @num_base = nil # last numeric base
146
- @num_digits_s = nil # starting position of numeric digits
147
- @num_suffix_s = nil # starting position of numeric suffix
148
- @num_xfrm = nil # numeric suffix-induced transformation
149
-
150
- @escape_s = nil # starting position of current sequence
151
- @escape = nil # last escaped sequence, as string
152
-
153
- @herebody_s = nil # starting position of current heredoc line
154
-
155
- # Ruby 1.9 ->() lambdas emit a distinct token if do/{ is
156
- # encountered after a matching closing parenthesis.
157
- @paren_nest = 0
158
- @lambda_stack = []
159
-
160
- # After encountering the closing line of <<~SQUIGGLY_HEREDOC,
161
- # we store the indentation level and give it out to the parser
162
- # on request. It is not possible to infer indentation level just
163
- # from the AST because escape sequences such as `\ ` or `\t` are
164
- # expanded inside the lexer, but count as non-whitespace for
165
- # indentation purposes.
166
- @dedent_level = nil
167
-
168
- # If the lexer is in `command state' (aka expr_value)
169
- # at the entry to #advance, it will transition to expr_cmdarg
170
- # instead of expr_arg at certain points.
171
- @command_state = false
172
-
173
- # True at the end of "def foo a:"
174
- @in_kwarg = false
175
-
176
- # State before =begin / =end block comment
177
- @cs_before_block_comment = self.class.lex_en_line_begin
178
- end
179
-
180
- def source_buffer=(source_buffer)
181
- @source_buffer = source_buffer
182
-
183
- if @source_buffer
184
- source = @source_buffer.source
185
-
186
- if source.encoding == Encoding::UTF_8
187
- @source_pts = source.unpack('U*')
188
- else
189
- @source_pts = source.unpack('C*')
190
- end
191
-
192
- if @source_pts[0] == 0xfeff
193
- # Skip byte order mark.
194
- @p = 1
195
- end
196
- else
197
- @source_pts = nil
198
- end
199
- end
200
-
201
- def encoding
202
- @source_buffer.source.encoding
203
- end
204
-
205
- LEX_STATES = {
206
- :line_begin => lex_en_line_begin,
207
- :expr_dot => lex_en_expr_dot,
208
- :expr_fname => lex_en_expr_fname,
209
- :expr_value => lex_en_expr_value,
210
- :expr_beg => lex_en_expr_beg,
211
- :expr_mid => lex_en_expr_mid,
212
- :expr_arg => lex_en_expr_arg,
213
- :expr_cmdarg => lex_en_expr_cmdarg,
214
- :expr_end => lex_en_expr_end,
215
- :expr_endarg => lex_en_expr_endarg,
216
- :expr_endfn => lex_en_expr_endfn,
217
- :expr_labelarg => lex_en_expr_labelarg,
218
-
219
- :interp_string => lex_en_interp_string,
220
- :interp_words => lex_en_interp_words,
221
- :plain_string => lex_en_plain_string,
222
- :plain_words => lex_en_plain_string,
223
- }
224
-
225
- def state
226
- LEX_STATES.invert.fetch(@cs, @cs)
227
- end
228
-
229
- def state=(state)
230
- @cs = LEX_STATES.fetch(state)
231
- end
232
-
233
- def push_cmdarg
234
- @cmdarg_stack.push(@cmdarg)
235
- @cmdarg = StackState.new("cmdarg.#{@cmdarg_stack.count}")
236
- end
237
-
238
- def pop_cmdarg
239
- @cmdarg = @cmdarg_stack.pop
240
- end
241
-
242
- def push_cond
243
- @cond_stack.push(@cond)
244
- @cond = StackState.new("cond.#{@cond_stack.count}")
245
- end
246
-
247
- def pop_cond
248
- @cond = @cond_stack.pop
249
- end
250
-
251
- def dedent_level
252
- # We erase @dedent_level as a precaution to avoid accidentally
253
- # using a stale value.
254
- dedent_level, @dedent_level = @dedent_level, nil
255
- dedent_level
256
- end
257
-
258
- # Return next token: [type, value].
259
- def advance
260
- if @token_queue.any?
261
- return @token_queue.shift
262
- end
263
-
264
- # Ugly, but dependent on Ragel output. Consider refactoring it somehow.
265
- klass = self.class
266
- _lex_trans_keys = klass.send :_lex_trans_keys
267
- _lex_key_spans = klass.send :_lex_key_spans
268
- _lex_index_offsets = klass.send :_lex_index_offsets
269
- _lex_indicies = klass.send :_lex_indicies
270
- _lex_trans_targs = klass.send :_lex_trans_targs
271
- _lex_trans_actions = klass.send :_lex_trans_actions
272
- _lex_to_state_actions = klass.send :_lex_to_state_actions
273
- _lex_from_state_actions = klass.send :_lex_from_state_actions
274
- _lex_eof_trans = klass.send :_lex_eof_trans
275
-
276
- pe = @source_pts.size + 2
277
- p, eof = @p, pe
278
-
279
- @command_state = (@cs == klass.lex_en_expr_value ||
280
- @cs == klass.lex_en_line_begin)
281
-
282
- %% write exec;
283
- # %
284
-
285
- @p = p
286
-
287
- if @token_queue.any?
288
- @token_queue.shift
289
- elsif @cs == klass.lex_error
290
- [ false, [ '$error'.freeze, range(p - 1, p) ] ]
291
- else
292
- eof = @source_pts.size
293
- [ false, [ '$eof'.freeze, range(eof, eof) ] ]
294
- end
295
- end
296
-
297
- protected
298
-
299
- def eof_codepoint?(point)
300
- [0x04, 0x1a, 0x00].include? point
301
- end
302
-
303
- def version?(*versions)
304
- versions.include?(@version)
305
- end
306
-
307
- def stack_pop
308
- @top -= 1
309
- @stack[@top]
310
- end
311
-
312
- def encode_escape(ord)
313
- ord.chr.force_encoding(@source_buffer.source.encoding)
314
- end
315
-
316
- def tok(s = @ts, e = @te)
317
- @source_buffer.slice(s...e)
318
- end
319
-
320
- def range(s = @ts, e = @te)
321
- Parser::Source::Range.new(@source_buffer, s, e)
322
- end
323
-
324
- def emit(type, value = tok, s = @ts, e = @te)
325
- token = [ type, [ value, range(s, e) ] ]
326
-
327
- @token_queue.push(token)
328
-
329
- @tokens.push(token) if @tokens
330
-
331
- token
332
- end
333
-
334
- def emit_table(table, s = @ts, e = @te)
335
- value = tok(s, e)
336
-
337
- emit(table[value], value, s, e)
338
- end
339
-
340
- def emit_do(do_block=false)
341
- if @cond.active?
342
- emit(:kDO_COND, 'do'.freeze)
343
- elsif @cmdarg.active? || do_block
344
- emit(:kDO_BLOCK, 'do'.freeze)
345
- else
346
- emit(:kDO, 'do'.freeze)
347
- end
348
- end
349
-
350
- def arg_or_cmdarg
351
- if @command_state
352
- self.class.lex_en_expr_cmdarg
353
- else
354
- self.class.lex_en_expr_arg
355
- end
356
- end
357
-
358
- def emit_comment(s = @ts, e = @te)
359
- if @comments
360
- @comments.push(Parser::Source::Comment.new(range(s, e)))
361
- end
362
-
363
- if @tokens
364
- @tokens.push([ :tCOMMENT, [ tok(s, e), range(s, e) ] ])
365
- end
366
-
367
- nil
368
- end
369
-
370
- def diagnostic(type, reason, arguments=nil, location=range, highlights=[])
371
- @diagnostics.process(
372
- Parser::Diagnostic.new(type, reason, arguments, location, highlights))
373
- end
374
-
375
- #
376
- # === LITERAL STACK ===
377
- #
378
-
379
- def push_literal(*args)
380
- new_literal = Literal.new(self, *args)
381
- @literal_stack.push(new_literal)
382
- next_state_for_literal(new_literal)
383
- end
384
-
385
- def next_state_for_literal(literal)
386
- if literal.words? && literal.backslash_delimited?
387
- if literal.interpolate?
388
- self.class.lex_en_interp_backslash_delimited_words
389
- else
390
- self.class.lex_en_plain_backslash_delimited_words
391
- end
392
- elsif literal.words? && !literal.backslash_delimited?
393
- if literal.interpolate?
394
- self.class.lex_en_interp_words
395
- else
396
- self.class.lex_en_plain_words
397
- end
398
- elsif !literal.words? && literal.backslash_delimited?
399
- if literal.interpolate?
400
- self.class.lex_en_interp_backslash_delimited
401
- else
402
- self.class.lex_en_plain_backslash_delimited
403
- end
404
- else
405
- if literal.interpolate?
406
- self.class.lex_en_interp_string
407
- else
408
- self.class.lex_en_plain_string
409
- end
410
- end
411
- end
412
-
413
- def literal
414
- @literal_stack.last
415
- end
416
-
417
- def pop_literal
418
- old_literal = @literal_stack.pop
419
-
420
- @dedent_level = old_literal.dedent_level
421
-
422
- if old_literal.type == :tREGEXP_BEG
423
- # Fetch modifiers.
424
- self.class.lex_en_regexp_modifiers
425
- else
426
- self.class.lex_en_expr_end
427
- end
428
- end
429
-
430
- # Mapping of strings to parser tokens.
431
-
432
- PUNCTUATION = {
433
- '=' => :tEQL, '&' => :tAMPER2, '|' => :tPIPE,
434
- '!' => :tBANG, '^' => :tCARET, '+' => :tPLUS,
435
- '-' => :tMINUS, '*' => :tSTAR2, '/' => :tDIVIDE,
436
- '%' => :tPERCENT, '~' => :tTILDE, ',' => :tCOMMA,
437
- ';' => :tSEMI, '.' => :tDOT, '..' => :tDOT2,
438
- '...' => :tDOT3, '[' => :tLBRACK2, ']' => :tRBRACK,
439
- '(' => :tLPAREN2, ')' => :tRPAREN, '?' => :tEH,
440
- ':' => :tCOLON, '&&' => :tANDOP, '||' => :tOROP,
441
- '-@' => :tUMINUS, '+@' => :tUPLUS, '~@' => :tTILDE,
442
- '**' => :tPOW, '->' => :tLAMBDA, '=~' => :tMATCH,
443
- '!~' => :tNMATCH, '==' => :tEQ, '!=' => :tNEQ,
444
- '>' => :tGT, '>>' => :tRSHFT, '>=' => :tGEQ,
445
- '<' => :tLT, '<<' => :tLSHFT, '<=' => :tLEQ,
446
- '=>' => :tASSOC, '::' => :tCOLON2, '===' => :tEQQ,
447
- '<=>' => :tCMP, '[]' => :tAREF, '[]=' => :tASET,
448
- '{' => :tLCURLY, '}' => :tRCURLY, '`' => :tBACK_REF2,
449
- '!@' => :tBANG, '&.' => :tANDDOT,
450
- }
451
-
452
- PUNCTUATION_BEGIN = {
453
- '&' => :tAMPER, '*' => :tSTAR, '**' => :tDSTAR,
454
- '+' => :tUPLUS, '-' => :tUMINUS, '::' => :tCOLON3,
455
- '(' => :tLPAREN, '{' => :tLBRACE, '[' => :tLBRACK,
456
- }
457
-
458
- KEYWORDS = {
459
- 'if' => :kIF_MOD, 'unless' => :kUNLESS_MOD,
460
- 'while' => :kWHILE_MOD, 'until' => :kUNTIL_MOD,
461
- 'rescue' => :kRESCUE_MOD, 'defined?' => :kDEFINED,
462
- 'BEGIN' => :klBEGIN, 'END' => :klEND,
463
- }
464
-
465
- KEYWORDS_BEGIN = {
466
- 'if' => :kIF, 'unless' => :kUNLESS,
467
- 'while' => :kWHILE, 'until' => :kUNTIL,
468
- 'rescue' => :kRESCUE, 'defined?' => :kDEFINED,
469
- 'BEGIN' => :klBEGIN, 'END' => :klEND,
470
- }
471
-
472
- %w(class module def undef begin end then elsif else ensure case when
473
- for break next redo retry in do return yield super self nil true
474
- false and or not alias __FILE__ __LINE__ __ENCODING__).each do |keyword|
475
- KEYWORDS_BEGIN[keyword] = KEYWORDS[keyword] = :"k#{keyword.upcase}"
476
- end
477
-
478
- %%{
479
- # %
480
-
481
- access @;
482
- getkey (@source_pts[p] || 0);
483
-
484
- # === CHARACTER CLASSES ===
485
- #
486
- # Pay close attention to the differences between c_any and any.
487
- # c_any does not include EOF and so will cause incorrect behavior
488
- # for machine subtraction (any-except rules) and default transitions
489
- # for scanners.
490
-
491
- action do_nl {
492
- # Record position of a newline for precise location reporting on tNL
493
- # tokens.
494
- #
495
- # This action is embedded directly into c_nl, as it is idempotent and
496
- # there are no cases when we need to skip it.
497
- @newline_s = p
498
- }
499
-
500
- c_nl = '\n' $ do_nl;
501
- c_space = [ \t\r\f\v];
502
- c_space_nl = c_space | c_nl;
503
-
504
- c_eof = 0x04 | 0x1a | 0 | zlen; # ^D, ^Z, \0, EOF
505
- c_eol = c_nl | c_eof;
506
- c_any = any - c_eof;
507
-
508
- c_nl_zlen = c_nl | zlen;
509
- c_line = any - c_nl_zlen;
510
-
511
- c_unicode = c_any - 0x00..0x7f;
512
- c_upper = [A-Z];
513
- c_lower = [a-z_] | c_unicode;
514
- c_alpha = c_lower | c_upper;
515
- c_alnum = c_alpha | [0-9];
516
-
517
- action do_eof {
518
- # Sit at EOF indefinitely. #advance would return $eof each time.
519
- # This allows feeding the lexer more data if needed; this is only used
520
- # in tests.
521
- #
522
- # Note that this action is not embedded into e_eof like e_heredoc_nl and e_bs
523
- # below. This is due to the fact that scanner state at EOF is observed
524
- # by tests, and encapsulating it in a rule would break the introspection.
525
- fhold; fbreak;
526
- }
527
-
528
- #
529
- # === TOKEN DEFINITIONS ===
530
- #
531
-
532
- # All operators are punctuation. There is more to punctuation
533
- # than just operators. Operators can be overridden by user;
534
- # punctuation can not.
535
-
536
- # A list of operators which are valid in the function name context, but
537
- # have different semantics in others.
538
- operator_fname = '[]' | '[]=' | '`' | '-@' | '+@' | '~@' | '!@' ;
539
-
540
- # A list of operators which can occur within an assignment shortcut (+ → +=).
541
- operator_arithmetic = '&' | '|' | '&&' | '||' | '^' | '+' | '-' |
542
- '*' | '/' | '**' | '~' | '<<' | '>>' | '%' ;
543
-
544
- # A list of all user-definable operators not covered by groups above.
545
- operator_rest = '=~' | '!~' | '==' | '!=' | '!' | '===' |
546
- '<' | '<=' | '>' | '>=' | '<=>' | '=>' ;
547
-
548
- # Note that `{` and `}` need to be referred to as e_lbrace and e_rbrace,
549
- # as they are ambiguous with interpolation `#{}` and should be counted.
550
- # These braces are not present in punctuation lists.
551
-
552
- # A list of punctuation which has different meaning when used at the
553
- # beginning of expression.
554
- punctuation_begin = '-' | '+' | '::' | '(' | '[' |
555
- '*' | '**' | '&' ;
556
-
557
- # A list of all punctuation except punctuation_begin.
558
- punctuation_end = ',' | '=' | '->' | '(' | '[' | ']' |
559
- '::' | '?' | ':' | '.' | '..' | '...' ;
560
-
561
- # A list of keywords which have different meaning at the beginning of expression.
562
- keyword_modifier = 'if' | 'unless' | 'while' | 'until' | 'rescue' ;
563
-
564
- # A list of keywords which accept an argument-like expression, i.e. have the
565
- # same post-processing as method calls or commands. Example: `yield 1`,
566
- # `yield (1)`, `yield(1)`, are interpreted as if `yield` was a function.
567
- keyword_with_arg = 'yield' | 'super' | 'not' | 'defined?' ;
568
-
569
- # A list of keywords which accept a literal function name as an argument.
570
- keyword_with_fname = 'def' | 'undef' | 'alias' ;
571
-
572
- # A list of keywords which accept an expression after them.
573
- keyword_with_value = 'else' | 'case' | 'ensure' | 'module' | 'elsif' | 'then' |
574
- 'for' | 'in' | 'do' | 'when' | 'begin' | 'class' |
575
- 'and' | 'or' ;
576
-
577
- # A list of keywords which accept a value, and treat the keywords from
578
- # `keyword_modifier` list as modifiers.
579
- keyword_with_mid = 'rescue' | 'return' | 'break' | 'next' ;
580
-
581
- # A list of keywords which do not accept an expression after them.
582
- keyword_with_end = 'end' | 'self' | 'true' | 'false' | 'retry' |
583
- 'redo' | 'nil' | 'BEGIN' | 'END' | '__FILE__' |
584
- '__LINE__' | '__ENCODING__';
585
-
586
- # All keywords.
587
- keyword = keyword_with_value | keyword_with_mid |
588
- keyword_with_end | keyword_with_arg |
589
- keyword_with_fname | keyword_modifier ;
590
-
591
- constant = c_upper c_alnum*;
592
- bareword = c_alpha c_alnum*;
593
-
594
- call_or_var = c_lower c_alnum*;
595
- class_var = '@@' bareword;
596
- instance_var = '@' bareword;
597
- global_var = '$'
598
- ( bareword | digit+
599
- | [`'+~*$&?!@/\\;,.=:<>"] # `
600
- | '-' c_alnum
601
- )
602
- ;
603
-
604
- # Ruby accepts (and fails on) variables with leading digit
605
- # in literal context, but not in unquoted symbol body.
606
- class_var_v = '@@' c_alnum+;
607
- instance_var_v = '@' c_alnum+;
608
-
609
- label = bareword [?!]? ':';
610
-
611
- #
612
- # === NUMERIC PARSING ===
613
- #
614
-
615
- int_hex = ( xdigit+ '_' )* xdigit* '_'? ;
616
- int_dec = ( digit+ '_' )* digit* '_'? ;
617
- int_bin = ( [01]+ '_' )* [01]* '_'? ;
618
-
619
- flo_int = [1-9] [0-9]* ( '_' digit+ )* | '0';
620
- flo_frac = '.' ( digit+ '_' )* digit+;
621
- flo_pow = [eE] [+\-]? ( digit+ '_' )* digit+;
622
-
623
- int_suffix =
624
- '' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars) } }
625
- | 'r' % { @num_xfrm = lambda { |chars| emit(:tRATIONAL, Rational(chars)) } }
626
- | 'i' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, chars)) } }
627
- | 'ri' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, Rational(chars))) } }
628
- | 'if' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars, @ts, @te - 2); p -= 2 } }
629
- | 'rescue' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars, @ts, @te - 6); p -= 6 } };
630
-
631
- flo_pow_suffix =
632
- '' % { @num_xfrm = lambda { |chars| emit(:tFLOAT, Float(chars)) } }
633
- | 'i' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, Float(chars))) } }
634
- | 'if' % { @num_xfrm = lambda { |chars| emit(:tFLOAT, Float(chars), @ts, @te - 2); p -= 2 } };
635
-
636
- flo_suffix =
637
- flo_pow_suffix
638
- | 'r' % { @num_xfrm = lambda { |chars| emit(:tRATIONAL, Rational(chars)) } }
639
- | 'ri' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, Rational(chars))) } }
640
- | 'rescue' % { @num_xfrm = lambda { |chars| emit(:tFLOAT, Float(chars), @ts, @te - 6); p -= 6 } };
641
-
642
- #
643
- # === ESCAPE SEQUENCE PARSING ===
644
- #
645
-
646
- # Escape parsing code is a Ragel pattern, not a scanner, and therefore
647
- # it shouldn't directly raise errors or perform other actions with side effects.
648
- # In reality this would probably just mess up error reporting in pathological
649
- # cases, though.
650
-
651
- # The amount of code required to parse \M\C stuff correctly is ridiculous.
652
-
653
- escaped_nl = "\\" c_nl;
654
-
655
- action unicode_points {
656
- @escape = ""
657
-
658
- codepoints = tok(@escape_s + 2, p - 1)
659
- codepoint_s = @escape_s + 2
660
-
661
- if @version < 24
662
- if codepoints.start_with?(" ") || codepoints.start_with?("\t")
663
- diagnostic :fatal, :invalid_unicode_escape, nil,
664
- range(@escape_s + 2, @escape_s + 3)
665
- end
666
-
667
- if spaces_p = codepoints.index(/[ \t]{2}/)
668
- diagnostic :fatal, :invalid_unicode_escape, nil,
669
- range(codepoint_s + spaces_p + 1, codepoint_s + spaces_p + 2)
670
- end
671
-
672
- if codepoints.end_with?(" ") || codepoints.end_with?("\t")
673
- diagnostic :fatal, :invalid_unicode_escape, nil, range(p - 1, p)
674
- end
675
- end
676
-
677
- codepoints.scan(/([0-9a-fA-F]+)|([ \t]+)/).each do |(codepoint_str, spaces)|
678
- if spaces
679
- codepoint_s += spaces.length
680
- else
681
- codepoint = codepoint_str.to_i(16)
682
-
683
- if codepoint >= 0x110000
684
- diagnostic :error, :unicode_point_too_large, nil,
685
- range(codepoint_s, codepoint_s + codepoint_str.length)
686
- break
687
- end
688
-
689
- @escape += codepoint.chr(Encoding::UTF_8)
690
- codepoint_s += codepoint_str.length
691
- end
692
- end
693
- }
694
-
695
- action unescape_char {
696
- codepoint = @source_pts[p - 1]
697
- if (@escape = ESCAPES[codepoint]).nil?
698
- @escape = encode_escape(@source_buffer.slice(p - 1))
699
- end
700
- }
701
-
702
- action invalid_complex_escape {
703
- diagnostic :fatal, :invalid_escape
704
- }
705
-
706
- action slash_c_char {
707
- @escape = encode_escape(@escape[0].ord & 0x9f)
708
- }
709
-
710
- action slash_m_char {
711
- @escape = encode_escape(@escape[0].ord | 0x80)
712
- }
713
-
714
- maybe_escaped_char = (
715
- '\\' c_any %unescape_char
716
- | ( c_any - [\\] ) % { @escape = @source_buffer.slice(p - 1).chr }
717
- );
718
-
719
- maybe_escaped_ctrl_char = ( # why?!
720
- '\\' c_any %unescape_char %slash_c_char
721
- | '?' % { @escape = "\x7f" }
722
- | ( c_any - [\\?] ) % { @escape = @source_buffer.slice(p - 1).chr } %slash_c_char
723
- );
724
-
725
- escape = (
726
- # \377
727
- [0-7]{1,3}
728
- % { @escape = encode_escape(tok(@escape_s, p).to_i(8) % 0x100) }
729
-
730
- # \xff
731
- | 'x' xdigit{1,2}
732
- % { @escape = encode_escape(tok(@escape_s + 1, p).to_i(16)) }
733
-
734
- # %q[\x]
735
- | 'x' ( c_any - xdigit )
736
- % {
737
- diagnostic :fatal, :invalid_hex_escape, nil, range(@escape_s - 1, p + 2)
738
- }
739
-
740
- # \u263a
741
- | 'u' xdigit{4}
742
- % { @escape = tok(@escape_s + 1, p).to_i(16).chr(Encoding::UTF_8) }
743
-
744
- # \u123
745
- | 'u' xdigit{0,3}
746
- % {
747
- diagnostic :fatal, :invalid_unicode_escape, nil, range(@escape_s - 1, p)
748
- }
749
-
750
- # u{not hex} or u{}
751
- | 'u{' ( c_any - xdigit - [ \t}] )* '}'
752
- % {
753
- diagnostic :fatal, :invalid_unicode_escape, nil, range(@escape_s - 1, p)
754
- }
755
-
756
- # \u{ \t 123 \t 456 \t\t }
757
- | 'u{' [ \t]* ( xdigit{1,6} [ \t]+ )*
758
- (
759
- ( xdigit{1,6} [ \t]* '}'
760
- %unicode_points
761
- )
762
- |
763
- ( xdigit* ( c_any - xdigit - [ \t}] )+ '}'
764
- | ( c_any - [ \t}] )* c_eof
765
- | xdigit{7,}
766
- ) % {
767
- diagnostic :fatal, :unterminated_unicode, nil, range(p - 1, p)
768
- }
769
- )
770
-
771
- # \C-\a \cx
772
- | ( 'C-' | 'c' ) escaped_nl?
773
- maybe_escaped_ctrl_char
774
-
775
- # \M-a
776
- | 'M-' escaped_nl?
777
- maybe_escaped_char
778
- %slash_m_char
779
-
780
- # \C-\M-f \M-\cf \c\M-f
781
- | ( ( 'C-' | 'c' ) escaped_nl? '\\M-'
782
- | 'M-\\' escaped_nl? ( 'C-' | 'c' ) ) escaped_nl?
783
- maybe_escaped_ctrl_char
784
- %slash_m_char
785
-
786
- | 'C' c_any %invalid_complex_escape
787
- | 'M' c_any %invalid_complex_escape
788
- | ( 'M-\\C' | 'C-\\M' ) c_any %invalid_complex_escape
789
-
790
- | ( c_any - [0-7xuCMc] ) %unescape_char
791
-
792
- | c_eof % {
793
- diagnostic :fatal, :escape_eof, nil, range(p - 1, p)
794
- }
795
- );
796
-
797
- # Use rules in form of `e_bs escape' when you need to parse a sequence.
798
- e_bs = '\\' % {
799
- @escape_s = p
800
- @escape = nil
801
- };
802
-
803
- #
804
- # === STRING AND HEREDOC PARSING ===
805
- #
806
-
807
- # Heredoc parsing is quite a complex topic. First, consider that heredocs
808
- # can be arbitrarily nested. For example:
809
- #
810
- # puts <<CODE
811
- # the result is: #{<<RESULT.inspect
812
- # i am a heredoc
813
- # RESULT
814
- # }
815
- # CODE
816
- #
817
- # which, incidentally, evaluates to:
818
- #
819
- # the result is: " i am a heredoc\n"
820
- #
821
- # To parse them, the lexer refers to two kinds (remember, nested heredocs)
822
- # of positions in the input stream, namely heredoc_e
823
- # (HEREDOC declaration End) and @herebody_s (HEREdoc BODY line Start).
824
- #
825
- # heredoc_e is simply contained inside the corresponding Literal, and
826
- # when the heredoc is closed, the lexing is restarted from that position.
827
- #
828
- # @herebody_s is considerably more complex. First, @herebody_s changes after each
829
- # heredoc line is lexed. This way, at '\n' tok(@herebody_s, @te) always
830
- # contains the current line, and also when a heredoc is started, @herebody_s
831
- # contains the position from which the heredoc will be lexed.
832
- #
833
- # Second, as (insanity) there are nested heredocs, we need to maintain a
834
- # stack of these positions. Each time #push_literal is called, it saves current
835
- # @herebody_s to literal.saved_herebody_s, and after an interpolation (possibly
836
- # containing other heredocs) is closed, the previous value is restored.
837
-
838
- e_heredoc_nl = c_nl % {
839
- # After every heredoc was parsed, @herebody_s contains the
840
- # position of next token after all heredocs.
841
- if @herebody_s
842
- p = @herebody_s
843
- @herebody_s = nil
844
- end
845
- };
846
-
847
- action extend_string {
848
- string = tok
849
-
850
- # tLABEL_END is only possible in non-cond context on >= 2.2
851
- if @version >= 22 && !@cond.active?
852
- lookahead = @source_buffer.slice(@te...@te+2)
853
- end
854
-
855
- current_literal = literal
856
- if !current_literal.heredoc? &&
857
- (token = current_literal.nest_and_try_closing(string, @ts, @te, lookahead))
858
- if token[0] == :tLABEL_END
859
- p += 1
860
- pop_literal
861
- fnext expr_labelarg;
862
- else
863
- fnext *pop_literal;
864
- end
865
- fbreak;
866
- else
867
- current_literal.extend_string(string, @ts, @te)
868
- end
869
- }
870
-
871
- action extend_string_escaped {
872
- current_literal = literal
873
- # Get the first character after the backslash.
874
- escaped_char = @source_buffer.slice(@escape_s).chr
875
-
876
- if current_literal.munge_escape? escaped_char
877
- # If this particular literal uses this character as an opening
878
- # or closing delimiter, it is an escape sequence for that
879
- # particular character. Write it without the backslash.
880
-
881
- if current_literal.regexp? && REGEXP_META_CHARACTERS.match(escaped_char)
882
- # Regular expressions should include escaped delimiters in their
883
- # escaped form, except when the escaped character is
884
- # a closing delimiter but not a regexp metacharacter.
885
- #
886
- # The backslash itself cannot be used as a closing delimiter
887
- # at the same time as an escape symbol, but it is always munged,
888
- # so this branch also executes for the non-closing-delimiter case
889
- # for the backslash.
890
- current_literal.extend_string(tok, @ts, @te)
891
- else
892
- current_literal.extend_string(escaped_char, @ts, @te)
893
- end
894
- else
895
- # It does not. So this is an actual escape sequence, yay!
896
- if current_literal.regexp?
897
- # Regular expressions should include escape sequences in their
898
- # escaped form. On the other hand, escaped newlines are removed.
899
- current_literal.extend_string(tok.gsub("\\\n".freeze, ''.freeze), @ts, @te)
900
- elsif current_literal.heredoc? && escaped_char == "\n".freeze
901
- if current_literal.squiggly_heredoc?
902
- # Squiggly heredocs like
903
- # <<~-HERE
904
- # 1\
905
- # 2
906
- # HERE
907
- # treat '\' as a line continuation, but still dedent the body, so the heredoc above becomes "12\n".
908
- # This information is emitted as is, without escaping,
909
- # later this escape sequence (\\n) gets handled manually in the Lexer::Dedenter
910
- current_literal.extend_string(tok, @ts, @te)
911
- else
912
- # Plain heredocs also parse \\n as a line continuation,
913
- # but they don't need to know that there was originally a newline in the
914
- # code, so we escape it and emit as " 1 2\n"
915
- current_literal.extend_string(tok.gsub("\\\n".freeze, ''.freeze), @ts, @te)
916
- end
917
- else
918
- current_literal.extend_string(@escape || tok, @ts, @te)
919
- end
920
- end
921
- }
922
-
923
- # Extend a string with a newline or an EOF character.
924
- # As a heredoc closing line can immediately precede EOF, this action
925
- # has to handle that case specially.
926
- action extend_string_eol {
927
- current_literal = literal
928
- if @te == pe
929
- diagnostic :fatal, :string_eof, nil,
930
- range(current_literal.str_s, current_literal.str_s + 1)
931
- end
932
-
933
- if current_literal.heredoc?
934
- line = tok(@herebody_s, @ts).gsub(/\r+$/, ''.freeze)
935
-
936
- if version?(18, 19, 20)
937
- # See ruby:c48b4209c
938
- line = line.gsub(/\r.*$/, ''.freeze)
939
- end
940
-
941
- # Try ending the heredoc with the complete most recently
942
- # scanned line. @herebody_s always refers to the start of such line.
943
- if current_literal.nest_and_try_closing(line, @herebody_s, @ts)
944
- # Adjust @herebody_s to point to the next line.
945
- @herebody_s = @te
946
-
947
- # Continue regular lexing after the heredoc reference (<<END).
948
- p = current_literal.heredoc_e - 1
949
- fnext *pop_literal; fbreak;
950
- else
951
- # Calculate indentation level for <<~HEREDOCs.
952
- current_literal.infer_indent_level(line)
953
-
954
- # Ditto.
955
- @herebody_s = @te
956
- end
957
- else
958
- # Try ending the literal with a newline.
959
- if current_literal.nest_and_try_closing(tok, @ts, @te)
960
- fnext *pop_literal; fbreak;
961
- end
962
-
963
- if @herebody_s
964
- # This is a regular literal intertwined with a heredoc. Like:
965
- #
966
- # p <<-foo+"1
967
- # bar
968
- # foo
969
- # 2"
970
- #
971
- # which, incidentally, evaluates to "bar\n1\n2".
972
- p = @herebody_s - 1
973
- @herebody_s = nil
974
- end
975
- end
976
-
977
- if current_literal.words? && !eof_codepoint?(@source_pts[p])
978
- current_literal.extend_space @ts, @te
979
- else
980
- # A literal newline is appended if the heredoc was _not_ closed
981
- # this time (see fbreak above). See also Literal#nest_and_try_closing
982
- # for rationale of calling #flush_string here.
983
- current_literal.extend_string tok, @ts, @te
984
- current_literal.flush_string
985
- end
986
- }
987
-
988
- action extend_string_space {
989
- literal.extend_space @ts, @te
990
- }
991
-
992
- #
993
- # === INTERPOLATION PARSING ===
994
- #
995
-
996
- # Interpolations with immediate variable names simply call into
997
- # the corresponding machine.
998
-
999
- interp_var = '#' ( global_var | class_var_v | instance_var_v );
1000
-
1001
- action extend_interp_var {
1002
- current_literal = literal
1003
- current_literal.flush_string
1004
- current_literal.extend_content
1005
-
1006
- emit(:tSTRING_DVAR, nil, @ts, @ts + 1)
1007
-
1008
- p = @ts
1009
- fcall expr_variable;
1010
- }
1011
-
1012
- # Interpolations with code blocks must match nested curly braces, as
1013
- # interpolation ending is ambiguous with a block ending. So, every
1014
- # opening and closing brace should be matched with e_[lr]brace rules,
1015
- # which automatically perform the counting.
1016
- #
1017
- # Note that interpolations can themselves be nested, so brace balance
1018
- # is tied to the innermost literal.
1019
- #
1020
- # Also note that literals themselves should not use e_[lr]brace rules
1021
- # when matching their opening and closing delimiters, as the amount of
1022
- # braces inside the characters of a string literal is independent.
1023
-
1024
- interp_code = '#{';
1025
-
1026
- e_lbrace = '{' % {
1027
- @cond.push(false); @cmdarg.push(false)
1028
-
1029
- current_literal = literal
1030
- if current_literal
1031
- current_literal.start_interp_brace
1032
- end
1033
- };
1034
-
1035
- e_rbrace = '}' % {
1036
- current_literal = literal
1037
- if current_literal
1038
- if current_literal.end_interp_brace_and_try_closing
1039
- if version?(18, 19)
1040
- emit(:tRCURLY, '}'.freeze, p - 1, p)
1041
- if @version < 24
1042
- @cond.lexpop
1043
- @cmdarg.lexpop
1044
- else
1045
- @cond.pop
1046
- @cmdarg.pop
1047
- end
1048
- else
1049
- emit(:tSTRING_DEND, '}'.freeze, p - 1, p)
1050
- end
1051
-
1052
- if current_literal.saved_herebody_s
1053
- @herebody_s = current_literal.saved_herebody_s
1054
- end
1055
-
1056
-
1057
- fhold;
1058
- fnext *next_state_for_literal(current_literal);
1059
- fbreak;
1060
- end
1061
- end
1062
- };
1063
-
1064
- action extend_interp_code {
1065
- current_literal = literal
1066
- current_literal.flush_string
1067
- current_literal.extend_content
1068
-
1069
- emit(:tSTRING_DBEG, '#{'.freeze)
1070
-
1071
- if current_literal.heredoc?
1072
- current_literal.saved_herebody_s = @herebody_s
1073
- @herebody_s = nil
1074
- end
1075
-
1076
- current_literal.start_interp_brace
1077
- fnext expr_value;
1078
- fbreak;
1079
- }
1080
-
1081
- # Actual string parsers are simply combined from the primitives defined
1082
- # above.
1083
-
1084
- interp_words := |*
1085
- interp_code => extend_interp_code;
1086
- interp_var => extend_interp_var;
1087
- e_bs escape => extend_string_escaped;
1088
- c_space+ => extend_string_space;
1089
- c_eol => extend_string_eol;
1090
- c_any => extend_string;
1091
- *|;
1092
-
1093
- interp_string := |*
1094
- interp_code => extend_interp_code;
1095
- interp_var => extend_interp_var;
1096
- e_bs escape => extend_string_escaped;
1097
- c_eol => extend_string_eol;
1098
- c_any => extend_string;
1099
- *|;
1100
-
1101
- plain_words := |*
1102
- e_bs c_any => extend_string_escaped;
1103
- c_space+ => extend_string_space;
1104
- c_eol => extend_string_eol;
1105
- c_any => extend_string;
1106
- *|;
1107
-
1108
- plain_string := |*
1109
- '\\' c_nl => extend_string_eol;
1110
- e_bs c_any => extend_string_escaped;
1111
- c_eol => extend_string_eol;
1112
- c_any => extend_string;
1113
- *|;
1114
-
1115
- interp_backslash_delimited := |*
1116
- interp_code => extend_interp_code;
1117
- interp_var => extend_interp_var;
1118
- c_eol => extend_string_eol;
1119
- c_any => extend_string;
1120
- *|;
1121
-
1122
- plain_backslash_delimited := |*
1123
- c_eol => extend_string_eol;
1124
- c_any => extend_string;
1125
- *|;
1126
-
1127
- interp_backslash_delimited_words := |*
1128
- interp_code => extend_interp_code;
1129
- interp_var => extend_interp_var;
1130
- c_space+ => extend_string_space;
1131
- c_eol => extend_string_eol;
1132
- c_any => extend_string;
1133
- *|;
1134
-
1135
- plain_backslash_delimited_words := |*
1136
- c_space+ => extend_string_space;
1137
- c_eol => extend_string_eol;
1138
- c_any => extend_string;
1139
- *|;
1140
-
1141
- regexp_modifiers := |*
1142
- [A-Za-z]+
1143
- => {
1144
- unknown_options = tok.scan(/[^imxouesn]/)
1145
- if unknown_options.any?
1146
- diagnostic :error, :regexp_options,
1147
- { :options => unknown_options.join }
1148
- end
1149
-
1150
- emit(:tREGEXP_OPT)
1151
- fnext expr_end;
1152
- fbreak;
1153
- };
1154
-
1155
- any
1156
- => {
1157
- emit(:tREGEXP_OPT, tok(@ts, @te - 1), @ts, @te - 1)
1158
- fhold;
1159
- fgoto expr_end;
1160
- };
1161
- *|;
1162
-
1163
- #
1164
- # === WHITESPACE HANDLING ===
1165
- #
1166
-
1167
- # Various contexts in Ruby allow various kinds of whitespace
1168
- # to be used. They are grouped to clarify the lexing machines
1169
- # and ease collection of comments.
1170
-
1171
- # A line of code with inline #comment at end is always equivalent
1172
- # to a line of code ending with just a newline, so an inline
1173
- # comment is deemed equivalent to non-newline whitespace
1174
- # (c_space character class).
1175
-
1176
- w_space =
1177
- c_space+
1178
- | '\\' e_heredoc_nl
1179
- ;
1180
-
1181
- w_comment =
1182
- '#' %{ @sharp_s = p - 1 }
1183
- # The (p == pe) condition compensates for added "\0" and
1184
- # the way Ragel handles EOF.
1185
- c_line* %{ emit_comment(@sharp_s, p == pe ? p - 2 : p) }
1186
- ;
1187
-
1188
- w_space_comment =
1189
- w_space
1190
- | w_comment
1191
- ;
1192
-
1193
- # A newline in non-literal context always interoperates with
1194
- # here document logic and can always be escaped by a backslash,
1195
- # still interoperating with here document logic in the same way,
1196
- # yet being invisible to anything else.
1197
- #
1198
- # To demonstrate:
1199
- #
1200
- # foo = <<FOO \
1201
- # bar
1202
- # FOO
1203
- # + 2
1204
- #
1205
- # is equivalent to `foo = "bar\n" + 2`.
1206
-
1207
- w_newline =
1208
- e_heredoc_nl;
1209
-
1210
- w_any =
1211
- w_space
1212
- | w_comment
1213
- | w_newline
1214
- ;
1215
-
1216
-
1217
- #
1218
- # === EXPRESSION PARSING ===
1219
- #
1220
-
1221
- # These rules implement a form of manually defined lookahead.
1222
- # The default longest-match scanning does not work here due
1223
- # to sheer ambiguity.
1224
-
1225
- ambiguous_fid_suffix = # actual parsed
1226
- [?!] %{ tm = p } | # a? a?
1227
- [?!]'=' %{ tm = p - 2 } # a!=b a != b
1228
- ;
1229
-
1230
- ambiguous_ident_suffix = # actual parsed
1231
- ambiguous_fid_suffix |
1232
- '=' %{ tm = p } | # a= a=
1233
- '==' %{ tm = p - 2 } | # a==b a == b
1234
- '=~' %{ tm = p - 2 } | # a=~b a =~ b
1235
- '=>' %{ tm = p - 2 } | # a=>b a => b
1236
- '===' %{ tm = p - 3 } # a===b a === b
1237
- ;
1238
-
1239
- ambiguous_symbol_suffix = # actual parsed
1240
- ambiguous_ident_suffix |
1241
- '==>' %{ tm = p - 2 } # :a==>b :a= => b
1242
- ;
1243
-
1244
- # Ambiguous with 1.9 hash labels.
1245
- ambiguous_const_suffix = # actual parsed
1246
- '::' %{ tm = p - 2 } # A::B A :: B
1247
- ;
1248
-
1249
- # Resolving kDO/kDO_COND/kDO_BLOCK ambiguity requires embedding
1250
- # @cond/@cmdarg-related code to e_lbrack, e_lparen and e_lbrace.
1251
-
1252
- e_lbrack = '[' % {
1253
- @cond.push(false); @cmdarg.push(false)
1254
- };
1255
-
1256
- # Ruby 1.9 lambdas require parentheses counting in order to
1257
- # emit correct opening kDO/tLBRACE.
1258
-
1259
- e_lparen = '(' % {
1260
- @cond.push(false); @cmdarg.push(false)
1261
-
1262
- @paren_nest += 1
1263
- };
1264
-
1265
- e_rparen = ')' % {
1266
- @paren_nest -= 1
1267
- };
1268
-
1269
- # Ruby is context-sensitive wrt/ local identifiers.
1270
- action local_ident {
1271
- emit(:tIDENTIFIER)
1272
-
1273
- if !@static_env.nil? && @static_env.declared?(tok)
1274
- fnext expr_endfn; fbreak;
1275
- else
1276
- fnext *arg_or_cmdarg; fbreak;
1277
- end
1278
- }
1279
-
1280
- # Variable lexing code is accessed from both expressions and
1281
- # string interpolation related code.
1282
- #
1283
- expr_variable := |*
1284
- global_var
1285
- => {
1286
- if tok =~ /^\$([1-9][0-9]*)$/
1287
- emit(:tNTH_REF, tok(@ts + 1).to_i)
1288
- elsif tok =~ /^\$([&`'+])$/
1289
- emit(:tBACK_REF)
1290
- else
1291
- emit(:tGVAR)
1292
- end
1293
-
1294
- fnext *stack_pop; fbreak;
1295
- };
1296
-
1297
- class_var_v
1298
- => {
1299
- if tok =~ /^@@[0-9]/
1300
- diagnostic :error, :cvar_name, { :name => tok }
1301
- end
1302
-
1303
- emit(:tCVAR)
1304
- fnext *stack_pop; fbreak;
1305
- };
1306
-
1307
- instance_var_v
1308
- => {
1309
- if tok =~ /^@[0-9]/
1310
- diagnostic :error, :ivar_name, { :name => tok }
1311
- end
1312
-
1313
- emit(:tIVAR)
1314
- fnext *stack_pop; fbreak;
1315
- };
1316
- *|;
1317
-
1318
- # Literal function name in definition (e.g. `def class`).
1319
- # Keywords are returned as their respective tokens; this is used
1320
- # to support singleton def `def self.foo`. Global variables are
1321
- # returned as `tGVAR`; this is used in global variable alias
1322
- # statements `alias $a $b`. Symbols are returned verbatim; this
1323
- # is used in `alias :a :"b#{foo}"` and `undef :a`.
1324
- #
1325
- # Transitions to `expr_endfn` afterwards.
1326
- #
1327
- expr_fname := |*
1328
- keyword
1329
- => { emit_table(KEYWORDS_BEGIN);
1330
- fnext expr_endfn; fbreak; };
1331
-
1332
- constant
1333
- => { emit(:tCONSTANT)
1334
- fnext expr_endfn; fbreak; };
1335
-
1336
- bareword [?=!]?
1337
- => { emit(:tIDENTIFIER)
1338
- fnext expr_endfn; fbreak; };
1339
-
1340
- global_var
1341
- => { p = @ts - 1
1342
- fnext expr_end; fcall expr_variable; };
1343
-
1344
- # If the handling were to be delegated to expr_end,
1345
- # these cases would transition to something other than
1346
- # expr_endfn, which is incorrect.
1347
- operator_fname |
1348
- operator_arithmetic |
1349
- operator_rest
1350
- => { emit_table(PUNCTUATION)
1351
- fnext expr_endfn; fbreak; };
1352
-
1353
- '::'
1354
- => { fhold; fhold; fgoto expr_end; };
1355
-
1356
- ':'
1357
- => { fhold; fgoto expr_beg; };
1358
-
1359
- '%s' c_any
1360
- => {
1361
- if version?(23)
1362
- type, delimiter = tok[0..-2], tok[-1].chr
1363
- fgoto *push_literal(type, delimiter, @ts);
1364
- else
1365
- p = @ts - 1
1366
- fgoto expr_end;
1367
- end
1368
- };
1369
-
1370
- w_any;
1371
-
1372
- c_any
1373
- => { fhold; fgoto expr_end; };
1374
-
1375
- c_eof => do_eof;
1376
- *|;
1377
-
1378
- # After literal function name in definition. Behaves like `expr_end`,
1379
- # but allows a tLABEL.
1380
- #
1381
- # Transitions to `expr_end` afterwards.
1382
- #
1383
- expr_endfn := |*
1384
- label ( any - ':' )
1385
- => { emit(:tLABEL, tok(@ts, @te - 2), @ts, @te - 1)
1386
- fhold; fnext expr_labelarg; fbreak; };
1387
-
1388
- w_space_comment;
1389
-
1390
- c_any
1391
- => { fhold; fgoto expr_end; };
1392
-
1393
- c_eof => do_eof;
1394
- *|;
1395
-
1396
- # Literal function name in method call (e.g. `a.class`).
1397
- #
1398
- # Transitions to `expr_arg` afterwards.
1399
- #
1400
- expr_dot := |*
1401
- constant
1402
- => { emit(:tCONSTANT)
1403
- fnext *arg_or_cmdarg; fbreak; };
1404
-
1405
- call_or_var
1406
- => { emit(:tIDENTIFIER)
1407
- fnext *arg_or_cmdarg; fbreak; };
1408
-
1409
- bareword ambiguous_fid_suffix
1410
- => { emit(:tFID, tok(@ts, tm), @ts, tm)
1411
- fnext *arg_or_cmdarg; p = tm - 1; fbreak; };
1412
-
1413
- # See the comment in `expr_fname`.
1414
- operator_fname |
1415
- operator_arithmetic |
1416
- operator_rest
1417
- => { emit_table(PUNCTUATION)
1418
- fnext expr_arg; fbreak; };
1419
-
1420
- w_any;
1421
-
1422
- c_any
1423
- => { fhold; fgoto expr_end; };
1424
-
1425
- c_eof => do_eof;
1426
- *|;
1427
-
1428
- # The previous token emitted was a `tIDENTIFIER` or `tFID`; no space
1429
- # is consumed; the current expression is a command or method call.
1430
- #
1431
- expr_arg := |*
1432
- #
1433
- # COMMAND MODE SPECIFIC TOKENS
1434
- #
1435
-
1436
- # cmd (1 + 2)
1437
- # See below the rationale about expr_endarg.
1438
- w_space+ e_lparen
1439
- => {
1440
- if version?(18)
1441
- emit(:tLPAREN2, '('.freeze, @te - 1, @te)
1442
- fnext expr_value; fbreak;
1443
- else
1444
- emit(:tLPAREN_ARG, '('.freeze, @te - 1, @te)
1445
- fnext expr_beg; fbreak;
1446
- end
1447
- };
1448
-
1449
- # meth(1 + 2)
1450
- # Regular method call.
1451
- e_lparen
1452
- => { emit(:tLPAREN2, '('.freeze)
1453
- fnext expr_beg; fbreak; };
1454
-
1455
- # meth [...]
1456
- # Array argument. Compare with indexing `meth[...]`.
1457
- w_space+ e_lbrack
1458
- => { emit(:tLBRACK, '['.freeze, @te - 1, @te)
1459
- fnext expr_beg; fbreak; };
1460
-
1461
- # cmd {}
1462
- # Command: method call without parentheses.
1463
- w_space* e_lbrace
1464
- => {
1465
- if @lambda_stack.last == @paren_nest
1466
- @lambda_stack.pop
1467
- emit(:tLAMBEG, '{'.freeze, @te - 1, @te)
1468
- else
1469
- emit(:tLCURLY, '{'.freeze, @te - 1, @te)
1470
- end
1471
- fnext expr_value; fbreak;
1472
- };
1473
-
1474
- #
1475
- # AMBIGUOUS TOKENS RESOLVED VIA EXPR_BEG
1476
- #
1477
-
1478
- # a??
1479
- # Ternary operator
1480
- '?' c_space_nl
1481
- => {
1482
- # Unlike expr_beg as invoked in the next rule, do not warn
1483
- p = @ts - 1
1484
- fgoto expr_end;
1485
- };
1486
-
1487
- # a ?b, a? ?
1488
- # Character literal or ternary operator
1489
- w_space* '?'
1490
- => { fhold; fgoto expr_beg; };
1491
-
1492
- # a %{1}, a %[1] (but not "a %=1=" or "a % foo")
1493
- # a /foo/ (but not "a / foo" or "a /=foo")
1494
- # a <<HEREDOC
1495
- w_space+ %{ tm = p }
1496
- ( [%/] ( c_any - c_space_nl - '=' ) # /
1497
- | '<<'
1498
- )
1499
- => {
1500
- if tok(tm, tm + 1) == '/'.freeze
1501
- # Ambiguous regexp literal.
1502
- diagnostic :warning, :ambiguous_literal, nil, range(tm, tm + 1)
1503
- end
1504
-
1505
- p = tm - 1
1506
- fgoto expr_beg;
1507
- };
1508
-
1509
- # x *1
1510
- # Ambiguous splat, kwsplat or block-pass.
1511
- w_space+ %{ tm = p } ( '+' | '-' | '*' | '&' | '**' )
1512
- => {
1513
- diagnostic :warning, :ambiguous_prefix, { :prefix => tok(tm, @te) },
1514
- range(tm, @te)
1515
-
1516
- p = tm - 1
1517
- fgoto expr_beg;
1518
- };
1519
-
1520
- # x ::Foo
1521
- # Ambiguous toplevel constant access.
1522
- w_space+ '::'
1523
- => { fhold; fhold; fgoto expr_beg; };
1524
-
1525
- # x:b
1526
- # Symbol.
1527
- w_space* ':'
1528
- => { fhold; fgoto expr_beg; };
1529
-
1530
- w_space+ label
1531
- => { p = @ts - 1; fgoto expr_beg; };
1532
-
1533
- #
1534
- # AMBIGUOUS TOKENS RESOLVED VIA EXPR_END
1535
- #
1536
-
1537
- # a ? b
1538
- # Ternary operator.
1539
- w_space+ %{ tm = p } '?' c_space_nl
1540
- => { p = tm - 1; fgoto expr_end; };
1541
-
1542
- # x + 1: Binary operator or operator-assignment.
1543
- w_space* operator_arithmetic
1544
- ( '=' | c_space_nl )? |
1545
- # x rescue y: Modifier keyword.
1546
- w_space* keyword_modifier |
1547
- # a &. b: Safe navigation operator.
1548
- w_space* '&.' |
1549
- # Miscellanea.
1550
- w_space* punctuation_end
1551
- => {
1552
- p = @ts - 1
1553
- fgoto expr_end;
1554
- };
1555
-
1556
- w_space;
1557
-
1558
- w_comment
1559
- => { fgoto expr_end; };
1560
-
1561
- w_newline
1562
- => { fhold; fgoto expr_end; };
1563
-
1564
- c_any
1565
- => { fhold; fgoto expr_beg; };
1566
-
1567
- c_eof => do_eof;
1568
- *|;
1569
-
1570
- # The previous token was an identifier which was seen while in the
1571
- # command mode (that is, the state at the beginning of #advance was
1572
- # expr_value). This state is very similar to expr_arg, but disambiguates
1573
- # two very rare and specific conditions:
1574
- # * In 1.8 mode, "foo (lambda do end)".
1575
- # * In 1.9+ mode, "f x: -> do foo do end end".
1576
- expr_cmdarg := |*
1577
- w_space+ e_lparen
1578
- => {
1579
- emit(:tLPAREN_ARG, '('.freeze, @te - 1, @te)
1580
- if version?(18)
1581
- fnext expr_value; fbreak;
1582
- else
1583
- fnext expr_beg; fbreak;
1584
- end
1585
- };
1586
-
1587
- w_space* 'do'
1588
- => {
1589
- if @cond.active?
1590
- emit(:kDO_COND, 'do'.freeze, @te - 2, @te)
1591
- else
1592
- emit(:kDO, 'do'.freeze, @te - 2, @te)
1593
- end
1594
- fnext expr_value; fbreak;
1595
- };
1596
-
1597
- c_any |
1598
- # Disambiguate with the `do' rule above.
1599
- w_space* bareword |
1600
- w_space* label
1601
- => { p = @ts - 1
1602
- fgoto expr_arg; };
1603
-
1604
- c_eof => do_eof;
1605
- *|;
1606
-
1607
- # The rationale for this state is pretty complex. Normally, if an argument
1608
- # is passed to a command and then there is a block (tLCURLY...tRCURLY),
1609
- # the block is attached to the innermost argument (`f` in `m f {}`), or it
1610
- # is a parse error (`m 1 {}`). But there is a special case for passing a single
1611
- # primary expression grouped with parentheses: if you write `m (1) {}` or
1612
- # (2.0 only) `m () {}`, then the block is attached to `m`.
1613
- #
1614
- # Thus, we recognize the opening `(` of a command (remember, a command is
1615
- # a method call without parens) as a tLPAREN_ARG; then, in parser, we recognize
1616
- # `tLPAREN_ARG expr rparen` as a `primary_expr` and before rparen, set the
1617
- # lexer's state to `expr_endarg`, which makes it emit the possibly following
1618
- # `{` as `tLBRACE_ARG`.
1619
- #
1620
- # The default post-`expr_endarg` state is `expr_end`, so this state also handles
1621
- # `do` (as `kDO_BLOCK` in `expr_beg`).
1622
- expr_endarg := |*
1623
- e_lbrace
1624
- => {
1625
- if @lambda_stack.last == @paren_nest
1626
- @lambda_stack.pop
1627
- emit(:tLAMBEG, '{'.freeze)
1628
- else
1629
- emit(:tLBRACE_ARG, '{'.freeze)
1630
- end
1631
- fnext expr_value;
1632
- };
1633
-
1634
- 'do'
1635
- => { emit_do(true)
1636
- fnext expr_value; fbreak; };
1637
-
1638
- w_space_comment;
1639
-
1640
- c_any
1641
- => { fhold; fgoto expr_end; };
1642
-
1643
- c_eof => do_eof;
1644
- *|;
1645
-
1646
- # The rationale for this state is that several keywords accept value
1647
- # (i.e. should transition to `expr_beg`), do not accept it like a command
1648
- # (i.e. not an `expr_arg`), and must behave like a statement, that is,
1649
- # accept a modifier if/while/etc.
1650
- #
1651
- expr_mid := |*
1652
- keyword_modifier
1653
- => { emit_table(KEYWORDS)
1654
- fnext expr_beg; fbreak; };
1655
-
1656
- bareword
1657
- => { p = @ts - 1; fgoto expr_beg; };
1658
-
1659
- w_space_comment;
1660
-
1661
- w_newline
1662
- => { fhold; fgoto expr_end; };
1663
-
1664
- c_any
1665
- => { fhold; fgoto expr_beg; };
1666
-
1667
- c_eof => do_eof;
1668
- *|;
1669
-
1670
- # Beginning of an expression.
1671
- #
1672
- # Don't fall through to this state from `c_any`; make sure to handle
1673
- # `c_space* c_nl` and let `expr_end` handle the newline.
1674
- # Otherwise code like `f\ndef x` gets glued together and the parser
1675
- # explodes.
1676
- #
1677
- expr_beg := |*
1678
- # +5, -5, - 5
1679
- [+\-] w_any* [0-9]
1680
- => {
1681
- emit(:tUNARY_NUM, tok(@ts, @ts + 1), @ts, @ts + 1)
1682
- fhold; fnext expr_end; fbreak;
1683
- };
1684
-
1685
- # splat *a
1686
- '*'
1687
- => { emit(:tSTAR, '*'.freeze)
1688
- fbreak; };
1689
-
1690
- #
1691
- # STRING AND REGEXP LITERALS
1692
- #
1693
-
1694
- # /regexp/oui
1695
- # /=/ (disambiguation with /=)
1696
- '/' c_any
1697
- => {
1698
- type = delimiter = tok[0].chr
1699
- fhold; fgoto *push_literal(type, delimiter, @ts);
1700
- };
1701
-
1702
- # %<string>
1703
- '%' ( any - [A-Za-z] )
1704
- => {
1705
- type, delimiter = @source_buffer.slice(@ts).chr, tok[-1].chr
1706
- fgoto *push_literal(type, delimiter, @ts);
1707
- };
1708
-
1709
- # %w(we are the people)
1710
- '%' [A-Za-z]+ c_any
1711
- => {
1712
- type, delimiter = tok[0..-2], tok[-1].chr
1713
- fgoto *push_literal(type, delimiter, @ts);
1714
- };
1715
-
1716
- '%' c_eof
1717
- => {
1718
- diagnostic :fatal, :string_eof, nil, range(@ts, @ts + 1)
1719
- };
1720
-
1721
- # Heredoc start.
1722
- # <<END | <<'END' | <<"END" | <<`END` |
1723
- # <<-END | <<-'END' | <<-"END" | <<-`END` |
1724
- # <<~END | <<~'END' | <<~"END" | <<~`END`
1725
- '<<' [~\-]?
1726
- ( '"' ( any - '"' )* '"'
1727
- | "'" ( any - "'" )* "'"
1728
- | "`" ( any - "`" )* "`"
1729
- | bareword ) % { heredoc_e = p }
1730
- c_line* c_nl % { new_herebody_s = p }
1731
- => {
1732
- tok(@ts, heredoc_e) =~ /^<<(-?)(~?)(["'`]?)(.*)\3$/m
1733
-
1734
- indent = !$1.empty? || !$2.empty?
1735
- dedent_body = !$2.empty?
1736
- type = $3.empty? ? '<<"'.freeze : ('<<'.freeze + $3)
1737
- delimiter = $4
1738
-
1739
- if @version >= 24
1740
- if delimiter.count("\n") > 0
1741
- if delimiter.end_with?("\n")
1742
- diagnostic :warning, :heredoc_id_ends_with_nl, nil, range(@ts, @ts + 1)
1743
- delimiter = delimiter.rstrip
1744
- else
1745
- diagnostic :fatal, :heredoc_id_has_newline, nil, range(@ts, @ts + 1)
1746
- end
1747
- end
1748
- end
1749
-
1750
- if dedent_body && version?(18, 19, 20, 21, 22)
1751
- emit(:tLSHFT, '<<'.freeze, @ts, @ts + 2)
1752
- p = @ts + 1
1753
- fnext expr_beg; fbreak;
1754
- else
1755
- fnext *push_literal(type, delimiter, @ts, heredoc_e, indent, dedent_body);
1756
-
1757
- @herebody_s ||= new_herebody_s
1758
- p = @herebody_s - 1
1759
- end
1760
- };
1761
-
1762
- #
1763
- # SYMBOL LITERALS
1764
- #
1765
-
1766
- # :&&, :||
1767
- ':' ('&&' | '||') => {
1768
- fhold; fhold;
1769
- emit(:tSYMBEG, tok(@ts, @ts + 1), @ts, @ts + 1)
1770
- fgoto expr_fname;
1771
- };
1772
-
1773
- # :"bar", :'baz'
1774
- ':' ['"] # '
1775
- => {
1776
- type, delimiter = tok, tok[-1].chr
1777
- fgoto *push_literal(type, delimiter, @ts);
1778
- };
1779
-
1780
- # :!@ is :!
1781
- # :~@ is :~
1782
- ':' [!~] '@'
1783
- => {
1784
- emit(:tSYMBOL, tok(@ts + 1, @ts + 2))
1785
- fnext expr_end; fbreak;
1786
- };
1787
-
1788
- ':' bareword ambiguous_symbol_suffix
1789
- => {
1790
- emit(:tSYMBOL, tok(@ts + 1, tm), @ts, tm)
1791
- p = tm - 1
1792
- fnext expr_end; fbreak;
1793
- };
1794
-
1795
- ':' ( bareword | global_var | class_var | instance_var |
1796
- operator_fname | operator_arithmetic | operator_rest )
1797
- => {
1798
- emit(:tSYMBOL, tok(@ts + 1), @ts)
1799
- fnext expr_end; fbreak;
1800
- };
1801
-
1802
- #
1803
- # AMBIGUOUS TERNARY OPERATOR
1804
- #
1805
-
1806
- # Character constant, like ?a, ?\n, ?\u1000, and so on
1807
- # Don't accept \u escape with multiple codepoints, like \u{1 2 3}
1808
- '?' ( e_bs ( escape - ( '\u{' (xdigit+ [ \t]+)+ xdigit+ '}' ))
1809
- | (c_any - c_space_nl - e_bs) % { @escape = nil }
1810
- )
1811
- => {
1812
- value = @escape || tok(@ts + 1)
1813
-
1814
- if version?(18)
1815
- emit(:tINTEGER, value.getbyte(0))
1816
- else
1817
- emit(:tCHARACTER, value)
1818
- end
1819
-
1820
- fnext expr_end; fbreak;
1821
- };
1822
-
1823
- '?' c_space_nl
1824
- => {
1825
- escape = { " " => '\s', "\r" => '\r', "\n" => '\n', "\t" => '\t',
1826
- "\v" => '\v', "\f" => '\f' }[@source_buffer.slice(@ts + 1)]
1827
- diagnostic :warning, :invalid_escape_use, { :escape => escape }, range
1828
-
1829
- p = @ts - 1
1830
- fgoto expr_end;
1831
- };
1832
-
1833
- '?' c_eof
1834
- => {
1835
- diagnostic :fatal, :incomplete_escape, nil, range(@ts, @ts + 1)
1836
- };
1837
-
1838
- # f ?aa : b: Disambiguate with a character literal.
1839
- '?' [A-Za-z_] bareword
1840
- => {
1841
- p = @ts - 1
1842
- fgoto expr_end;
1843
- };
1844
-
1845
- #
1846
- # KEYWORDS AND PUNCTUATION
1847
- #
1848
-
1849
- # a({b=>c})
1850
- e_lbrace
1851
- => {
1852
- if @lambda_stack.last == @paren_nest
1853
- @lambda_stack.pop
1854
- emit(:tLAMBEG, '{'.freeze)
1855
- else
1856
- emit(:tLBRACE, '{'.freeze)
1857
- end
1858
- fbreak;
1859
- };
1860
-
1861
- # a([1, 2])
1862
- e_lbrack
1863
- => { emit(:tLBRACK, '['.freeze)
1864
- fbreak; };
1865
-
1866
- # a()
1867
- e_lparen
1868
- => { emit(:tLPAREN, '('.freeze)
1869
- fbreak; };
1870
-
1871
- # a(+b)
1872
- punctuation_begin
1873
- => { emit_table(PUNCTUATION_BEGIN)
1874
- fbreak; };
1875
-
1876
- # rescue Exception => e: Block rescue.
1877
- # Special because it should transition to expr_mid.
1878
- 'rescue' %{ tm = p } '=>'?
1879
- => { emit(:kRESCUE, 'rescue'.freeze, @ts, tm)
1880
- p = tm - 1
1881
- fnext expr_mid; fbreak; };
1882
-
1883
- # if a: Statement if.
1884
- keyword_modifier
1885
- => { emit_table(KEYWORDS_BEGIN)
1886
- fnext expr_value; fbreak; };
1887
-
1888
- #
1889
- # RUBY 1.9 HASH LABELS
1890
- #
1891
-
1892
- label ( any - ':' )
1893
- => {
1894
- fhold;
1895
-
1896
- if version?(18)
1897
- ident = tok(@ts, @te - 2)
1898
-
1899
- emit((@source_buffer.slice(@ts) =~ /[A-Z]/) ? :tCONSTANT : :tIDENTIFIER,
1900
- ident, @ts, @te - 2)
1901
- fhold; # continue as a symbol
1902
-
1903
- if !@static_env.nil? && @static_env.declared?(ident)
1904
- fnext expr_end;
1905
- else
1906
- fnext *arg_or_cmdarg;
1907
- end
1908
- else
1909
- emit(:tLABEL, tok(@ts, @te - 2), @ts, @te - 1)
1910
- fnext expr_labelarg;
1911
- end
1912
-
1913
- fbreak;
1914
- };
1915
-
1916
- #
1917
- # CONTEXT-DEPENDENT VARIABLE LOOKUP OR COMMAND INVOCATION
1918
- #
1919
-
1920
- # foo= bar: Disambiguate with bareword rule below.
1921
- bareword ambiguous_ident_suffix |
1922
- # def foo: Disambiguate with bareword rule below.
1923
- keyword
1924
- => { p = @ts - 1
1925
- fgoto expr_end; };
1926
-
1927
- # a = 42; a [42]: Indexing.
1928
- # def a; end; a [42]: Array argument.
1929
- call_or_var
1930
- => local_ident;
1931
-
1932
- (call_or_var - keyword)
1933
- % { ident_tok = tok; ident_ts = @ts; ident_te = @te; }
1934
- w_space+ '('
1935
- => {
1936
- emit(:tIDENTIFIER, ident_tok, ident_ts, ident_te)
1937
- p = ident_te - 1
1938
-
1939
- if !@static_env.nil? && @static_env.declared?(ident_tok) && @version < 25
1940
- fnext expr_endfn;
1941
- else
1942
- fnext expr_cmdarg;
1943
- end
1944
- fbreak;
1945
- };
1946
-
1947
- #
1948
- # WHITESPACE
1949
- #
1950
-
1951
- w_any;
1952
-
1953
- e_heredoc_nl '=begin' ( c_space | c_nl_zlen )
1954
- => {
1955
- p = @ts - 1
1956
- @cs_before_block_comment = @cs
1957
- fgoto line_begin;
1958
- };
1959
-
1960
- #
1961
- # DEFAULT TRANSITION
1962
- #
1963
-
1964
- # The following rules match most binary and all unary operators.
1965
- # Rules for binary operators provide better error reporting.
1966
- operator_arithmetic '=' |
1967
- operator_rest |
1968
- punctuation_end |
1969
- c_any
1970
- => { p = @ts - 1; fgoto expr_end; };
1971
-
1972
- c_eof => do_eof;
1973
- *|;
1974
-
1975
- # Special newline handling for "def a b:"
1976
- #
1977
- expr_labelarg := |*
1978
- w_space_comment;
1979
-
1980
- w_newline
1981
- => {
1982
- if @in_kwarg
1983
- fhold; fgoto expr_end;
1984
- else
1985
- fgoto line_begin;
1986
- end
1987
- };
1988
-
1989
- c_any
1990
- => { fhold; fgoto expr_beg; };
1991
-
1992
- c_eof => do_eof;
1993
- *|;
1994
-
1995
- # Like expr_beg, but no 1.9 label or 2.2 quoted label possible.
1996
- #
1997
- expr_value := |*
1998
- # a:b: a(:b), a::B, A::B
1999
- label (any - ':')
2000
- => { p = @ts - 1
2001
- fgoto expr_end; };
2002
-
2003
- # "bar", 'baz'
2004
- ['"] # '
2005
- => {
2006
- fgoto *push_literal(tok, tok, @ts);
2007
- };
2008
-
2009
- w_space_comment;
2010
-
2011
- w_newline
2012
- => { fgoto line_begin; };
2013
-
2014
- c_any
2015
- => { fhold; fgoto expr_beg; };
2016
-
2017
- c_eof => do_eof;
2018
- *|;
2019
-
2020
- expr_end := |*
2021
- #
2022
- # STABBY LAMBDA
2023
- #
2024
-
2025
- '->'
2026
- => {
2027
- emit(:tLAMBDA, '->'.freeze, @ts, @ts + 2)
2028
-
2029
- @lambda_stack.push @paren_nest
2030
- fnext expr_endfn; fbreak;
2031
- };
2032
-
2033
- e_lbrace | 'do'
2034
- => {
2035
- if @lambda_stack.last == @paren_nest
2036
- @lambda_stack.pop
2037
-
2038
- if tok == '{'.freeze
2039
- emit(:tLAMBEG, '{'.freeze)
2040
- else # 'do'
2041
- emit(:kDO_LAMBDA, 'do'.freeze)
2042
- end
2043
- else
2044
- if tok == '{'.freeze
2045
- emit(:tLCURLY, '{'.freeze)
2046
- else # 'do'
2047
- emit_do
2048
- end
2049
- end
2050
-
2051
- fnext expr_value; fbreak;
2052
- };
2053
-
2054
- #
2055
- # KEYWORDS
2056
- #
2057
-
2058
- keyword_with_fname
2059
- => { emit_table(KEYWORDS)
2060
- fnext expr_fname; fbreak; };
2061
-
2062
- 'class' w_any* '<<'
2063
- => { emit(:kCLASS, 'class'.freeze, @ts, @ts + 5)
2064
- emit(:tLSHFT, '<<'.freeze, @te - 2, @te)
2065
- fnext expr_value; fbreak; };
2066
-
2067
- # a if b:c: Syntax error.
2068
- keyword_modifier
2069
- => { emit_table(KEYWORDS)
2070
- fnext expr_beg; fbreak; };
2071
-
2072
- # elsif b:c: elsif b(:c)
2073
- keyword_with_value
2074
- => { emit_table(KEYWORDS)
2075
- fnext expr_value; fbreak; };
2076
-
2077
- keyword_with_mid
2078
- => { emit_table(KEYWORDS)
2079
- fnext expr_mid; fbreak; };
2080
-
2081
- keyword_with_arg
2082
- => {
2083
- emit_table(KEYWORDS)
2084
-
2085
- if version?(18) && tok == 'not'.freeze
2086
- fnext expr_beg; fbreak;
2087
- else
2088
- fnext expr_arg; fbreak;
2089
- end
2090
- };
2091
-
2092
- '__ENCODING__'
2093
- => {
2094
- if version?(18)
2095
- emit(:tIDENTIFIER)
2096
-
2097
- unless !@static_env.nil? && @static_env.declared?(tok)
2098
- fnext *arg_or_cmdarg;
2099
- end
2100
- else
2101
- emit(:k__ENCODING__, '__ENCODING__'.freeze)
2102
- end
2103
- fbreak;
2104
- };
2105
-
2106
- keyword_with_end
2107
- => { emit_table(KEYWORDS)
2108
- fbreak; };
2109
-
2110
- #
2111
- # NUMERIC LITERALS
2112
- #
2113
-
2114
- ( '0' [Xx] %{ @num_base = 16; @num_digits_s = p } int_hex
2115
- | '0' [Dd] %{ @num_base = 10; @num_digits_s = p } int_dec
2116
- | '0' [Oo] %{ @num_base = 8; @num_digits_s = p } int_dec
2117
- | '0' [Bb] %{ @num_base = 2; @num_digits_s = p } int_bin
2118
- | [1-9] digit* '_'? %{ @num_base = 10; @num_digits_s = @ts } int_dec
2119
- | '0' digit* '_'? %{ @num_base = 8; @num_digits_s = @ts } int_dec
2120
- ) %{ @num_suffix_s = p } int_suffix
2121
- => {
2122
- digits = tok(@num_digits_s, @num_suffix_s)
2123
-
2124
- if digits.end_with? '_'.freeze
2125
- diagnostic :error, :trailing_in_number, { :character => '_'.freeze },
2126
- range(@te - 1, @te)
2127
- elsif digits.empty? && @num_base == 8 && version?(18)
2128
- # 1.8 did not raise an error on 0o.
2129
- digits = '0'.freeze
2130
- elsif digits.empty?
2131
- diagnostic :error, :empty_numeric
2132
- elsif @num_base == 8 && (invalid_idx = digits.index(/[89]/))
2133
- invalid_s = @num_digits_s + invalid_idx
2134
- diagnostic :error, :invalid_octal, nil,
2135
- range(invalid_s, invalid_s + 1)
2136
- end
2137
-
2138
- if version?(18, 19, 20)
2139
- emit(:tINTEGER, digits.to_i(@num_base), @ts, @num_suffix_s)
2140
- p = @num_suffix_s - 1
2141
- else
2142
- @num_xfrm.call(digits.to_i(@num_base))
2143
- end
2144
- fbreak;
2145
- };
2146
-
2147
- flo_frac flo_pow?
2148
- => {
2149
- diagnostic :error, :no_dot_digit_literal
2150
- };
2151
-
2152
- flo_int [eE]
2153
- => {
2154
- if version?(18, 19, 20)
2155
- diagnostic :error,
2156
- :trailing_in_number, { :character => tok(@te - 1, @te) },
2157
- range(@te - 1, @te)
2158
- else
2159
- emit(:tINTEGER, tok(@ts, @te - 1).to_i, @ts, @te - 1)
2160
- fhold; fbreak;
2161
- end
2162
- };
2163
-
2164
- flo_int flo_frac [eE]
2165
- => {
2166
- if version?(18, 19, 20)
2167
- diagnostic :error,
2168
- :trailing_in_number, { :character => tok(@te - 1, @te) },
2169
- range(@te - 1, @te)
2170
- else
2171
- emit(:tFLOAT, tok(@ts, @te - 1).to_f, @ts, @te - 1)
2172
- fhold; fbreak;
2173
- end
2174
- };
2175
-
2176
- flo_int
2177
- ( flo_frac? flo_pow %{ @num_suffix_s = p } flo_pow_suffix
2178
- | flo_frac %{ @num_suffix_s = p } flo_suffix
2179
- )
2180
- => {
2181
- digits = tok(@ts, @num_suffix_s)
2182
-
2183
- if version?(18, 19, 20)
2184
- emit(:tFLOAT, Float(digits), @ts, @num_suffix_s)
2185
- p = @num_suffix_s - 1
2186
- else
2187
- @num_xfrm.call(digits)
2188
- end
2189
- fbreak;
2190
- };
2191
-
2192
- #
2193
- # STRING AND XSTRING LITERALS
2194
- #
2195
-
2196
- # `echo foo`, "bar", 'baz'
2197
- '`' | ['"] # '
2198
- => {
2199
- type, delimiter = tok, tok[-1].chr
2200
- fgoto *push_literal(type, delimiter, @ts, nil, false, false, true);
2201
- };
2202
-
2203
- #
2204
- # CONSTANTS AND VARIABLES
2205
- #
2206
-
2207
- constant
2208
- => { emit(:tCONSTANT)
2209
- fnext *arg_or_cmdarg; fbreak; };
2210
-
2211
- constant ambiguous_const_suffix
2212
- => { emit(:tCONSTANT, tok(@ts, tm), @ts, tm)
2213
- p = tm - 1; fbreak; };
2214
-
2215
- global_var | class_var_v | instance_var_v
2216
- => { p = @ts - 1; fcall expr_variable; };
2217
-
2218
- #
2219
- # METHOD CALLS
2220
- #
2221
-
2222
- '.' | '&.' | '::'
2223
- => { emit_table(PUNCTUATION)
2224
- fnext expr_dot; fbreak; };
2225
-
2226
- call_or_var
2227
- => local_ident;
2228
-
2229
- bareword ambiguous_fid_suffix
2230
- => {
2231
- if tm == @te
2232
- # Suffix was consumed, e.g. foo!
2233
- emit(:tFID)
2234
- else
2235
- # Suffix was not consumed, e.g. foo!=
2236
- emit(:tIDENTIFIER, tok(@ts, tm), @ts, tm)
2237
- p = tm - 1
2238
- end
2239
- fnext expr_arg; fbreak;
2240
- };
2241
-
2242
- #
2243
- # OPERATORS
2244
- #
2245
-
2246
- # When '|', '~', '!', '=>' are used as operators
2247
- # they do not accept any symbols (or quoted labels) after.
2248
- # Other binary operators accept it.
2249
- ( operator_arithmetic | operator_rest ) - ( '|' | '~' | '!' )
2250
- => {
2251
- emit_table(PUNCTUATION);
2252
- fnext expr_value; fbreak;
2253
- };
2254
-
2255
- ( e_lparen | '|' | '~' | '!' )
2256
- => { emit_table(PUNCTUATION)
2257
- fnext expr_beg; fbreak; };
2258
-
2259
- e_rbrace | e_rparen | ']'
2260
- => {
2261
- emit_table(PUNCTUATION)
2262
-
2263
- if @version < 24
2264
- @cond.lexpop
2265
- @cmdarg.lexpop
2266
- else
2267
- @cond.pop
2268
- @cmdarg.pop
2269
- end
2270
-
2271
- if tok == '}'.freeze || tok == ']'.freeze
2272
- if @version >= 25
2273
- fnext expr_end;
2274
- else
2275
- fnext expr_endarg;
2276
- end
2277
- else # )
2278
- # fnext expr_endfn; ?
2279
- end
2280
-
2281
- fbreak;
2282
- };
2283
-
2284
- operator_arithmetic '='
2285
- => { emit(:tOP_ASGN, tok(@ts, @te - 1))
2286
- fnext expr_beg; fbreak; };
2287
-
2288
- '?'
2289
- => { emit(:tEH, '?'.freeze)
2290
- fnext expr_value; fbreak; };
2291
-
2292
- e_lbrack
2293
- => { emit(:tLBRACK2, '['.freeze)
2294
- fnext expr_beg; fbreak; };
2295
-
2296
- punctuation_end
2297
- => { emit_table(PUNCTUATION)
2298
- fnext expr_beg; fbreak; };
2299
-
2300
- #
2301
- # WHITESPACE
2302
- #
2303
-
2304
- w_space_comment;
2305
-
2306
- w_newline
2307
- => { fgoto leading_dot; };
2308
-
2309
- ';'
2310
- => { emit(:tSEMI, ';'.freeze)
2311
- fnext expr_value; fbreak; };
2312
-
2313
- '\\' c_line {
2314
- diagnostic :error, :bare_backslash, nil, range(@ts, @ts + 1)
2315
- fhold;
2316
- };
2317
-
2318
- c_any
2319
- => {
2320
- diagnostic :fatal, :unexpected, { :character => tok.inspect[1..-2] }
2321
- };
2322
-
2323
- c_eof => do_eof;
2324
- *|;
2325
-
2326
- leading_dot := |*
2327
- # Insane leading dots:
2328
- # a #comment
2329
- # .b: a.b
2330
- c_space* %{ tm = p } ('.' | '&.')
2331
- => { p = tm - 1; fgoto expr_end; };
2332
-
2333
- any
2334
- => { emit(:tNL, nil, @newline_s, @newline_s + 1)
2335
- fhold; fnext line_begin; fbreak; };
2336
- *|;
2337
-
2338
- #
2339
- # === EMBEDDED DOCUMENT (aka BLOCK COMMENT) PARSING ===
2340
- #
2341
-
2342
- line_comment := |*
2343
- '=end' c_line* c_nl_zlen
2344
- => {
2345
- emit_comment(@eq_begin_s, @te)
2346
- fgoto *@cs_before_block_comment;
2347
- };
2348
-
2349
- c_line* c_nl;
2350
-
2351
- c_line* zlen
2352
- => {
2353
- diagnostic :fatal, :embedded_document, nil,
2354
- range(@eq_begin_s, @eq_begin_s + '=begin'.length)
2355
- };
2356
- *|;
2357
-
2358
- line_begin := |*
2359
- w_any;
2360
-
2361
- '=begin' ( c_space | c_nl_zlen )
2362
- => { @eq_begin_s = @ts
2363
- fgoto line_comment; };
2364
-
2365
- '__END__' ( c_eol - zlen )
2366
- => { p = pe - 3 };
2367
-
2368
- c_any
2369
- => { fhold; fgoto expr_value; };
2370
-
2371
- c_eof => do_eof;
2372
- *|;
2373
-
2374
- }%%
2375
- # %
2376
- end