ruby-next-parser 2.8.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,2556 @@
1
+ %%machine lex; # % fix highlighting
2
+
3
+ #
4
+ # === BEFORE YOU START ===
5
+ #
6
+ # Read the Ruby Hacking Guide chapter 11, available in English at
7
+ # http://whitequark.org/blog/2013/04/01/ruby-hacking-guide-ch-11-finite-state-lexer/
8
+ #
9
+ # Remember two things about Ragel scanners:
10
+ #
11
+ # 1) Longest match wins.
12
+ #
13
+ # 2) If two matches have the same length, the first
14
+ # in source code wins.
15
+ #
16
+ # General rules of making Ragel and Bison happy:
17
+ #
18
+ # * `p` (position) and `@te` contain the index of the character
19
+ # they're pointing to ("current"), plus one. `@ts` contains the index
20
+ # of the corresponding character. The code for extracting matched token is:
21
+ #
22
+ # @source_buffer.slice(@ts...@te)
23
+ #
24
+ # * If your input is `foooooooobar` and the rule is:
25
+ #
26
+ # 'f' 'o'+
27
+ #
28
+ # the result will be:
29
+ #
30
+ # foooooooobar
31
+ # ^ ts=0 ^ p=te=9
32
+ #
33
+ # * A Ragel lexer action should not emit more than one token, unless
34
+ # you know what you are doing.
35
+ #
36
+ # * All Ragel commands (fnext, fgoto, ...) end with a semicolon.
37
+ #
38
+ # * If an action emits the token and transitions to another state, use
39
+ # these Ragel commands:
40
+ #
41
+ # emit($whatever)
42
+ # fnext $next_state; fbreak;
43
+ #
44
+ # If you perform `fgoto` in an action which neither emits a token nor
45
+ # rewinds the stream pointer, the parser's side-effectful,
46
+ # context-sensitive lookahead actions will break in a hard to detect
47
+ # and debug way.
48
+ #
49
+ # * If an action does not emit a token:
50
+ #
51
+ # fgoto $next_state;
52
+ #
53
+ # * If an action features lookbehind, i.e. matches characters with the
54
+ # intent of passing them to another action:
55
+ #
56
+ # p = @ts - 1
57
+ # fgoto $next_state;
58
+ #
59
+ # or, if the lookbehind consists of a single character:
60
+ #
61
+ # fhold; fgoto $next_state;
62
+ #
63
+ # * Ragel merges actions. So, if you have `e_lparen = '(' %act` and
64
+ # `c_lparen = '('` and a lexer action `e_lparen | c_lparen`, the result
65
+ # _will_ invoke the action `act`.
66
+ #
67
+ # e_something stands for "something with **e**mbedded action".
68
+ #
69
+ # * EOF is explicit and is matched by `c_eof`. If you want to introspect
70
+ # the state of the lexer, add this rule to the state:
71
+ #
72
+ # c_eof => do_eof;
73
+ #
74
+ # * If you proceed past EOF, the lexer will complain:
75
+ #
76
+ # NoMethodError: undefined method `ord' for nil:NilClass
77
+ #
78
+
79
+ class Parser::Lexer
80
+ class Next
81
+
82
+ %% write data nofinal;
83
+ # %
84
+
85
# Simple backslash escapes: maps the codepoint of the character that
# follows the backslash to the string the escape sequence expands to.
ESCAPES = [
  ['a', "\a"], ['b', "\b"], ['e', "\e"], ['f', "\f"],
  ['n', "\n"], ['r', "\r"], ['s', "\s"], ['t', "\t"],
  ['v', "\v"], ['\\', "\\"]
].each_with_object({}) { |(letter, expansion), table|
  table[letter.ord] = expansion
}.freeze

# Matches any one of the single characters listed below.
REGEXP_META_CHARACTERS = Regexp.union(
  '\\', '$', '(', ')', '*', '+', '.', '<', '>', '?',
  '[', ']', '^', '{', '|', '}'
).freeze
92
+
93
# Read-only here; the writer is defined by hand as #source_buffer=.
attr_reader :source_buffer

# Collaborators and switches assigned from outside the lexer.
attr_accessor :diagnostics, :static_env, :force_utf32

# Lexer state exposed for reading and writing.
attr_accessor :cond, :cmdarg, :in_kwarg, :context, :command_start

# Optional sinks: #emit and #emit_comment push into these when they are set.
attr_accessor :tokens, :comments
102
+
103
# Stores the Ruby version this lexer targets, clears the externally
# assigned collaborators, and puts the lexer into its initial state
# via #reset.
def initialize(version)
  @version = version

  @static_env = @context = nil
  @tokens = @comments = nil

  reset
end
113
+
114
# Puts the lexer back into its initial state.
#
# When reset_state is false, the Ragel current state and the cond/cmdarg
# stacks are left alone (unit tests set state prior to resetting the
# lexer); everything else is reinitialized either way.
def reset(reset_state=true)
  if reset_state
    # Ragel state:
    @cs = self.class.lex_en_line_begin

    @cond, @cmdarg = StackState.new('cond'), StackState.new('cmdarg')
    @cond_stack, @cmdarg_stack = [], []
  end

  # Set to true by some tests.
  @force_utf32 = false

  # @source as a codepoint array.
  @source_pts = nil

  @p   = 0   # stream position (saved manually in #advance)
  @ts  = nil # token start
  @te  = nil # token end
  @act = 0   # next action

  @stack = [] # state stack
  @top   = 0  # state stack top pointer

  # Lexer state:
  @token_queue   = []
  @literal_stack = []

  @eq_begin_s = nil # location of last encountered =begin
  @sharp_s    = nil # location of last encountered #
  @newline_s  = nil # location of last encountered newline

  @num_base     = nil # last numeric base
  @num_digits_s = nil # starting position of numeric digits
  @num_suffix_s = nil # starting position of numeric suffix
  @num_xfrm     = nil # numeric suffix-induced transformation

  @escape_s = nil # starting position of current sequence
  @escape   = nil # last escaped sequence, as string

  @herebody_s = nil # starting position of current heredoc line

  # Ruby 1.9 ->() lambdas emit a distinct token if do/{ is
  # encountered after a matching closing parenthesis.
  @paren_nest   = 0
  @lambda_stack = []

  # After encountering the closing line of <<~SQUIGGLY_HEREDOC,
  # we store the indentation level and give it out to the parser
  # on request. It is not possible to infer indentation level just
  # from the AST because escape sequences such as `\ ` or `\t` are
  # expanded inside the lexer, but count as non-whitespace for
  # indentation purposes.
  @dedent_level = nil

  # If the lexer is in `command state' (aka expr_value)
  # at the entry to #advance, it will transition to expr_cmdarg
  # instead of expr_arg at certain points.
  @command_start = true

  # True at the end of "def foo a:"
  @in_kwarg = false

  # State before =begin / =end block comment
  @cs_before_block_comment = self.class.lex_en_line_begin
end
181
+
182
# Assigns the buffer to lex and caches its source as an array of
# integers in @source_pts: codepoints when the source is UTF-8 encoded,
# raw bytes otherwise. Assigning nil clears the cached array.
def source_buffer=(source_buffer)
  @source_buffer = source_buffer

  unless @source_buffer
    @source_pts = nil
    return
  end

  text = @source_buffer.source
  directive = text.encoding == Encoding::UTF_8 ? 'U*' : 'C*'
  @source_pts = text.unpack(directive)

  # Skip byte order mark.
  @p = 1 if @source_pts[0] == 0xfeff
end
202
+
203
# Encoding of the source string held by the current source buffer.
def encoding
  source_text = @source_buffer.source
  source_text.encoding
end
206
+
207
# Symbolic names for the Ragel entry points that can be read and assigned
# through #state and #state=.
#
# Every name maps to its own machine. In particular :plain_words maps to
# the plain_words machine (the one #next_state_for_literal selects for
# non-interpolated word literals), so that #state= enters the machine the
# name says and the reverse lookup in #state stays unambiguous.
LEX_STATES = {
  :line_begin    => lex_en_line_begin,
  :expr_dot      => lex_en_expr_dot,
  :expr_fname    => lex_en_expr_fname,
  :expr_value    => lex_en_expr_value,
  :expr_beg      => lex_en_expr_beg,
  :expr_mid      => lex_en_expr_mid,
  :expr_arg      => lex_en_expr_arg,
  :expr_cmdarg   => lex_en_expr_cmdarg,
  :expr_end      => lex_en_expr_end,
  :expr_endarg   => lex_en_expr_endarg,
  :expr_endfn    => lex_en_expr_endfn,
  :expr_labelarg => lex_en_expr_labelarg,

  :interp_string => lex_en_interp_string,
  :interp_words  => lex_en_interp_words,
  :plain_string  => lex_en_plain_string,
  :plain_words   => lex_en_plain_words,
}
226
+
227
# Current lexer state as a LEX_STATES name; when the current Ragel state
# has no name in LEX_STATES, the raw @cs value is returned instead.
def state
  names_by_entry = LEX_STATES.invert
  names_by_entry.fetch(@cs) { @cs }
end
230
+
231
# Switches the lexer to the state named by the given LEX_STATES key.
# Raises KeyError for a name that is not in LEX_STATES.
def state=(state)
  machine_entry = LEX_STATES.fetch(state)
  @cs = machine_entry
end
234
+
235
# Saves the current cmdarg state on @cmdarg_stack and replaces it with a
# fresh StackState named after the new stack depth.
def push_cmdarg
  @cmdarg_stack << @cmdarg
  depth = @cmdarg_stack.count
  @cmdarg = StackState.new("cmdarg.#{depth}")
end
239
+
240
# Restores the cmdarg state most recently saved by #push_cmdarg.
def pop_cmdarg
  restored = @cmdarg_stack.pop
  @cmdarg = restored
end
243
+
244
# Saves the current cond state on @cond_stack and replaces it with a
# fresh StackState named after the new stack depth.
def push_cond
  @cond_stack << @cond
  depth = @cond_stack.count
  @cond = StackState.new("cond.#{depth}")
end
248
+
249
# Restores the cond state most recently saved by #push_cond.
def pop_cond
  restored = @cond_stack.pop
  @cond = restored
end
252
+
253
# Returns the stored dedent level and clears it, so that a stale value
# cannot accidentally be used by a later call.
def dedent_level
  level = @dedent_level
  @dedent_level = nil
  level
end
259
+
260
# Return next token: [type, value].
#
# Tokens already sitting in the queue are handed out first. Otherwise the
# Ragel-generated scanner is run from the saved stream position; if that
# leaves the queue empty, an `$error` or `$eof` pseudo-token is returned.
def advance
  return @token_queue.shift if @token_queue.any?

  # Ugly, but dependent on Ragel output. Consider refactoring it somehow.
  # The local names below are referenced by the generated scanner code
  # and must not be renamed.
  klass = self.class
  _lex_trans_keys         = klass.send :_lex_trans_keys
  _lex_key_spans          = klass.send :_lex_key_spans
  _lex_index_offsets      = klass.send :_lex_index_offsets
  _lex_indicies           = klass.send :_lex_indicies
  _lex_trans_targs        = klass.send :_lex_trans_targs
  _lex_trans_actions      = klass.send :_lex_trans_actions
  _lex_to_state_actions   = klass.send :_lex_to_state_actions
  _lex_from_state_actions = klass.send :_lex_from_state_actions
  _lex_eof_trans          = klass.send :_lex_eof_trans

  pe  = @source_pts.size + 2
  p   = @p
  eof = pe

  # Remember whether we entered in command state, then clear the flag.
  cmd_state = @command_start
  @command_start = false

  %% write exec;
  # %

  # Save the stream position for the next call.
  @p = p

  return @token_queue.shift if @token_queue.any?

  if @cs == klass.lex_error
    [false, ['$error'.freeze, range(p - 1, p)]]
  else
    eof = @source_pts.size
    [false, ['$eof'.freeze, range(eof, eof)]]
  end
end
298
+
299
+ protected
300
+
301
# True when the codepoint is one of 0x04, 0x1a or 0x00, the three values
# the scanner treats as end of input.
def eof_codepoint?(point)
  case point
  when 0x04, 0x1a, 0x00 then true
  else false
  end
end
304
+
305
# True when the lexer's configured version equals any of the given ones.
def version?(*versions)
  versions.any? { |candidate| candidate == @version }
end
308
+
309
# Moves the state-stack top pointer down by one and returns the entry it
# now points at. The entry itself is left in @stack.
def stack_pop
  @stack[@top -= 1]
end
313
+
314
# Takes `ord.chr` and relabels the result with the encoding of the
# source being lexed; the underlying bytes are not converted.
def encode_escape(ord)
  char = ord.chr
  char.force_encoding(@source_buffer.source.encoding)
end
317
+
318
# Source text between positions s (inclusive) and e (exclusive);
# defaults to the span of the current token.
def tok(s = @ts, e = @te)
  span = s...e
  @source_buffer.slice(span)
end
321
+
322
# Builds a Parser::Source::Range over the current source buffer from s
# to e; defaults to the span of the current token.
def range(s = @ts, e = @te)
  buffer = @source_buffer
  Parser::Source::Range.new(buffer, s, e)
end
325
+
326
# Builds the token [type, [value, range]] and appends it to the token
# queue, and to @tokens as well when that sink is set.
# Returns the token.
def emit(type, value = tok, s = @ts, e = @te)
  location = range(s, e)
  token = [type, [value, location]]

  @token_queue << token
  @tokens << token if @tokens

  token
end
335
+
336
# Emits the text between s and e, using that text as the key into the
# given table to find the token type.
def emit_table(table, s = @ts, e = @te)
  text = tok(s, e)
  type = table[text]

  emit(type, text, s, e)
end
341
+
342
# Emits the `do` keyword with a context-dependent token type:
# kDO_COND while @cond is active, kDO_BLOCK while @cmdarg is active or
# when do_block is set, and plain kDO otherwise.
def emit_do(do_block=false)
  type =
    if @cond.active?
      :kDO_COND
    elsif @cmdarg.active? || do_block
      :kDO_BLOCK
    else
      :kDO
    end

  emit(type, 'do'.freeze)
end
351
+
352
# Picks the expr_cmdarg entry point when cmd_state is truthy and the
# expr_arg entry point otherwise.
def arg_or_cmdarg(cmd_state)
  cmd_state ? self.class.lex_en_expr_cmdarg : self.class.lex_en_expr_arg
end
359
+
360
# Records the comment spanning s...e: as a Parser::Source::Comment in
# @comments and as a :tCOMMENT token in @tokens, each only when that
# sink is set. Nothing is added to the token queue. Always returns nil.
def emit_comment(s = @ts, e = @te)
  @comments << Parser::Source::Comment.new(range(s, e)) if @comments
  @tokens << [:tCOMMENT, [tok(s, e), range(s, e)]] if @tokens

  nil
end
371
+
372
# Builds a Parser::Diagnostic from the arguments and hands it to the
# diagnostics engine. The location defaults to the current token's range.
def diagnostic(type, reason, arguments=nil, location=range, highlights=[])
  report = Parser::Diagnostic.new(type, reason, arguments, location, highlights)
  @diagnostics.process(report)
end
376
+
377
+ #
378
+ # === LITERAL STACK ===
379
+ #
380
+
381
# Creates a Literal from the given arguments (with this lexer passed as
# its first argument), pushes it on the literal stack, and returns the
# lexer state in which that literal is to be scanned.
def push_literal(*args)
  fresh = Literal.new(self, *args)
  @literal_stack << fresh

  next_state_for_literal(fresh)
end
386
+
387
# Chooses the scanner entry point for a literal from three of its
# properties: whether it is a words literal, whether it is
# backslash-delimited, and whether it interpolates.
def next_state_for_literal(literal)
  machine = self.class

  # Each predicate is queried once up front (assumes they are plain,
  # side-effect-free readers).
  words        = literal.words?
  backslash    = literal.backslash_delimited?
  interpolated = literal.interpolate?

  if words
    if backslash
      return machine.lex_en_interp_backslash_delimited_words if interpolated
      machine.lex_en_plain_backslash_delimited_words
    else
      return machine.lex_en_interp_words if interpolated
      machine.lex_en_plain_words
    end
  elsif backslash
    return machine.lex_en_interp_backslash_delimited if interpolated
    machine.lex_en_plain_backslash_delimited
  else
    return machine.lex_en_interp_string if interpolated
    machine.lex_en_plain_string
  end
end
414
+
415
# The innermost literal currently being lexed (top of the literal
# stack), or nil when the stack is empty.
def literal
  @literal_stack[-1]
end
418
+
419
# Removes the innermost literal from the stack, remembers its dedent
# level, and returns the state to continue lexing in: the regexp
# modifier scanner after a regexp, expr_end after anything else.
def pop_literal
  finished = @literal_stack.pop

  @dedent_level = finished.dedent_level

  # Fetch modifiers.
  return self.class.lex_en_regexp_modifiers if finished.type == :tREGEXP_BEG

  self.class.lex_en_expr_end
end
431
+
432
+ # Mapping of strings to parser tokens.
433
+
434
# Punctuation text => parser token type.
PUNCTUATION = {
  '='   => :tEQL,
  '&'   => :tAMPER2,
  '|'   => :tPIPE,
  '!'   => :tBANG,
  '^'   => :tCARET,
  '+'   => :tPLUS,
  '-'   => :tMINUS,
  '*'   => :tSTAR2,
  '/'   => :tDIVIDE,
  '%'   => :tPERCENT,
  '~'   => :tTILDE,
  ','   => :tCOMMA,
  ';'   => :tSEMI,
  '.'   => :tDOT,
  '..'  => :tDOT2,
  '...' => :tDOT3,
  '['   => :tLBRACK2,
  ']'   => :tRBRACK,
  '('   => :tLPAREN2,
  ')'   => :tRPAREN,
  '?'   => :tEH,
  ':'   => :tCOLON,
  '&&'  => :tANDOP,
  '||'  => :tOROP,
  '-@'  => :tUMINUS,
  '+@'  => :tUPLUS,
  '~@'  => :tTILDE,
  '**'  => :tPOW,
  '->'  => :tLAMBDA,
  '=~'  => :tMATCH,
  '!~'  => :tNMATCH,
  '=='  => :tEQ,
  '!='  => :tNEQ,
  '>'   => :tGT,
  '>>'  => :tRSHFT,
  '>='  => :tGEQ,
  '<'   => :tLT,
  '<<'  => :tLSHFT,
  '<='  => :tLEQ,
  '=>'  => :tASSOC,
  '::'  => :tCOLON2,
  '===' => :tEQQ,
  '<=>' => :tCMP,
  '[]'  => :tAREF,
  '[]=' => :tASET,
  '{'   => :tLCURLY,
  '}'   => :tRCURLY,
  '`'   => :tBACK_REF2,
  '!@'  => :tBANG,
  '&.'  => :tANDDOT,
  '.:'  => :tMETHREF
}
453
+
454
# Token types for the punctuation that is tokenized differently at the
# beginning of an expression (see the punctuation_begin pattern).
PUNCTUATION_BEGIN = {
  '&'  => :tAMPER,
  '*'  => :tSTAR,
  '**' => :tDSTAR,
  '+'  => :tUPLUS,
  '-'  => :tUMINUS,
  '::' => :tCOLON3,
  '('  => :tLPAREN,
  '{'  => :tLBRACE,
  '['  => :tLBRACK,
}
459
+
460
# Keyword text => token type. The entries written out here are the ones
# whose token differs between the two tables (the *_MOD modifier forms),
# plus `defined?`, `BEGIN` and `END`, whose token names do not follow the
# k<UPCASED KEYWORD> pattern used by the loop below.
KEYWORDS = {
  'if'       => :kIF_MOD,
  'unless'   => :kUNLESS_MOD,
  'while'    => :kWHILE_MOD,
  'until'    => :kUNTIL_MOD,
  'rescue'   => :kRESCUE_MOD,
  'defined?' => :kDEFINED,
  'BEGIN'    => :klBEGIN,
  'END'      => :klEND,
}

# Same keywords with the token types used at the beginning of an
# expression.
KEYWORDS_BEGIN = {
  'if'       => :kIF,
  'unless'   => :kUNLESS,
  'while'    => :kWHILE,
  'until'    => :kUNTIL,
  'rescue'   => :kRESCUE,
  'defined?' => :kDEFINED,
  'BEGIN'    => :klBEGIN,
  'END'      => :klEND,
}

# Every remaining keyword maps to k<UPCASED KEYWORD> in both tables.
%w(class module def undef begin end then elsif else ensure case when
   for break next redo retry in do return yield super self nil true
   false and or not alias __FILE__ __LINE__ __ENCODING__).each do |word|
  token_type = "k#{word.upcase}".to_sym

  KEYWORDS[word] = token_type
  KEYWORDS_BEGIN[word] = token_type
end
479
+
480
+ %%{
481
+ # %
482
+
483
+ access @;
484
+ getkey (@source_pts[p] || 0);
485
+
486
+ # === CHARACTER CLASSES ===
487
+ #
488
+ # Pay close attention to the differences between c_any and any.
489
+ # c_any does not include EOF and so will cause incorrect behavior
490
+ # for machine subtraction (any-except rules) and default transitions
491
+ # for scanners.
492
+
493
+ action do_nl {
494
+ # Record position of a newline for precise location reporting on tNL
495
+ # tokens.
496
+ #
497
+ # This action is embedded directly into c_nl, as it is idempotent and
498
+ # there are no cases when we need to skip it.
499
+ @newline_s = p
500
+ }
501
+
502
+ c_nl = '\n' $ do_nl;
503
+ c_space = [ \t\r\f\v];
504
+ c_space_nl = c_space | c_nl;
505
+
506
+ c_eof = 0x04 | 0x1a | 0 | zlen; # ^D, ^Z, \0, EOF
507
+ c_eol = c_nl | c_eof;
508
+ c_any = any - c_eof;
509
+
510
+ c_nl_zlen = c_nl | zlen;
511
+ c_line = any - c_nl_zlen;
512
+
513
+ c_unicode = c_any - 0x00..0x7f;
514
+ c_upper = [A-Z];
515
+ c_lower = [a-z_] | c_unicode;
516
+ c_alpha = c_lower | c_upper;
517
+ c_alnum = c_alpha | [0-9];
518
+
519
+ action do_eof {
520
+ # Sit at EOF indefinitely. #advance would return $eof each time.
521
+ # This allows feeding the lexer more data if needed; this is only used
522
+ # in tests.
523
+ #
524
+ # Note that this action is not embedded into e_eof like e_heredoc_nl and e_bs
525
+ # below. This is due to the fact that scanner state at EOF is observed
526
+ # by tests, and encapsulating it in a rule would break the introspection.
527
+ fhold; fbreak;
528
+ }
529
+
530
+ #
531
+ # === TOKEN DEFINITIONS ===
532
+ #
533
+
534
+ # All operators are punctuation. There is more to punctuation
535
+ # than just operators. Operators can be overridden by user;
536
+ # punctuation can not.
537
+
538
+ # A list of operators which are valid in the function name context, but
539
+ # have different semantics in others.
540
+ operator_fname = '[]' | '[]=' | '`' | '-@' | '+@' | '~@' | '!@' ;
541
+
542
+ # A list of operators which can occur within an assignment shortcut (+ → +=).
543
+ operator_arithmetic = '&' | '|' | '&&' | '||' | '^' | '+' | '-' |
544
+ '*' | '/' | '**' | '~' | '<<' | '>>' | '%' ;
545
+
546
+ # A list of all user-definable operators not covered by groups above.
547
+ operator_rest = '=~' | '!~' | '==' | '!=' | '!' | '===' |
548
+ '<' | '<=' | '>' | '>=' | '<=>' | '=>' ;
549
+
550
+ # Note that `{` and `}` need to be referred to as e_lbrace and e_rbrace,
551
+ # as they are ambiguous with interpolation `#{}` and should be counted.
552
+ # These braces are not present in punctuation lists.
553
+
554
+ # A list of punctuation which has different meaning when used at the
555
+ # beginning of expression.
556
+ punctuation_begin = '-' | '+' | '::' | '(' | '[' |
557
+ '*' | '**' | '&' ;
558
+
559
+ # A list of all punctuation except punctuation_begin.
560
+ punctuation_end = ',' | '=' | '->' | '(' | '[' | ']' |
561
+ '::' | '?' | ':' | '.' | '..' | '...' ;
562
+
563
+ # A list of keywords which have different meaning at the beginning of expression.
564
+ keyword_modifier = 'if' | 'unless' | 'while' | 'until' | 'rescue' ;
565
+
566
+ # A list of keywords which accept an argument-like expression, i.e. have the
567
+ # same post-processing as method calls or commands. Example: `yield 1`,
568
+ # `yield (1)`, `yield(1)`, are interpreted as if `yield` was a function.
569
+ keyword_with_arg = 'yield' | 'super' | 'not' | 'defined?' ;
570
+
571
+ # A list of keywords which accept a literal function name as an argument.
572
+ keyword_with_fname = 'def' | 'undef' | 'alias' ;
573
+
574
+ # A list of keywords which accept an expression after them.
575
+ keyword_with_value = 'else' | 'case' | 'ensure' | 'module' | 'elsif' | 'then' |
576
+ 'for' | 'in' | 'do' | 'when' | 'begin' | 'class' |
577
+ 'and' | 'or' ;
578
+
579
+ # A list of keywords which accept a value, and treat the keywords from
580
+ # `keyword_modifier` list as modifiers.
581
+ keyword_with_mid = 'rescue' | 'return' | 'break' | 'next' ;
582
+
583
+ # A list of keywords which do not accept an expression after them.
584
+ keyword_with_end = 'end' | 'self' | 'true' | 'false' | 'retry' |
585
+ 'redo' | 'nil' | 'BEGIN' | 'END' | '__FILE__' |
586
+ '__LINE__' | '__ENCODING__';
587
+
588
+ # All keywords.
589
+ keyword = keyword_with_value | keyword_with_mid |
590
+ keyword_with_end | keyword_with_arg |
591
+ keyword_with_fname | keyword_modifier ;
592
+
593
+ constant = c_upper c_alnum*;
594
+ bareword = c_alpha c_alnum*;
595
+
596
+ call_or_var = c_lower c_alnum*;
597
+ class_var = '@@' bareword;
598
+ instance_var = '@' bareword;
599
+ global_var = '$'
600
+ ( bareword | digit+
601
+ | [`'+~*$&?!@/\\;,.=:<>"] # `
602
+ | '-' c_alnum
603
+ )
604
+ ;
605
+
606
+ # Ruby accepts (and fails on) variables with leading digit
607
+ # in literal context, but not in unquoted symbol body.
608
+ class_var_v = '@@' c_alnum+;
609
+ instance_var_v = '@' c_alnum+;
610
+
611
+ label = bareword [?!]? ':';
612
+
613
+ #
614
+ # === NUMERIC PARSING ===
615
+ #
616
+
617
+ int_hex = ( xdigit+ '_' )* xdigit* '_'? ;
618
+ int_dec = ( digit+ '_' )* digit* '_'? ;
619
+ int_bin = ( [01]+ '_' )* [01]* '_'? ;
620
+
621
+ flo_int = [1-9] [0-9]* ( '_' digit+ )* | '0';
622
+ flo_frac = '.' ( digit+ '_' )* digit+;
623
+ flo_pow = [eE] [+\-]? ( digit+ '_' )* digit+;
624
+
625
+ int_suffix =
626
+ '' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars) } }
627
+ | 'r' % { @num_xfrm = lambda { |chars| emit(:tRATIONAL, Rational(chars)) } }
628
+ | 'i' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, chars)) } }
629
+ | 'ri' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, Rational(chars))) } }
630
+ | 're' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars, @ts, @te - 2); p -= 2 } }
631
+ | 'if' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars, @ts, @te - 2); p -= 2 } }
632
+ | 'rescue' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars, @ts, @te - 6); p -= 6 } };
633
+
634
+ flo_pow_suffix =
635
+ '' % { @num_xfrm = lambda { |chars| emit(:tFLOAT, Float(chars)) } }
636
+ | 'i' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, Float(chars))) } }
637
+ | 'if' % { @num_xfrm = lambda { |chars| emit(:tFLOAT, Float(chars), @ts, @te - 2); p -= 2 } };
638
+
639
+ flo_suffix =
640
+ flo_pow_suffix
641
+ | 'r' % { @num_xfrm = lambda { |chars| emit(:tRATIONAL, Rational(chars)) } }
642
+ | 'ri' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, Rational(chars))) } }
643
+ | 'rescue' % { @num_xfrm = lambda { |chars| emit(:tFLOAT, Float(chars), @ts, @te - 6); p -= 6 } };
644
+
645
+ #
646
+ # === ESCAPE SEQUENCE PARSING ===
647
+ #
648
+
649
+ # Escape parsing code is a Ragel pattern, not a scanner, and therefore
650
+ # it shouldn't directly raise errors or perform other actions with side effects.
651
+ # In reality this would probably just mess up error reporting in pathological
652
+ # cases, though.
653
+
654
+ # The amount of code required to parse \M\C stuff correctly is ridiculous.
655
+
656
+ escaped_nl = "\\" c_nl;
657
+
658
+ action unicode_points {
659
+ @escape = ""
660
+
661
+ codepoints = tok(@escape_s + 2, p - 1)
662
+ codepoint_s = @escape_s + 2
663
+
664
+ if @version < 24
665
+ if codepoints.start_with?(" ") || codepoints.start_with?("\t")
666
+ diagnostic :fatal, :invalid_unicode_escape, nil,
667
+ range(@escape_s + 2, @escape_s + 3)
668
+ end
669
+
670
+ if spaces_p = codepoints.index(/[ \t]{2}/)
671
+ diagnostic :fatal, :invalid_unicode_escape, nil,
672
+ range(codepoint_s + spaces_p + 1, codepoint_s + spaces_p + 2)
673
+ end
674
+
675
+ if codepoints.end_with?(" ") || codepoints.end_with?("\t")
676
+ diagnostic :fatal, :invalid_unicode_escape, nil, range(p - 1, p)
677
+ end
678
+ end
679
+
680
+ codepoints.scan(/([0-9a-fA-F]+)|([ \t]+)/).each do |(codepoint_str, spaces)|
681
+ if spaces
682
+ codepoint_s += spaces.length
683
+ else
684
+ codepoint = codepoint_str.to_i(16)
685
+
686
+ if codepoint >= 0x110000
687
+ diagnostic :error, :unicode_point_too_large, nil,
688
+ range(codepoint_s, codepoint_s + codepoint_str.length)
689
+ break
690
+ end
691
+
692
+ @escape += codepoint.chr(Encoding::UTF_8)
693
+ codepoint_s += codepoint_str.length
694
+ end
695
+ end
696
+ }
697
+
698
+ action unescape_char {
699
+ codepoint = @source_pts[p - 1]
700
+ if (@escape = ESCAPES[codepoint]).nil?
701
+ @escape = encode_escape(@source_buffer.slice(p - 1))
702
+ end
703
+ }
704
+
705
+ action invalid_complex_escape {
706
+ diagnostic :fatal, :invalid_escape
707
+ }
708
+
709
+ action read_post_meta_or_ctrl_char {
710
+ @escape = @source_buffer.slice(p - 1).chr
711
+
712
+ if @version >= 27 && ((0..8).include?(@escape.ord) || (14..31).include?(@escape.ord))
713
+ diagnostic :fatal, :invalid_escape
714
+ end
715
+ }
716
+
717
+ action slash_c_char {
718
+ @escape = encode_escape(@escape[0].ord & 0x9f)
719
+ }
720
+
721
+ action slash_m_char {
722
+ @escape = encode_escape(@escape[0].ord | 0x80)
723
+ }
724
+
725
+ maybe_escaped_char = (
726
+ '\\' c_any %unescape_char
727
+ | ( c_any - [\\] ) %read_post_meta_or_ctrl_char
728
+ );
729
+
730
+ maybe_escaped_ctrl_char = ( # why?!
731
+ '\\' c_any %unescape_char %slash_c_char
732
+ | '?' % { @escape = "\x7f" }
733
+ | ( c_any - [\\?] ) %read_post_meta_or_ctrl_char %slash_c_char
734
+ );
735
+
736
+ escape = (
737
+ # \377
738
+ [0-7]{1,3}
739
+ % { @escape = encode_escape(tok(@escape_s, p).to_i(8) % 0x100) }
740
+
741
+ # \xff
742
+ | 'x' xdigit{1,2}
743
+ % { @escape = encode_escape(tok(@escape_s + 1, p).to_i(16)) }
744
+
745
+ # %q[\x]
746
+ | 'x' ( c_any - xdigit )
747
+ % {
748
+ diagnostic :fatal, :invalid_hex_escape, nil, range(@escape_s - 1, p + 2)
749
+ }
750
+
751
+ # \u263a
752
+ | 'u' xdigit{4}
753
+ % { @escape = tok(@escape_s + 1, p).to_i(16).chr(Encoding::UTF_8) }
754
+
755
+ # \u123
756
+ | 'u' xdigit{0,3}
757
+ % {
758
+ diagnostic :fatal, :invalid_unicode_escape, nil, range(@escape_s - 1, p)
759
+ }
760
+
761
+ # u{not hex} or u{}
762
+ | 'u{' ( c_any - xdigit - [ \t}] )* '}'
763
+ % {
764
+ diagnostic :fatal, :invalid_unicode_escape, nil, range(@escape_s - 1, p)
765
+ }
766
+
767
+ # \u{ \t 123 \t 456 \t\t }
768
+ | 'u{' [ \t]* ( xdigit{1,6} [ \t]+ )*
769
+ (
770
+ ( xdigit{1,6} [ \t]* '}'
771
+ %unicode_points
772
+ )
773
+ |
774
+ ( xdigit* ( c_any - xdigit - [ \t}] )+ '}'
775
+ | ( c_any - [ \t}] )* c_eof
776
+ | xdigit{7,}
777
+ ) % {
778
+ diagnostic :fatal, :unterminated_unicode, nil, range(p - 1, p)
779
+ }
780
+ )
781
+
782
+ # \C-\a \cx
783
+ | ( 'C-' | 'c' ) escaped_nl?
784
+ maybe_escaped_ctrl_char
785
+
786
+ # \M-a
787
+ | 'M-' escaped_nl?
788
+ maybe_escaped_char
789
+ %slash_m_char
790
+
791
+ # \C-\M-f \M-\cf \c\M-f
792
+ | ( ( 'C-' | 'c' ) escaped_nl? '\\M-'
793
+ | 'M-\\' escaped_nl? ( 'C-' | 'c' ) ) escaped_nl?
794
+ maybe_escaped_ctrl_char
795
+ %slash_m_char
796
+
797
+ | 'C' c_any %invalid_complex_escape
798
+ | 'M' c_any %invalid_complex_escape
799
+ | ( 'M-\\C' | 'C-\\M' ) c_any %invalid_complex_escape
800
+
801
+ | ( c_any - [0-7xuCMc] ) %unescape_char
802
+
803
+ | c_eof % {
804
+ diagnostic :fatal, :escape_eof, nil, range(p - 1, p)
805
+ }
806
+ );
807
+
808
+ # Use rules in form of `e_bs escape' when you need to parse a sequence.
809
+ e_bs = '\\' % {
810
+ @escape_s = p
811
+ @escape = nil
812
+ };
813
+
814
+ #
815
+ # === STRING AND HEREDOC PARSING ===
816
+ #
817
+
818
+ # Heredoc parsing is quite a complex topic. First, consider that heredocs
819
+ # can be arbitrarily nested. For example:
820
+ #
821
+ # puts <<CODE
822
+ # the result is: #{<<RESULT.inspect
823
+ # i am a heredoc
824
+ # RESULT
825
+ # }
826
+ # CODE
827
+ #
828
+ # which, incidentally, evaluates to:
829
+ #
830
+ # the result is: " i am a heredoc\n"
831
+ #
832
+ # To parse them, lexer refers to two kinds (remember, nested heredocs)
833
+ # of positions in the input stream, namely heredoc_e
834
+ # (HEREDOC declaration End) and @herebody_s (HEREdoc BODY line Start).
835
+ #
836
+ # heredoc_e is simply contained inside the corresponding Literal, and
837
+ # when the heredoc is closed, the lexing is restarted from that position.
838
+ #
839
+ # @herebody_s is quite more complex. First, @herebody_s changes after each
840
+ # heredoc line is lexed. This way, at '\n' tok(@herebody_s, @te) always
841
+ # contains the current line, and also when a heredoc is started, @herebody_s
842
+ # contains the position from which the heredoc will be lexed.
843
+ #
844
+ # Second, as (insanity) there are nested heredocs, we need to maintain a
845
+ # stack of these positions. Each time #push_literal is called, it saves current
846
+ # @herebody_s to literal.saved_herebody_s, and after an interpolation (possibly
847
+ # containing other heredocs) is closed, the previous value is restored.
848
+
849
+ e_heredoc_nl = c_nl % {
850
+ # After every heredoc was parsed, @herebody_s contains the
851
+ # position of next token after all heredocs.
852
+ if @herebody_s
853
+ p = @herebody_s
854
+ @herebody_s = nil
855
+ end
856
+ };
857
+
858
+ action extend_string {
859
+ string = tok
860
+
861
+ # tLABEL_END is only possible in non-cond context on >= 2.2
862
+ if @version >= 22 && !@cond.active?
863
+ lookahead = @source_buffer.slice(@te...@te+2)
864
+ end
865
+
866
+ current_literal = literal
867
+ if !current_literal.heredoc? &&
868
+ (token = current_literal.nest_and_try_closing(string, @ts, @te, lookahead))
869
+ if token[0] == :tLABEL_END
870
+ p += 1
871
+ pop_literal
872
+ fnext expr_labelarg;
873
+ else
874
+ fnext *pop_literal;
875
+ end
876
+ fbreak;
877
+ else
878
+ current_literal.extend_string(string, @ts, @te)
879
+ end
880
+ }
881
+
882
+ action extend_string_escaped {
883
+ current_literal = literal
884
+ # Get the first character after the backslash.
885
+ escaped_char = @source_buffer.slice(@escape_s).chr
886
+
887
+ if current_literal.munge_escape? escaped_char
888
+ # If this particular literal uses this character as an opening
889
+ # or closing delimiter, it is an escape sequence for that
890
+ # particular character. Write it without the backslash.
891
+
892
+ if current_literal.regexp? && REGEXP_META_CHARACTERS.match(escaped_char)
893
+ # Regular expressions should include escaped delimiters in their
894
+ # escaped form, except when the escaped character is
895
+ # a closing delimiter but not a regexp metacharacter.
896
+ #
897
+ # The backslash itself cannot be used as a closing delimiter
898
+ # at the same time as an escape symbol, but it is always munged,
899
+ # so this branch also executes for the non-closing-delimiter case
900
+ # for the backslash.
901
+ current_literal.extend_string(tok, @ts, @te)
902
+ else
903
+ current_literal.extend_string(escaped_char, @ts, @te)
904
+ end
905
+ else
906
+ # It does not. So this is an actual escape sequence, yay!
907
+ if current_literal.squiggly_heredoc? && escaped_char == "\n".freeze
908
+ # Squiggly heredocs like
909
+ # <<~-HERE
910
+ # 1\
911
+ # 2
912
+ # HERE
913
+ # treat '\' as a line continuation, but still dedent the body, so the heredoc above becomes "12\n".
914
+ # This information is emitted as is, without escaping,
915
+ # later this escape sequence (\\\n) gets handled manually in the Lexer::Dedenter
916
+ current_literal.extend_string(tok, @ts, @te)
917
+ elsif current_literal.supports_line_continuation_via_slash? && escaped_char == "\n".freeze
918
+ # Heredocs, regexp and a few other types of literals support line
919
+ # continuation via \\\n sequence. The code like
920
+ # "a\
921
+ # b"
922
+ # must be parsed as "ab"
923
+ current_literal.extend_string(tok.gsub("\\\n".freeze, ''.freeze), @ts, @te)
924
+ elsif current_literal.regexp?
925
+ # Regular expressions should include escape sequences in their
926
+ # escaped form. On the other hand, escaped newlines are removed (in cases like "\\C-\\\n\\M-x")
927
+ current_literal.extend_string(tok.gsub("\\\n".freeze, ''.freeze), @ts, @te)
928
+ else
929
+ current_literal.extend_string(@escape || tok, @ts, @te)
930
+ end
931
+ end
932
+ }
933
+
934
+ # Extend a string with a newline or an EOF character.
935
+ # As heredoc closing line can immediately precede EOF, this action
936
+ # has to handle such case specially.
937
+ action extend_string_eol {
938
+ current_literal = literal
939
+ if @te == pe
940
+ diagnostic :fatal, :string_eof, nil,
941
+ range(current_literal.str_s, current_literal.str_s + 1)
942
+ end
943
+
944
+ if current_literal.heredoc?
945
+ line = tok(@herebody_s, @ts).gsub(/\r+$/, ''.freeze)
946
+
947
+ if version?(18, 19, 20)
948
+ # See ruby:c48b4209c
949
+ line = line.gsub(/\r.*$/, ''.freeze)
950
+ end
951
+
952
+ # Try ending the heredoc with the complete most recently
953
+ # scanned line. @herebody_s always refers to the start of such line.
954
+ if current_literal.nest_and_try_closing(line, @herebody_s, @ts)
955
+ # Adjust @herebody_s to point to the next line.
956
+ @herebody_s = @te
957
+
958
+ # Continue regular lexing after the heredoc reference (<<END).
959
+ p = current_literal.heredoc_e - 1
960
+ fnext *pop_literal; fbreak;
961
+ else
962
+ # Calculate indentation level for <<~HEREDOCs.
963
+ current_literal.infer_indent_level(line)
964
+
965
+ # Ditto.
966
+ @herebody_s = @te
967
+ end
968
+ else
969
+ # Try ending the literal with a newline.
970
+ if current_literal.nest_and_try_closing(tok, @ts, @te)
971
+ fnext *pop_literal; fbreak;
972
+ end
973
+
974
+ if @herebody_s
975
+ # This is a regular literal intertwined with a heredoc. Like:
976
+ #
977
+ # p <<-foo+"1
978
+ # bar
979
+ # foo
980
+ # 2"
981
+ #
982
+ # which, incidentally, evaluates to "bar\n1\n2".
983
+ p = @herebody_s - 1
984
+ @herebody_s = nil
985
+ end
986
+ end
987
+
988
+ if current_literal.words? && !eof_codepoint?(@source_pts[p])
989
+ current_literal.extend_space @ts, @te
990
+ else
991
+ # A literal newline is appended if the heredoc was _not_ closed
992
+ # this time (see fbreak above). See also Literal#nest_and_try_closing
993
+ # for rationale of calling #flush_string here.
994
+ current_literal.extend_string tok, @ts, @te
995
+ current_literal.flush_string
996
+ end
997
+ }
998
+
999
+ action extend_string_space {
1000
+ literal.extend_space @ts, @te
1001
+ }
1002
+
1003
+ #
1004
+ # === INTERPOLATION PARSING ===
1005
+ #
1006
+
1007
+ # Interpolations with immediate variable names simply call into
1008
+ # the corresponding machine.
1009
+
1010
+ interp_var = '#' ( global_var | class_var_v | instance_var_v );
1011
+
1012
+ action extend_interp_var { # Runs on an interp_var match: a hash sign followed by a global, class or instance variable.
1013
+ current_literal = literal
1014
+ current_literal.flush_string # NOTE(review): presumably flushes the string content buffered so far; defined on the literal class, confirm there
1015
+ current_literal.extend_content
1016
+
1017
+ emit(:tSTRING_DVAR, nil, @ts, @ts + 1) # nil value; the token range covers only the hash sign
1018
+
1019
+ p = @ts # rewind to the hash sign so that scanning resumes at the variable sigil right after it
1020
+ fcall expr_variable; # expr_variable emits the variable token and returns to this machine through stack_pop
1021
+ }
1022
+
1023
+ # Special case for Ruby >= 2.7
1024
+ # If interpolated instance/class variable starts with a digit we parse it as a plain substring
1025
+ # However, "#$1" is still a regular interpolation
1026
+ interp_digit_var = '#' ('@' | '@@') digit c_alpha*;
1027
+
1028
+ action extend_interp_digit_var { # Runs on interp_digit_var: a hash sign, one or two at-signs, then a digit.
1029
+ if @version >= 27
1030
+ literal.extend_string(tok, @ts, @te) # 2.7 and later: the matched text is kept as plain string content
1031
+ else
1032
+ message = tok.start_with?('#@@') ? :cvar_name : :ivar_name # older versions: pick the class or instance variable diagnostic
1033
+ diagnostic :error, message, { :name => tok(@ts + 1, @te) }, range(@ts + 1, @te) # reported name and range skip the leading hash sign
1034
+ end
1035
+ }
1036
+
1037
+ # Interpolations with code blocks must match nested curly braces, as
1038
+ # interpolation ending is ambiguous with a block ending. So, every
1039
+ # opening and closing brace should be matched with e_[lr]brace rules,
1040
+ # which automatically perform the counting.
1041
+ #
1042
+ # Note that interpolations can themselves be nested, so brace balance
1043
+ # is tied to the innermost literal.
1044
+ #
1045
+ # Also note that literals themselves should not use e_[lr]brace rules
1046
+ # when matching their opening and closing delimiters, as the amount of
1047
+ # braces inside the characters of a string literal is independent.
1048
+
1049
+ interp_code = '#{';
1050
+
1051
+ e_lbrace = '{' % { # Leaving action attached to every opening brace matched through e_lbrace.
1052
+ @cond.push(false); @cmdarg.push(false) # part of the kDO/kDO_COND/kDO_BLOCK disambiguation state, pushed for every opening brace
1053
+
1054
+ current_literal = literal
1055
+ if current_literal # NOTE(review): literal is presumably nil when no string literal is being lexed; confirm
1056
+ current_literal.start_interp_brace # brace balance is tracked on the current literal, see the note on nested braces above
1057
+ end
1058
+ };
1059
+
1060
+ e_rbrace = '}' % { # Leaving action attached to every closing brace matched through e_rbrace.
1060
+ current_literal = literal
1061
+ if current_literal
1062
+ if current_literal.end_interp_brace_and_try_closing # a truthy result is treated as the end of the interpolation
1063
+ if version?(18, 19) # 1.8 and 1.9 get tRCURLY plus a cond/cmdarg pop, later versions get tSTRING_DEND
1064
+ emit(:tRCURLY, '}'.freeze, p - 1, p)
1065
+ @cond.lexpop
1066
+ @cmdarg.lexpop
1067
+ else
1068
+ emit(:tSTRING_DEND, '}'.freeze, p - 1, p)
1069
+ end
1070
+
1071
+ if current_literal.saved_herebody_s # restore the heredoc body position stashed by extend_interp_code
1072
+ @herebody_s = current_literal.saved_herebody_s
1073
+ end
1074
+
1075
+
1076
+ fhold; # step back one character before leaving
1077
+ fnext *next_state_for_literal(current_literal); # continue in the state chosen for this literal
1078
+ fbreak;
1079
+ end
1080
+ end
1081
+
1082
+ @paren_nest -= 1 # NOTE(review): apparently not reached once the fbreak above has fired, i.e. only for braces that did not close an interpolation; confirm
1083
+ };
1085
+
1086
+ action extend_interp_code { # Runs on interp_code, the two-character opener of a code interpolation.
1087
+ current_literal = literal
1088
+ current_literal.flush_string # NOTE(review): presumably flushes the string content buffered so far; defined on the literal class, confirm there
1089
+ current_literal.extend_content
1090
+
1091
+ emit(:tSTRING_DBEG, '#{'.freeze) # token for the interpolation opener
1092
+
1093
+ if current_literal.heredoc? # stash the heredoc body position on the literal and clear it; e_rbrace restores it
1094
+ current_literal.saved_herebody_s = @herebody_s
1095
+ @herebody_s = nil
1096
+ end
1097
+
1098
+ current_literal.start_interp_brace # the opener takes part in the brace balance maintained by e_lbrace and e_rbrace
1099
+ @command_start = true
1100
+ fnext expr_value; # the interpolated code is lexed starting in expr_value
1101
+ fbreak;
1102
+ }
1103
+
1104
+ # Actual string parsers are simply combined from the primitives defined
1105
+ # above.
1106
+
1107
+ interp_words := |*
1108
+ interp_code => extend_interp_code;
1109
+ interp_digit_var => extend_interp_digit_var;
1110
+ interp_var => extend_interp_var;
1111
+ e_bs escape => extend_string_escaped;
1112
+ c_space+ => extend_string_space;
1113
+ c_eol => extend_string_eol;
1114
+ c_any => extend_string;
1115
+ *|;
1116
+
1117
+ interp_string := |*
1118
+ interp_code => extend_interp_code;
1119
+ interp_digit_var => extend_interp_digit_var;
1120
+ interp_var => extend_interp_var;
1121
+ e_bs escape => extend_string_escaped;
1122
+ c_eol => extend_string_eol;
1123
+ c_any => extend_string;
1124
+ *|;
1125
+
1126
+ plain_words := |*
1127
+ e_bs c_any => extend_string_escaped;
1128
+ c_space+ => extend_string_space;
1129
+ c_eol => extend_string_eol;
1130
+ c_any => extend_string;
1131
+ *|;
1132
+
1133
+ plain_string := |*
1134
+ '\\' c_nl => extend_string_eol;
1135
+ e_bs c_any => extend_string_escaped;
1136
+ c_eol => extend_string_eol;
1137
+ c_any => extend_string;
1138
+ *|;
1139
+
1140
+ interp_backslash_delimited := |*
1141
+ interp_code => extend_interp_code;
1142
+ interp_digit_var => extend_interp_digit_var;
1143
+ interp_var => extend_interp_var;
1144
+ c_eol => extend_string_eol;
1145
+ c_any => extend_string;
1146
+ *|;
1147
+
1148
+ plain_backslash_delimited := |*
1149
+ c_eol => extend_string_eol;
1150
+ c_any => extend_string;
1151
+ *|;
1152
+
1153
+ interp_backslash_delimited_words := |*
1154
+ interp_code => extend_interp_code;
1155
+ interp_digit_var => extend_interp_digit_var;
1156
+ interp_var => extend_interp_var;
1157
+ c_space+ => extend_string_space;
1158
+ c_eol => extend_string_eol;
1159
+ c_any => extend_string;
1160
+ *|;
1161
+
1162
+ plain_backslash_delimited_words := |*
1163
+ c_space+ => extend_string_space;
1164
+ c_eol => extend_string_eol;
1165
+ c_any => extend_string;
1166
+ *|;
1167
+
1168
+ regexp_modifiers := |* # Emits tREGEXP_OPT; NOTE(review): presumably entered right after a regexp literal is closed, confirm at the call site
1169
+ [A-Za-z]+ # a run of letters is taken as the modifier list
1170
+ => {
1171
+ unknown_options = tok.scan(/[^imxouesn]/) # collect every letter that is not one of i m x o u e s n
1172
+ if unknown_options.any?
1173
+ diagnostic :error, :regexp_options, # the unknown letters are reported joined into one string
1174
+ { :options => unknown_options.join }
1175
+ end
1176
+
1177
+ emit(:tREGEXP_OPT)
1178
+ fnext expr_end;
1179
+ fbreak;
1180
+ };
1181
+
1182
+ any # anything else: there are no modifier letters
1183
+ => {
1184
+ emit(:tREGEXP_OPT, tok(@ts, @te - 1), @ts, @te - 1) # the range excludes the matched character, so value and range are empty
1185
+ fhold; # give the character back
1186
+ fgoto expr_end; # and let expr_end lex it
1187
+ };
1188
+ *|;
1189
+
1190
+ #
1191
+ # === WHITESPACE HANDLING ===
1192
+ #
1193
+
1194
+ # Various contexts in Ruby allow various kinds of whitespace
1195
+ # to be used. They are grouped to clarify the lexing machines
1196
+ # and ease collection of comments.
1197
+
1198
+ # A line of code with inline #comment at end is always equivalent
1199
+ # to a line of code ending with just a newline, so an inline
1200
+ # comment is deemed equivalent to non-newline whitespace
1201
+ # (c_space character class).
1202
+
1203
+ w_space =
1204
+ c_space+
1205
+ | '\\' e_heredoc_nl
1206
+ ;
1207
+
1208
+ w_comment =
1209
+ '#' %{ @sharp_s = p - 1 }
1210
+ # The (p == pe) condition compensates for added "\0" and
1211
+ # the way Ragel handles EOF.
1212
+ c_line* %{ emit_comment(@sharp_s, p == pe ? p - 2 : p) }
1213
+ ;
1214
+
1215
+ w_space_comment =
1216
+ w_space
1217
+ | w_comment
1218
+ ;
1219
+
1220
+ # A newline in non-literal context always interoperates with
1221
+ # here document logic and can always be escaped by a backslash,
1222
+ # still interoperating with here document logic in the same way,
1223
+ # yet being invisible to anything else.
1224
+ #
1225
+ # To demonstrate:
1226
+ #
1227
+ # foo = <<FOO \
1228
+ # bar
1229
+ # FOO
1230
+ # + 2
1231
+ #
1232
+ # is equivalent to `foo = "bar\n" + 2`.
1233
+
1234
+ w_newline =
1235
+ e_heredoc_nl;
1236
+
1237
+ w_any =
1238
+ w_space
1239
+ | w_comment
1240
+ | w_newline
1241
+ ;
1242
+
1243
+
1244
+ #
1245
+ # === EXPRESSION PARSING ===
1246
+ #
1247
+
1248
+ # These rules implement a form of manually defined lookahead.
1249
+ # The default longest-match scanning does not work here due
1250
+ # to sheer ambiguity.
1251
+
1252
+ ambiguous_fid_suffix = # actual parsed
1253
+ [?!] %{ tm = p } | # a? a?
1254
+ [?!]'=' %{ tm = p - 2 } # a!=b a != b
1255
+ ;
1256
+
1257
+ ambiguous_ident_suffix = # actual parsed
1258
+ ambiguous_fid_suffix |
1259
+ '=' %{ tm = p } | # a= a=
1260
+ '==' %{ tm = p - 2 } | # a==b a == b
1261
+ '=~' %{ tm = p - 2 } | # a=~b a =~ b
1262
+ '=>' %{ tm = p - 2 } | # a=>b a => b
1263
+ '===' %{ tm = p - 3 } # a===b a === b
1264
+ ;
1265
+
1266
+ ambiguous_symbol_suffix = # actual parsed
1267
+ ambiguous_ident_suffix |
1268
+ '==>' %{ tm = p - 2 } # :a==>b :a= => b
1269
+ ;
1270
+
1271
+ # Ambiguous with 1.9 hash labels.
1272
+ ambiguous_const_suffix = # actual parsed
1273
+ '::' %{ tm = p - 2 } # A::B A :: B
1274
+ ;
1275
+
1276
+ # Resolving kDO/kDO_COND/kDO_BLOCK ambiguity requires embedding
1277
+ # @cond/@cmdarg-related code to e_lbrack, e_lparen and e_lbrace.
1278
+
1279
+ e_lbrack = '[' % {
1280
+ @cond.push(false); @cmdarg.push(false)
1281
+
1282
+ @paren_nest += 1
1283
+ };
1284
+
1285
+ e_rbrack = ']' % {
1286
+ @paren_nest -= 1
1287
+ };
1288
+
1289
+ # Ruby 1.9 lambdas require parentheses counting in order to
1290
+ # emit correct opening kDO/tLBRACE.
1291
+
1292
+ e_lparen = '(' % {
1293
+ @cond.push(false); @cmdarg.push(false)
1294
+
1295
+ @paren_nest += 1
1296
+
1297
+ if version?(18)
1298
+ @command_start = true
1299
+ end
1300
+ };
1301
+
1302
+ e_rparen = ')' % {
1303
+ @paren_nest -= 1
1304
+ };
1305
+
1306
+ # Ruby is context-sensitive with respect to local identifiers: the state entered after a tIDENTIFIER depends on whether the name is declared in @static_env.
1307
+ action local_ident {
1308
+ emit(:tIDENTIFIER)
1309
+
1310
+ if !@static_env.nil? && @static_env.declared?(tok) # declared name: continue in expr_endfn
1311
+ fnext expr_endfn; fbreak;
1312
+ else # no static env or undeclared name: arg_or_cmdarg picks the state from cmd_state
1313
+ fnext *arg_or_cmdarg(cmd_state); fbreak;
1314
+ end
1315
+ }
1316
+
1317
+ # Variable lexing code is accessed from both expressions and
1318
+ # string interpolation related code.
1319
+ # It is entered with fcall; every rule emits one token and returns to the calling state through stack_pop.
1320
+ expr_variable := |*
1321
+ global_var # dollar-prefixed variables: numbered references, back references and ordinary globals
1322
+ => {
1323
+ if tok =~ /^\$([1-9][0-9]*)$/
1324
+ emit(:tNTH_REF, tok(@ts + 1).to_i) # the value is the number after the dollar sign
1325
+ elsif tok =~ /^\$([&`'+])$/
1326
+ emit(:tBACK_REF)
1327
+ else
1328
+ emit(:tGVAR)
1329
+ end
1330
+
1331
+ fnext *stack_pop; fbreak;
1332
+ };
1333
+
1334
+ class_var_v
1335
+ => {
1336
+ if tok =~ /^@@[0-9]/ # a class variable name may not start with a digit
1337
+ diagnostic :error, :cvar_name, { :name => tok }
1338
+ end
1339
+
1340
+ emit(:tCVAR)
1341
+ fnext *stack_pop; fbreak;
1342
+ };
1343
+
1344
+ instance_var_v
1345
+ => {
1346
+ if tok =~ /^@[0-9]/ # an instance variable name may not start with a digit
1347
+ diagnostic :error, :ivar_name, { :name => tok }
1348
+ end
1349
+
1350
+ emit(:tIVAR)
1351
+ fnext *stack_pop; fbreak;
1352
+ };
1353
+ *|;
1354
+
1355
+ # Literal function name in definition (e.g. `def class`).
1356
+ # Keywords are returned as their respective tokens; this is used
1357
+ # to support singleton def `def self.foo`. Global variables are
1358
+ # returned as `tGVAR`; this is used in global variable alias
1359
+ # statements `alias $a $b`. Symbols are returned verbatim; this
1360
+ # is used in `alias :a :"b#{foo}"` and `undef :a`.
1361
+ #
1362
+ # Transitions to `expr_endfn` afterwards.
1363
+ #
1364
+ expr_fname := |*
1365
+ keyword
1366
+ => { emit_table(KEYWORDS_BEGIN);
1367
+ fnext expr_endfn; fbreak; };
1368
+
1369
+ constant
1370
+ => { emit(:tCONSTANT)
1371
+ fnext expr_endfn; fbreak; };
1372
+
1373
+ bareword [?=!]?
1374
+ => { emit(:tIDENTIFIER)
1375
+ fnext expr_endfn; fbreak; };
1376
+
1377
+ global_var
1378
+ => { p = @ts - 1
1379
+ fnext expr_end; fcall expr_variable; };
1380
+
1381
+ # If the handling was to be delegated to expr_end,
1382
+ # these cases would transition to something else than
1383
+ # expr_endfn, which is incorrect.
1384
+ operator_fname |
1385
+ operator_arithmetic |
1386
+ operator_rest
1387
+ => { emit_table(PUNCTUATION)
1388
+ fnext expr_endfn; fbreak; };
1389
+
1390
+ '::'
1391
+ => { fhold; fhold; fgoto expr_end; };
1392
+
1393
+ ':'
1394
+ => { fhold; fgoto expr_beg; };
1395
+
1396
+ '%s' c_any
1397
+ => {
1398
+ if version?(23)
1399
+ type, delimiter = tok[0..-2], tok[-1].chr
1400
+ fgoto *push_literal(type, delimiter, @ts);
1401
+ else
1402
+ p = @ts - 1
1403
+ fgoto expr_end;
1404
+ end
1405
+ };
1406
+
1407
+ w_any;
1408
+
1409
+ c_any
1410
+ => { fhold; fgoto expr_end; };
1411
+
1412
+ c_eof => do_eof;
1413
+ *|;
1414
+
1415
+ # After literal function name in definition. Behaves like `expr_end`,
1416
+ # but allows a tLABEL.
1417
+ #
1418
+ # Transitions to `expr_end` afterwards.
1419
+ #
1420
+ expr_endfn := |*
1421
+ label ( any - ':' )
1422
+ => { emit(:tLABEL, tok(@ts, @te - 2), @ts, @te - 1)
1423
+ fhold; fnext expr_labelarg; fbreak; };
1424
+
1425
+ w_space_comment;
1426
+
1427
+ c_any
1428
+ => { fhold; fgoto expr_end; };
1429
+
1430
+ c_eof => do_eof;
1431
+ *|;
1432
+
1433
+ # Literal function name in method call (e.g. `a.class`).
1434
+ #
1435
+ # Transitions to `expr_arg` afterwards.
1436
+ #
1437
+ expr_dot := |*
1438
+ constant
1439
+ => { emit(:tCONSTANT)
1440
+ fnext *arg_or_cmdarg(cmd_state); fbreak; };
1441
+
1442
+ call_or_var
1443
+ => { emit(:tIDENTIFIER)
1444
+ fnext *arg_or_cmdarg(cmd_state); fbreak; };
1445
+
1446
+ bareword ambiguous_fid_suffix
1447
+ => { emit(:tFID, tok(@ts, tm), @ts, tm)
1448
+ fnext *arg_or_cmdarg(cmd_state); p = tm - 1; fbreak; };
1449
+
1450
+ # See the comment in `expr_fname`.
1451
+ operator_fname |
1452
+ operator_arithmetic |
1453
+ operator_rest
1454
+ => { emit_table(PUNCTUATION)
1455
+ fnext expr_arg; fbreak; };
1456
+
1457
+ w_any;
1458
+
1459
+ c_any
1460
+ => { fhold; fgoto expr_end; };
1461
+
1462
+ c_eof => do_eof;
1463
+ *|;
1464
+
1465
+ # The previous token emitted was a `tIDENTIFIER` or `tFID`; no space
1466
+ # is consumed; the current expression is a command or method call.
1467
+ #
1468
+ expr_arg := |*
1469
+ #
1470
+ # COMMAND MODE SPECIFIC TOKENS
1471
+ #
1472
+
1473
+ # cmd (1 + 2)
1474
+ # See below the rationale about expr_endarg.
1475
+ w_space+ e_lparen
1476
+ => {
1477
+ if version?(18)
1478
+ emit(:tLPAREN2, '('.freeze, @te - 1, @te)
1479
+ fnext expr_value; fbreak;
1480
+ else
1481
+ emit(:tLPAREN_ARG, '('.freeze, @te - 1, @te)
1482
+ fnext expr_beg; fbreak;
1483
+ end
1484
+ };
1485
+
1486
+ # meth(1 + 2)
1487
+ # Regular method call.
1488
+ e_lparen
1489
+ => { emit(:tLPAREN2, '('.freeze)
1490
+ fnext expr_beg; fbreak; };
1491
+
1492
+ # meth [...]
1493
+ # Array argument. Compare with indexing `meth[...]`.
1494
+ w_space+ e_lbrack
1495
+ => { emit(:tLBRACK, '['.freeze, @te - 1, @te)
1496
+ fnext expr_beg; fbreak; };
1497
+
1498
+ # cmd {}
1499
+ # Command: method call without parentheses.
1500
+ w_space* e_lbrace
1501
+ => {
1502
+ if @lambda_stack.last == @paren_nest
1503
+ @lambda_stack.pop
1504
+ emit(:tLAMBEG, '{'.freeze, @te - 1, @te)
1505
+ else
1506
+ emit(:tLCURLY, '{'.freeze, @te - 1, @te)
1507
+ end
1508
+ @command_start = true
1509
+ @paren_nest += 1
1510
+ fnext expr_value; fbreak;
1511
+ };
1512
+
1513
+ #
1514
+ # AMBIGUOUS TOKENS RESOLVED VIA EXPR_BEG
1515
+ #
1516
+
1517
+ # a??
1518
+ # Ternary operator
1519
+ '?' c_space_nl
1520
+ => {
1521
+ # Unlike expr_beg as invoked in the next rule, do not warn
1522
+ p = @ts - 1
1523
+ fgoto expr_end;
1524
+ };
1525
+
1526
+ # a ?b, a? ?
1527
+ # Character literal or ternary operator
1528
+ w_space* '?'
1529
+ => { fhold; fgoto expr_beg; };
1530
+
1531
+ # a %{1}, a %[1] (but not "a %=1=" or "a % foo")
1532
+ # a /foo/ (but not "a / foo" or "a /=foo")
1533
+ # a <<HEREDOC
1534
+ w_space+ %{ tm = p }
1535
+ ( [%/] ( c_any - c_space_nl - '=' ) # /
1536
+ | '<<'
1537
+ )
1538
+ => {
1539
+ if tok(tm, tm + 1) == '/'.freeze
1540
+ # Ambiguous regexp literal.
1541
+ diagnostic :warning, :ambiguous_literal, nil, range(tm, tm + 1)
1542
+ end
1543
+
1544
+ p = tm - 1
1545
+ fgoto expr_beg;
1546
+ };
1547
+
1548
+ # x *1
1549
+ # Ambiguous splat, kwsplat or block-pass.
1550
+ w_space+ %{ tm = p } ( '+' | '-' | '*' | '&' | '**' )
1551
+ => {
1552
+ diagnostic :warning, :ambiguous_prefix, { :prefix => tok(tm, @te) },
1553
+ range(tm, @te)
1554
+
1555
+ p = tm - 1
1556
+ fgoto expr_beg;
1557
+ };
1558
+
1559
+ # x ::Foo
1560
+ # Ambiguous toplevel constant access.
1561
+ w_space+ '::'
1562
+ => { fhold; fhold; fgoto expr_beg; };
1563
+
1564
+ # x:b
1565
+ # Symbol.
1566
+ w_space* ':'
1567
+ => { fhold; fgoto expr_beg; };
1568
+
1569
+ w_space+ label
1570
+ => { p = @ts - 1; fgoto expr_beg; };
1571
+
1572
+ #
1573
+ # AMBIGUOUS TOKENS RESOLVED VIA EXPR_END
1574
+ #
1575
+
1576
+ # a ? b
1577
+ # Ternary operator.
1578
+ w_space+ %{ tm = p } '?' c_space_nl
1579
+ => { p = tm - 1; fgoto expr_end; };
1580
+
1581
+ # x + 1: Binary operator or operator-assignment.
1582
+ w_space* operator_arithmetic
1583
+ ( '=' | c_space_nl )? |
1584
+ # x rescue y: Modifier keyword.
1585
+ w_space* keyword_modifier |
1586
+ # a &. b: Safe navigation operator.
1587
+ w_space* '&.' |
1588
+ # Miscellanea.
1589
+ w_space* punctuation_end
1590
+ => {
1591
+ p = @ts - 1
1592
+ fgoto expr_end;
1593
+ };
1594
+
1595
+ w_space;
1596
+
1597
+ w_comment
1598
+ => { fgoto expr_end; };
1599
+
1600
+ w_newline
1601
+ => { fhold; fgoto expr_end; };
1602
+
1603
+ c_any
1604
+ => { fhold; fgoto expr_beg; };
1605
+
1606
+ c_eof => do_eof;
1607
+ *|;
1608
+
1609
+ # The previous token was an identifier which was seen while in the
1610
+ # command mode (that is, the state at the beginning of #advance was
1611
+ # expr_value). This state is very similar to expr_arg, but disambiguates
1612
+ # two very rare and specific conditions:
1613
+ # * In 1.8 mode, "foo (lambda do end)".
1614
+ # * In 1.9+ mode, "f x: -> do foo do end end".
1615
+ expr_cmdarg := |*
1616
+ w_space+ e_lparen
1617
+ => {
1618
+ emit(:tLPAREN_ARG, '('.freeze, @te - 1, @te)
1619
+ if version?(18)
1620
+ fnext expr_value; fbreak;
1621
+ else
1622
+ fnext expr_beg; fbreak;
1623
+ end
1624
+ };
1625
+
1626
+ w_space* 'do'
1627
+ => {
1628
+ if @cond.active?
1629
+ emit(:kDO_COND, 'do'.freeze, @te - 2, @te)
1630
+ else
1631
+ emit(:kDO, 'do'.freeze, @te - 2, @te)
1632
+ end
1633
+ fnext expr_value; fbreak;
1634
+ };
1635
+
1636
+ c_any |
1637
+ # Disambiguate with the `do' rule above.
1638
+ w_space* bareword |
1639
+ w_space* label
1640
+ => { p = @ts - 1
1641
+ fgoto expr_arg; };
1642
+
1643
+ c_eof => do_eof;
1644
+ *|;
1645
+
1646
+ # The rationale for this state is pretty complex. Normally, if an argument
1647
+ # is passed to a command and then there is a block (tLCURLY...tRCURLY),
1648
+ # the block is attached to the innermost argument (`f` in `m f {}`), or it
1649
+ # is a parse error (`m 1 {}`). But there is a special case for passing a single
1650
+ # primary expression grouped with parentheses: if you write `m (1) {}` or
1651
+ # (2.0 only) `m () {}`, then the block is attached to `m`.
1652
+ #
1653
+ # Thus, we recognize the opening `(` of a command (remember, a command is
1654
+ # a method call without parens) as a tLPAREN_ARG; then, in parser, we recognize
1655
+ # `tLPAREN_ARG expr rparen` as a `primary_expr` and before rparen, set the
1656
+ # lexer's state to `expr_endarg`, which makes it emit the possibly following
1657
+ # `{` as `tLBRACE_ARG`.
1658
+ #
1659
+ # The default post-`expr_endarg` state is `expr_end`, so this state also handles
1660
+ # `do` (as `kDO_BLOCK` in `expr_beg`).
1661
+ expr_endarg := |*
1662
+ e_lbrace
1663
+ => {
1664
+ if @lambda_stack.last == @paren_nest
1665
+ @lambda_stack.pop
1666
+ emit(:tLAMBEG, '{'.freeze)
1667
+ else
1668
+ emit(:tLBRACE_ARG, '{'.freeze)
1669
+ end
1670
+ @paren_nest += 1
1671
+ @command_start = true
1672
+ fnext expr_value; fbreak;
1673
+ };
1674
+
1675
+ 'do'
1676
+ => { emit_do(true)
1677
+ fnext expr_value; fbreak; };
1678
+
1679
+ w_space_comment;
1680
+
1681
+ c_any
1682
+ => { fhold; fgoto expr_end; };
1683
+
1684
+ c_eof => do_eof;
1685
+ *|;
1686
+
1687
+ # The rationale for this state is that several keywords accept a value
1688
+ # (i.e. should transition to `expr_beg`), do not accept it like a command
1689
+ # (i.e. not an `expr_arg`), and must behave like a statement, that is,
1690
+ # accept a modifier if/while/etc.
1691
+ #
1692
+ expr_mid := |*
1693
+ keyword_modifier
1694
+ => { emit_table(KEYWORDS)
1695
+ fnext expr_beg; fbreak; };
1696
+
1697
+ bareword
1698
+ => { p = @ts - 1; fgoto expr_beg; };
1699
+
1700
+ w_space_comment;
1701
+
1702
+ w_newline
1703
+ => { fhold; fgoto expr_end; };
1704
+
1705
+ c_any
1706
+ => { fhold; fgoto expr_beg; };
1707
+
1708
+ c_eof => do_eof;
1709
+ *|;
1710
+
1711
+ # Beginning of an expression.
1712
+ #
1713
+ # Don't fall through to this state from `c_any`; make sure to handle
1714
+ # `c_space* c_nl` and let `expr_end` handle the newline.
1715
+ # Otherwise code like `f\ndef x` gets glued together and the parser
1716
+ # explodes.
1717
+ #
1718
+ expr_beg := |*
1719
+ # +5, -5, - 5
1720
+ [+\-] w_any* [0-9]
1721
+ => {
1722
+ emit(:tUNARY_NUM, tok(@ts, @ts + 1), @ts, @ts + 1)
1723
+ fhold; fnext expr_end; fbreak;
1724
+ };
1725
+
1726
+ # splat *a
1727
+ '*'
1728
+ => { emit(:tSTAR, '*'.freeze)
1729
+ fbreak; };
1730
+
1731
+ #
1732
+ # STRING AND REGEXP LITERALS
1733
+ #
1734
+
1735
+ # /regexp/oui
1736
+ # /=/ (disambiguation with /=)
1737
+ '/' c_any
1738
+ => {
1739
+ type = delimiter = tok[0].chr
1740
+ fhold; fgoto *push_literal(type, delimiter, @ts);
1741
+ };
1742
+
1743
+ # %<string>
1744
+ '%' ( any - [A-Za-z] )
1745
+ => {
1746
+ type, delimiter = @source_buffer.slice(@ts).chr, tok[-1].chr
1747
+ fgoto *push_literal(type, delimiter, @ts);
1748
+ };
1749
+
1750
+ # %w(we are the people)
1751
+ '%' [A-Za-z]+ c_any
1752
+ => {
1753
+ type, delimiter = tok[0..-2], tok[-1].chr
1754
+ fgoto *push_literal(type, delimiter, @ts);
1755
+ };
1756
+
1757
+ '%' c_eof
1758
+ => {
1759
+ diagnostic :fatal, :string_eof, nil, range(@ts, @ts + 1)
1760
+ };
1761
+
1762
+ # Heredoc start.
1763
+ # <<END | <<'END' | <<"END" | <<`END` |
1764
+ # <<-END | <<-'END' | <<-"END" | <<-`END` |
1765
+ # <<~END | <<~'END' | <<~"END" | <<~`END`
1766
+ '<<' [~\-]?
1767
+ ( '"' ( any - '"' )* '"'
1768
+ | "'" ( any - "'" )* "'"
1769
+ | "`" ( any - "`" )* "`"
1770
+ | bareword ) % { heredoc_e = p }
1771
+ c_line* c_nl % { new_herebody_s = p }
1772
+ => {
1773
+ tok(@ts, heredoc_e) =~ /^<<(-?)(~?)(["'`]?)(.*)\3$/m
1774
+
1775
+ indent = !$1.empty? || !$2.empty?
1776
+ dedent_body = !$2.empty?
1777
+ type = $3.empty? ? '<<"'.freeze : ('<<'.freeze + $3)
1778
+ delimiter = $4
1779
+
1780
+ if @version >= 27
1781
+ if delimiter.count("\n") > 0 || delimiter.count("\r") > 0
1782
+ diagnostic :error, :unterminated_heredoc_id, nil, range(@ts, @ts + 1)
1783
+ end
1784
+ elsif @version >= 24
1785
+ if delimiter.count("\n") > 0
1786
+ if delimiter.end_with?("\n")
1787
+ diagnostic :warning, :heredoc_id_ends_with_nl, nil, range(@ts, @ts + 1)
1788
+ delimiter = delimiter.rstrip
1789
+ else
1790
+ diagnostic :fatal, :heredoc_id_has_newline, nil, range(@ts, @ts + 1)
1791
+ end
1792
+ end
1793
+ end
1794
+
1795
+ if dedent_body && version?(18, 19, 20, 21, 22)
1796
+ emit(:tLSHFT, '<<'.freeze, @ts, @ts + 2)
1797
+ p = @ts + 1
1798
+ fnext expr_beg; fbreak;
1799
+ else
1800
+ fnext *push_literal(type, delimiter, @ts, heredoc_e, indent, dedent_body);
1801
+
1802
+ @herebody_s ||= new_herebody_s
1803
+ p = @herebody_s - 1
1804
+ end
1805
+ };
1806
+
1807
+ # Quoted but unterminated heredoc start
1808
+ # <<'END | <<"END | <<`END |
1809
+ # <<-'END | <<-"END | <<-`END |
1810
+ # <<~'END | <<~"END | <<~`END
1811
+ # Every alternative consumes the identifier up to (not including) the newline; it may be empty.
1812
+ # If the heredoc is terminated the rule above should handle it
1813
+ '<<' [~\-]?
1814
+ ('"' (any - c_nl - '"')*
1815
+ |"'" (any - c_nl - "'")*
1816
+ |"`" (any - c_nl - "`")*
1817
+ )
1818
+ => {
1819
+ diagnostic :error, :unterminated_heredoc_id, nil, range(@ts, @ts + 1) # the diagnostic is reported on the first character of the heredoc start
1820
+ };
1821
+
1822
+ #
1823
+ # SYMBOL LITERALS
1824
+ #
1825
+
1826
+ # :&&, :||
1827
+ ':' ('&&' | '||') => {
1828
+ fhold; fhold;
1829
+ emit(:tSYMBEG, tok(@ts, @ts + 1), @ts, @ts + 1)
1830
+ fgoto expr_fname;
1831
+ };
1832
+
1833
+ # :"bar", :'baz'
1834
+ ':' ['"] # '
1835
+ => {
1836
+ type, delimiter = tok, tok[-1].chr
1837
+ fgoto *push_literal(type, delimiter, @ts);
1838
+ };
1839
+
1840
+ # :!@ is :!
1841
+ # :~@ is :~
1842
+ ':' [!~] '@'
1843
+ => {
1844
+ emit(:tSYMBOL, tok(@ts + 1, @ts + 2))
1845
+ fnext expr_end; fbreak;
1846
+ };
1847
+
1848
+ ':' bareword ambiguous_symbol_suffix
1849
+ => {
1850
+ emit(:tSYMBOL, tok(@ts + 1, tm), @ts, tm)
1851
+ p = tm - 1
1852
+ fnext expr_end; fbreak;
1853
+ };
1854
+
1855
+ ':' ( bareword | global_var | class_var | instance_var |
1856
+ operator_fname | operator_arithmetic | operator_rest )
1857
+ => {
1858
+ emit(:tSYMBOL, tok(@ts + 1), @ts)
1859
+ fnext expr_end; fbreak;
1860
+ };
1861
+
1862
+ ':' ( '@' %{ tm = p - 1; diag_msg = :ivar_name }
1863
+ | '@@' %{ tm = p - 2; diag_msg = :cvar_name }
1864
+ ) [0-9]*
1865
+ => {
1866
+ if @version >= 27
1867
+ diagnostic :error, diag_msg, { name: tok(tm, @te) }, range(tm, @te)
1868
+ else
1869
+ emit(:tCOLON, tok(@ts, @ts + 1), @ts, @ts + 1)
1870
+ p = @ts
1871
+ end
1872
+
1873
+ fnext expr_end; fbreak;
1874
+ };
1875
+
1876
+ #
1877
+ # AMBIGUOUS TERNARY OPERATOR
1878
+ #
1879
+
1880
+ # Character constant, like ?a, ?\n, ?\u1000, and so on
1881
+ # Don't accept \u escape with multiple codepoints, like \u{1 2 3}
1882
+ '?' ( e_bs ( escape - ( '\u{' (xdigit+ [ \t]+)+ xdigit+ '}' ))
1883
+ | (c_any - c_space_nl - e_bs) % { @escape = nil }
1884
+ )
1885
+ => {
1886
+ value = @escape || tok(@ts + 1)
1887
+
1888
+ if version?(18)
1889
+ emit(:tINTEGER, value.getbyte(0))
1890
+ else
1891
+ emit(:tCHARACTER, value)
1892
+ end
1893
+
1894
+ fnext expr_end; fbreak;
1895
+ };
1896
+
1897
+ '?' c_space_nl
1898
+ => {
1899
+ escape = { " " => '\s', "\r" => '\r', "\n" => '\n', "\t" => '\t',
1900
+ "\v" => '\v', "\f" => '\f' }[@source_buffer.slice(@ts + 1)]
1901
+ diagnostic :warning, :invalid_escape_use, { :escape => escape }, range
1902
+
1903
+ p = @ts - 1
1904
+ fgoto expr_end;
1905
+ };
1906
+
1907
+ '?' c_eof
1908
+ => {
1909
+ diagnostic :fatal, :incomplete_escape, nil, range(@ts, @ts + 1)
1910
+ };
1911
+
1912
+ # f ?aa : b: Disambiguate with a character literal.
1913
+ '?' [A-Za-z_] bareword
1914
+ => {
1915
+ p = @ts - 1
1916
+ fgoto expr_end;
1917
+ };
1918
+
1919
+ #
1920
+ # AMBIGUOUS EMPTY BLOCK ARGUMENTS
1921
+ #
1922
+
1923
+ # Ruby >= 2.7 emits it as two tPIPE terminals
1924
+ # while Ruby < 2.7 as a single tOROP (like in `a || b`)
1925
+ '||'
1926
+ => {
1927
+ if @version >= 27
1928
+ emit(:tPIPE, tok(@ts, @ts + 1), @ts, @ts + 1)
1929
+ fhold;
1930
+ fnext expr_beg; fbreak;
1931
+ else
1932
+ p -= 2
1933
+ fgoto expr_end;
1934
+ end
1935
+ };
1936
+
1937
+ #
1938
+ # KEYWORDS AND PUNCTUATION
1939
+ #
1940
+
1941
+ # a({b=>c})
1942
+ e_lbrace
1943
+ => {
1944
+ if @lambda_stack.last == @paren_nest
1945
+ @lambda_stack.pop
1946
+ @command_start = true
1947
+ emit(:tLAMBEG, '{'.freeze)
1948
+ else
1949
+ emit(:tLBRACE, '{'.freeze)
1950
+ end
1951
+ @paren_nest += 1
1952
+ fbreak;
1953
+ };
1954
+
1955
+ # a([1, 2])
1956
+ e_lbrack
1957
+ => { emit(:tLBRACK, '['.freeze)
1958
+ fbreak; };
1959
+
1960
+ # a()
1961
+ e_lparen
1962
+ => { emit(:tLPAREN, '('.freeze)
1963
+ fbreak; };
1964
+
1965
+ # a(+b)
1966
+ punctuation_begin
1967
+ => { emit_table(PUNCTUATION_BEGIN)
1968
+ fbreak; };
1969
+
1970
+ # rescue Exception => e: Block rescue.
1971
+ # Special because it should transition to expr_mid.
1972
+ 'rescue' %{ tm = p } '=>'?
1973
+ => { emit(:kRESCUE, 'rescue'.freeze, @ts, tm)
1974
+ p = tm - 1
1975
+ fnext expr_mid; fbreak; };
1976
+
1977
+ # if a: Statement if.
1978
+ keyword_modifier
1979
+ => { emit_table(KEYWORDS_BEGIN)
1980
+ @command_start = true
1981
+ fnext expr_value; fbreak; };
1982
+
1983
+ #
1984
+ # RUBY 1.9 HASH LABELS
1985
+ #
1986
+
1987
+ label ( any - ':' )
1988
+ => {
1989
+ fhold;
1990
+
1991
+ if version?(18)
1992
+ ident = tok(@ts, @te - 2)
1993
+
1994
+ emit((@source_buffer.slice(@ts) =~ /[A-Z]/) ? :tCONSTANT : :tIDENTIFIER,
1995
+ ident, @ts, @te - 2)
1996
+ fhold; # continue as a symbol
1997
+
1998
+ if !@static_env.nil? && @static_env.declared?(ident)
1999
+ fnext expr_end;
2000
+ else
2001
+ fnext *arg_or_cmdarg(cmd_state);
2002
+ end
2003
+ else
2004
+ emit(:tLABEL, tok(@ts, @te - 2), @ts, @te - 1)
2005
+ fnext expr_labelarg;
2006
+ end
2007
+
2008
+ fbreak;
2009
+ };
2010
+
2011
+ #
2012
+ # RUBY 2.7 BEGINLESS RANGE
2013
+
2014
+ '..'
2015
+ => {
2016
+ if @version >= 27
2017
+ emit(:tBDOT2)
2018
+ else
2019
+ emit(:tDOT2)
2020
+ end
2021
+
2022
+ fnext expr_beg; fbreak;
2023
+ };
2024
+
2025
+ '...'
2026
+ => {
2027
+ if @version >= 27
2028
+ emit(:tBDOT3)
2029
+ else
2030
+ emit(:tDOT3)
2031
+ end
2032
+
2033
+ fnext expr_beg; fbreak;
2034
+ };
2035
+
2036
+ #
2037
+ # CONTEXT-DEPENDENT VARIABLE LOOKUP OR COMMAND INVOCATION
2038
+ #
2039
+
2040
+ # foo= bar: Disambiguate with bareword rule below.
2041
+ bareword ambiguous_ident_suffix |
2042
+ # def foo: Disambiguate with bareword rule below.
2043
+ keyword
2044
+ => { p = @ts - 1
2045
+ fgoto expr_end; };
2046
+
2047
+ # a = 42; a [42]: Indexing.
2048
+ # def a; end; a [42]: Array argument.
2049
+ call_or_var
2050
+ => local_ident;
2051
+
2052
+ (call_or_var - keyword)
2053
+ % { ident_tok = tok; ident_ts = @ts; ident_te = @te; }
2054
+ w_space+ '('
2055
+ => {
2056
+ emit(:tIDENTIFIER, ident_tok, ident_ts, ident_te)
2057
+ p = ident_te - 1
2058
+
2059
+ if !@static_env.nil? && @static_env.declared?(ident_tok) && @version < 25
2060
+ fnext expr_endfn;
2061
+ else
2062
+ fnext expr_cmdarg;
2063
+ end
2064
+ fbreak;
2065
+ };
2066
+
2067
+ #
2068
+ # WHITESPACE
2069
+ #
2070
+
2071
+ w_any;
2072
+
2073
+ e_heredoc_nl '=begin' ( c_space | c_nl_zlen )
2074
+ => {
2075
+ p = @ts - 1
2076
+ @cs_before_block_comment = @cs
2077
+ fgoto line_begin;
2078
+ };
2079
+
2080
+ #
2081
+ # DEFAULT TRANSITION
2082
+ #
2083
+
2084
+ # The following rules match most binary and all unary operators.
2085
+ # Rules for binary operators provide better error reporting.
2086
+ operator_arithmetic '=' |
2087
+ operator_rest |
2088
+ punctuation_end |
2089
+ c_any
2090
+ => { p = @ts - 1; fgoto expr_end; };
2091
+
2092
+ c_eof => do_eof;
2093
+ *|;
2094
+
2095
+ # Special newline handling for "def a b:"
2096
+ # Entered after a tLABEL is emitted, see the label rules in expr_endfn and expr_beg.
2097
+ expr_labelarg := |*
2098
+ w_space_comment; # spaces and comments are skipped
2099
+
2100
+ w_newline
2101
+ => {
2102
+ if @in_kwarg # @in_kwarg set: give the newline back and let expr_end lex it
2103
+ fhold; fgoto expr_end;
2104
+ else # otherwise the newline is consumed and lexing continues at line_begin
2105
+ fgoto line_begin;
2106
+ end
2107
+ };
2108
+
2109
+ c_any # anything else is re-lexed in expr_beg
2110
+ => { fhold; fgoto expr_beg; };
2111
+
2112
+ c_eof => do_eof;
2113
+ *|;
2114
+
2115
+ # Like expr_beg, but no 1.9 label or 2.2 quoted label possible.
2116
+ #
2117
+ expr_value := |*
2118
+ # a:b: a(:b), a::B, A::B
2119
+ label (any - ':')
2120
+ => { p = @ts - 1
2121
+ fgoto expr_end; };
2122
+
2123
+ # "bar", 'baz'
2124
+ ['"] # '
2125
+ => {
2126
+ fgoto *push_literal(tok, tok, @ts);
2127
+ };
2128
+
2129
+ w_space_comment;
2130
+
2131
+ w_newline
2132
+ => { fgoto line_begin; };
2133
+
2134
+ c_any
2135
+ => { fhold; fgoto expr_beg; };
2136
+
2137
+ c_eof => do_eof;
2138
+ *|;
2139
+
2140
+ expr_end := |*
2141
+ #
2142
+ # STABBY LAMBDA
2143
+ # The arrow pushes @paren_nest onto @lambda_stack; a brace or do seen later at that same depth is emitted as tLAMBEG / kDO_LAMBDA.
2144
+
2145
+ '->'
2146
+ => {
2147
+ emit(:tLAMBDA, '->'.freeze, @ts, @ts + 2)
2148
+
2149
+ @lambda_stack.push @paren_nest
2150
+ fnext expr_endfn; fbreak;
2151
+ };
2152
+
2153
+ e_lbrace | 'do'
2154
+ => {
2155
+ if @lambda_stack.last == @paren_nest
2156
+ @lambda_stack.pop
2157
+
2158
+ if tok == '{'.freeze
2159
+ emit(:tLAMBEG, '{'.freeze)
2160
+ else # 'do'
2161
+ emit(:kDO_LAMBDA, 'do'.freeze)
2162
+ end
2163
+ else
2164
+ if tok == '{'.freeze
2165
+ emit(:tLCURLY, '{'.freeze)
2166
+ else # 'do'
2167
+ emit_do
2168
+ end
2169
+ end
2170
+ if tok == '{'.freeze
2171
+ @paren_nest += 1
2172
+ end
2173
+ @command_start = true
2174
+
2175
+ fnext expr_value; fbreak;
2176
+ };
2177
+
2178
+ #
2179
+ # KEYWORDS
2180
+ # The keyword_* groups emit from the KEYWORDS table and differ mainly in the state that lexing continues in.
2181
+
2182
+ keyword_with_fname
2183
+ => { emit_table(KEYWORDS)
2184
+ fnext expr_fname; fbreak; };
2185
+
2186
+ 'class' w_any* '<<' # one action emits two tokens here: kCLASS for the keyword and tLSHFT for the last two characters
2187
+ => { emit(:kCLASS, 'class'.freeze, @ts, @ts + 5)
2188
+ emit(:tLSHFT, '<<'.freeze, @te - 2, @te)
2189
+ fnext expr_value; fbreak; };
2190
+
2191
+ # a if b:c => syntax error (after a modifier keyword lexing continues in expr_beg).
2192
+ keyword_modifier
2193
+ => { emit_table(KEYWORDS)
2194
+ fnext expr_beg; fbreak; };
2195
+
2196
+ # elsif b:c => elsif b(:c) (lexing continues in expr_value, where labels are not possible).
2197
+ keyword_with_value
2198
+ => { emit_table(KEYWORDS)
2199
+ @command_start = true
2200
+ fnext expr_value; fbreak; };
2201
+
2202
+ keyword_with_mid
2203
+ => { emit_table(KEYWORDS)
2204
+ fnext expr_mid; fbreak; };
2205
+
2206
+ keyword_with_arg
2207
+ => {
2208
+ emit_table(KEYWORDS)
2209
+
2210
+ if version?(18) && tok == 'not'.freeze
2211
+ fnext expr_beg; fbreak;
2212
+ else
2213
+ fnext expr_arg; fbreak;
2214
+ end
2215
+ };
2216
+
2217
+ '__ENCODING__' # a plain identifier on Ruby 1.8, a keyword on later versions
2218
+ => {
2219
+ if version?(18)
2220
+ emit(:tIDENTIFIER)
2221
+
2222
+ unless !@static_env.nil? && @static_env.declared?(tok)
2223
+ fnext *arg_or_cmdarg(cmd_state);
2224
+ end
2225
+ else
2226
+ emit(:k__ENCODING__, '__ENCODING__'.freeze)
2227
+ end
2228
+ fbreak;
2229
+ };
2230
+
2231
+ keyword_with_end
2232
+ => { emit_table(KEYWORDS)
2233
+ fbreak; };
2234
+
2235
+ #
2236
+ # NUMERIC LITERALS
2237
+ # @num_digits_s marks where the digits start (after a base prefix, if any); @num_suffix_s marks where the suffix starts.
2238
+
2239
+ ( '0' [Xx] %{ @num_base = 16; @num_digits_s = p } int_hex
2240
+ | '0' [Dd] %{ @num_base = 10; @num_digits_s = p } int_dec
2241
+ | '0' [Oo] %{ @num_base = 8; @num_digits_s = p } int_dec # base 8 is matched with int_dec; the action reports any 8 or 9 as :invalid_octal
2242
+ | '0' [Bb] %{ @num_base = 2; @num_digits_s = p } int_bin
2243
+ | [1-9] digit* '_'? %{ @num_base = 10; @num_digits_s = @ts } int_dec
2244
+ | '0' digit* '_'? %{ @num_base = 8; @num_digits_s = @ts } int_dec
2245
+ ) %{ @num_suffix_s = p } int_suffix
2246
+ => {
2247
+ digits = tok(@num_digits_s, @num_suffix_s)
2248
+
2249
+ if digits.end_with? '_'.freeze
2250
+ diagnostic :error, :trailing_in_number, { :character => '_'.freeze },
2251
+ range(@te - 1, @te)
2252
+ elsif digits.empty? && @num_base == 8 && version?(18)
2253
+ # Ruby 1.8 did not raise an error on a bare 0o prefix; lex it as zero.
2254
+ digits = '0'.freeze
2255
+ elsif digits.empty?
2256
+ diagnostic :error, :empty_numeric
2257
+ elsif @num_base == 8 && (invalid_idx = digits.index(/[89]/))
2258
+ invalid_s = @num_digits_s + invalid_idx
2259
+ diagnostic :error, :invalid_octal, nil,
2260
+ range(invalid_s, invalid_s + 1)
2261
+ end
2262
+
2263
+ if version?(18, 19, 20)
2264
+ emit(:tINTEGER, digits.to_i(@num_base), @ts, @num_suffix_s)
2265
+ p = @num_suffix_s - 1 # versions 1.8 to 2.0: rewind so that any suffix characters are lexed again as separate tokens
2266
+ else
2267
+ @num_xfrm.call(digits.to_i(@num_base)) # @num_xfrm is set outside this chunk; presumably it emits the token for the matched suffix
2268
+ end
2269
+ fbreak;
2270
+ };
2271
+
2272
+ flo_frac flo_pow? # presumably a fraction with no integer part, e.g. .5 -- always reported as an error
2273
+ => {
2274
+ diagnostic :error, :no_dot_digit_literal
2275
+ };
2276
+
2277
+ flo_int [eE] # an exponent marker with no exponent digits after it
2278
+ => {
2279
+ if version?(18, 19, 20)
2280
+ diagnostic :error,
2281
+ :trailing_in_number, { :character => tok(@te - 1, @te) },
2282
+ range(@te - 1, @te)
2283
+ else
2284
+ emit(:tINTEGER, tok(@ts, @te - 1).to_i, @ts, @te - 1)
2285
+ fhold; fbreak;
2286
+ end
2287
+ };
2288
+
2289
+ flo_int flo_frac [eE] # same as the rule above, but for a number that has a fractional part
2290
+ => {
2291
+ if version?(18, 19, 20)
2292
+ diagnostic :error,
2293
+ :trailing_in_number, { :character => tok(@te - 1, @te) },
2294
+ range(@te - 1, @te)
2295
+ else
2296
+ emit(:tFLOAT, tok(@ts, @te - 1).to_f, @ts, @te - 1)
2297
+ fhold; fbreak;
2298
+ end
2299
+ };
2300
+
2301
+ flo_int
2302
+ ( flo_frac? flo_pow %{ @num_suffix_s = p } flo_pow_suffix
2303
+ | flo_frac %{ @num_suffix_s = p } flo_suffix
2304
+ )
2305
+ => {
2306
+ digits = tok(@ts, @num_suffix_s)
2307
+
2308
+ if version?(18, 19, 20)
2309
+ emit(:tFLOAT, Float(digits), @ts, @num_suffix_s)
2310
+ p = @num_suffix_s - 1 # versions 1.8 to 2.0: rewind so that any suffix characters are lexed again as separate tokens
2311
+ else
2312
+ @num_xfrm.call(digits)
2313
+ end
2314
+ fbreak;
2315
+ };
2316
+
2317
+ #
2318
+ # STRING AND XSTRING LITERALS
2319
+ #
2320
+
2321
+ # `echo foo`, "bar", 'baz'
2322
+ '`' | ['"] # '
2323
+ => {
2324
+ type, delimiter = tok, tok[-1].chr
2325
+ fgoto *push_literal(type, delimiter, @ts, nil, false, false, true);
2326
+ };
2327
+
2328
+ #
2329
+ # CONSTANTS AND VARIABLES
2330
+ #
2331
+
2332
+ constant
2333
+ => { emit(:tCONSTANT)
2334
+ fnext *arg_or_cmdarg(cmd_state); fbreak; };
2335
+
2336
+ constant ambiguous_const_suffix
2337
+ => { emit(:tCONSTANT, tok(@ts, tm), @ts, tm)
2338
+ p = tm - 1; fbreak; }; # tm is set outside this chunk (presumably by ambiguous_const_suffix); only the text up to tm is consumed
2339
+
2340
+ global_var | class_var_v | instance_var_v
2341
+ => { p = @ts - 1; fcall expr_variable; }; # rewind to the token start and lex the variable in expr_variable
2342
+
2343
+ #
2344
+ # METHOD CALLS
2345
+ #
2346
+
2347
+ '.:' w_space+ # dot-colon followed by whitespace is emitted as two tokens, tDOT then tCOLON
2348
+ => { emit(:tDOT, '.', @ts, @ts + 1)
2349
+ emit(:tCOLON, ':', @ts + 1, @ts + 2)
2350
+ p = p - tok.length + 2 # NOTE(review): appears to rewind past the matched whitespace so that it is scanned again -- confirm
2351
+ fnext expr_dot; fbreak; };
2352
+
2353
+ '.:'
2354
+ => {
2355
+ if @version >= 27 # 2.7 and later: emit the two-character token from PUNCTUATION; earlier: emit only the dot and give the colon back
2356
+ emit_table(PUNCTUATION)
2357
+ else
2358
+ emit(:tDOT, tok(@ts, @ts + 1), @ts, @ts + 1)
2359
+ fhold;
2360
+ end
2361
+
2362
+ fnext expr_dot; fbreak;
2363
+ };
2364
+
2365
+ '.' | '&.' | '::'
2366
+ => { emit_table(PUNCTUATION)
2367
+ fnext expr_dot; fbreak; };
2368
+
2369
+ call_or_var
2370
+ => local_ident;
2371
+
2372
+ bareword ambiguous_fid_suffix
2373
+ => {
2374
+ if tm == @te
2375
+ # Suffix was consumed, e.g. foo!
2376
+ emit(:tFID)
2377
+ else
2378
+ # Suffix was not consumed, e.g. foo!=
2379
+ emit(:tIDENTIFIER, tok(@ts, tm), @ts, tm)
2380
+ p = tm - 1
2381
+ end
2382
+ fnext expr_arg; fbreak;
2383
+ };
2384
+
2385
+ #
2386
+ # OPERATORS
2387
+ #
2388
+
2389
+ '*' | '=>'
2390
+ => {
2391
+ emit_table(PUNCTUATION)
2392
+ fgoto expr_value; # NOTE(review): this jumps straight on after emitting, whereas the sibling rules set the next state and break out -- confirm this is intended
2393
+ };
2394
+
2395
+ # When '|', '~', '!', '=>' are used as operators
2396
+ # they do not accept any symbols (or quoted labels) after.
2397
+ # Other binary operators accept them. NOTE(review): the exclusion list below names '*' where this comment names '=>'; both are handled by the rule above -- confirm the comment is still accurate.
2398
+ ( operator_arithmetic | operator_rest ) - ( '|' | '~' | '!' | '*' )
2399
+ => {
2400
+ emit_table(PUNCTUATION);
2401
+ fnext expr_value; fbreak;
2402
+ };
2403
+
2404
+ ( e_lparen | '|' | '~' | '!' )
2405
+ => { emit_table(PUNCTUATION)
2406
+ fnext expr_beg; fbreak; };
2407
+
2408
+ e_rbrace | e_rparen | e_rbrack
2409
+ => {
2410
+ emit_table(PUNCTUATION)
2411
+
2412
+ if @version < 24
2413
+ @cond.lexpop
2414
+ @cmdarg.lexpop
2415
+ else
2416
+ @cond.pop
2417
+ @cmdarg.pop
2418
+ end
2419
+
2420
+ if tok == '}'.freeze || tok == ']'.freeze
2421
+ if @version >= 25
2422
+ fnext expr_end;
2423
+ else
2424
+ fnext expr_endarg;
2425
+ end
2426
+ else # )
2427
+ # fnext expr_endfn; ? (open question left in the original: after a closing paren no next state is set, so the lexer stays in the current one)
2428
+ end
2429
+
2430
+ fbreak;
2431
+ };
2432
+
2433
+ operator_arithmetic '='
2434
+ => { emit(:tOP_ASGN, tok(@ts, @te - 1)) # the token value is the operator without its trailing =
2435
+ fnext expr_beg; fbreak; };
2436
+
2437
+ '?'
2438
+ => { emit(:tEH, '?'.freeze)
2439
+ fnext expr_value; fbreak; };
2440
+
2441
+ e_lbrack
2442
+ => { emit(:tLBRACK2, '['.freeze)
2443
+ fnext expr_beg; fbreak; };
2444
+
2445
+ '...' c_nl # three dots directly before a newline; the warning is only issued outside of any parentheses
2446
+ => {
2447
+ if @paren_nest == 0
2448
+ diagnostic :warning, :triple_dot_at_eol, nil, range(@ts, @te - 1)
2449
+ end
2450
+
2451
+ emit(:tDOT3, '...'.freeze, @ts, @te - 1)
2452
+ fhold;
2453
+ fnext expr_beg; fbreak;
2454
+ };
2455
+
2456
+ punctuation_end
2457
+ => { emit_table(PUNCTUATION)
2458
+ fnext expr_beg; fbreak; };
2459
+
2460
+ #
2461
+ # WHITESPACE
2462
+ #
2463
+
2464
+ w_space_comment;
2465
+
2466
+ w_newline
2467
+ => { fgoto leading_dot; }; # no tNL yet: leading_dot decides whether the next line continues this expression
2468
+
2469
+ ';'
2470
+ => { emit(:tSEMI, ';'.freeze)
2471
+ @command_start = true
2472
+ fnext expr_value; fbreak; };
2473
+
2474
+ '\\' c_line {
2475
+ diagnostic :error, :bare_backslash, nil, range(@ts, @ts + 1)
2476
+ fhold;
2477
+ };
2478
+
2479
+ c_any
2480
+ => {
2481
+ diagnostic :fatal, :unexpected, { :character => tok.inspect[1..-2] } # inspect escapes the character; [1..-2] drops the surrounding quotes it adds
2482
+ };
2483
+
2484
+ c_eof => do_eof;
2485
+ *|;
2486
+
2487
+ leading_dot := |*
2488
+ # Entered from expr_end after a newline. Handles insane leading dots:
2489
+ # a #comment
2490
+ # # post-2.7 comment
2491
+ # .b => lexed as a.b
2492
+
2493
+ # '\n' is used here instead of w_newline so that @newline_s is not modified
2494
+ # and the tNL emitted by the rules below still uses the recorded newline position.
2495
+ (c_space* w_space_comment '\n')+
2496
+ => {
2497
+ if @version < 27
2498
+ # Ruby before 2.7 doesn't support comments before leading dot.
2499
+ # If a line after "a" starts with a comment then "a" is a self-contained statement.
2500
+ # So in that case we emit a special tNL token and start reading the
2501
+ # next line as a separate statement.
2502
+ #
2503
+ # Note: block comments before leading dot are not supported on any version of Ruby.
2504
+ emit(:tNL, nil, @newline_s, @newline_s + 1)
2505
+ fhold; fnext line_begin; fbreak;
2506
+ end # 2.7 and later: nothing is emitted, the comment lines are skipped and scanning continues in this state
2507
+ };
2508
+
2509
+ c_space* %{ tm = p } ('.' | '&.') # tm marks where the dot operator starts, after any leading spaces
2510
+ => { p = tm - 1; fgoto expr_end; }; # rewind to the operator and lex it in expr_end; no tNL is emitted, so the expression continues
2511
+
2512
+ any # anything else: the newline did end the statement, so emit the deferred tNL and restart at line_begin
2513
+ => { emit(:tNL, nil, @newline_s, @newline_s + 1)
2514
+ fhold; fnext line_begin; fbreak; };
2515
+ *|;
2516
+
2517
+ #
2518
+ # === EMBEDDED DOCUMENT (aka BLOCK COMMENT) PARSING ===
2519
+ # line_comment scans the body of a block comment found by line_begin; @eq_begin_s is the position where =begin started.
2520
+
2521
+ line_comment := |*
2522
+ '=end' c_line* c_nl_zlen # closing line: emit the whole block as one comment, then resume in the state held in @cs_before_block_comment
2523
+ => {
2524
+ emit_comment(@eq_begin_s, @te)
2525
+ fgoto *@cs_before_block_comment;
2526
+ };
2527
+
2528
+ c_line* c_nl; # any other complete line belongs to the comment body and produces no token
2529
+
2530
+ c_line* zlen # presumably reached when the input ends before =end is seen; reported as fatal at the =begin marker
2531
+ => {
2532
+ diagnostic :fatal, :embedded_document, nil,
2533
+ range(@eq_begin_s, @eq_begin_s + '=begin'.length)
2534
+ };
2535
+ *|;
2536
+
2537
+ line_begin := |* # scanner for the start of a line; entered after newlines from expr_value and leading_dot
2538
+ w_any; # matched without an action, so no token is emitted (presumably whitespace)
2539
+
2540
+ '=begin' ( c_space | c_nl_zlen ) # start of a block comment
2541
+ => { @eq_begin_s = @ts # remembered for emit_comment and for the unterminated-comment diagnostic in line_comment
2542
+ fgoto line_comment; };
2543
+
2544
+ '__END__' ( c_eol - zlen )
2545
+ => { p = pe - 3 }; # NOTE(review): moves p close to pe, presumably so that the input after __END__ is not lexed -- confirm the -3 offset
2546
+
2547
+ c_any
2548
+ => { cmd_state = true; fhold; fgoto expr_value; }; # first other character of a line: set cmd_state, put it back and lex it in expr_value
2549
+
2550
+ c_eof => do_eof;
2551
+ *|;
2552
+
2553
+ }%%
2554
+ # %
2555
+ end
2556
+ end