ruby-next-parser 2.8.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2556 @@
1
+ %%machine lex; # % fix highlighting
2
+
3
+ #
4
+ # === BEFORE YOU START ===
5
+ #
6
+ # Read the Ruby Hacking Guide chapter 11, available in English at
7
+ # http://whitequark.org/blog/2013/04/01/ruby-hacking-guide-ch-11-finite-state-lexer/
8
+ #
9
+ # Remember two things about Ragel scanners:
10
+ #
11
+ # 1) Longest match wins.
12
+ #
13
+ # 2) If two matches have the same length, the first
14
+ # in source code wins.
15
+ #
16
+ # General rules of making Ragel and Bison happy:
17
+ #
18
+ # * `p` (position) and `@te` contain the index of the character
19
+ # they're pointing to ("current"), plus one. `@ts` contains the index
20
+ # of the corresponding character. The code for extracting matched token is:
21
+ #
22
+ # @source_buffer.slice(@ts...@te)
23
+ #
24
+ # * If your input is `foooooooobar` and the rule is:
25
+ #
26
+ # 'f' 'o'+
27
+ #
28
+ # the result will be:
29
+ #
30
+ # foooooooobar
31
+ # ^ ts=0 ^ p=te=9
32
+ #
33
+ # * A Ragel lexer action should not emit more than one token, unless
34
+ # you know what you are doing.
35
+ #
36
+ # * All Ragel commands (fnext, fgoto, ...) end with a semicolon.
37
+ #
38
+ # * If an action emits the token and transitions to another state, use
39
+ # these Ragel commands:
40
+ #
41
+ # emit($whatever)
42
+ # fnext $next_state; fbreak;
43
+ #
44
+ # If you perform `fgoto` in an action which neither emits a token nor
45
+ # rewinds the stream pointer, the parser's side-effectful,
46
+ # context-sensitive lookahead actions will break in a hard to detect
47
+ # and debug way.
48
+ #
49
+ # * If an action does not emit a token:
50
+ #
51
+ # fgoto $next_state;
52
+ #
53
+ # * If an action features lookbehind, i.e. matches characters with the
54
+ # intent of passing them to another action:
55
+ #
56
+ # p = @ts - 1
57
+ # fgoto $next_state;
58
+ #
59
+ # or, if the lookbehind consists of a single character:
60
+ #
61
+ # fhold; fgoto $next_state;
62
+ #
63
+ # * Ragel merges actions. So, if you have `e_lparen = '(' %act` and
64
+ # `c_lparen = '('` and a lexer action `e_lparen | c_lparen`, the result
65
+ # _will_ invoke the action `act`.
66
+ #
67
+ # e_something stands for "something with **e**mbedded action".
68
+ #
69
+ # * EOF is explicit and is matched by `c_eof`. If you want to introspect
70
+ # the state of the lexer, add this rule to the state:
71
+ #
72
+ # c_eof => do_eof;
73
+ #
74
+ # * If you proceed past EOF, the lexer will complain:
75
+ #
76
+ # NoMethodError: undefined method `ord' for nil:NilClass
77
+ #
78
+
79
+ class Parser::Lexer
80
+ class Next
81
+
82
+ %% write data nofinal;
83
+ # %
84
+
85
+ ESCAPES = {
86
+ ?a.ord => "\a", ?b.ord => "\b", ?e.ord => "\e", ?f.ord => "\f",
87
+ ?n.ord => "\n", ?r.ord => "\r", ?s.ord => "\s", ?t.ord => "\t",
88
+ ?v.ord => "\v", ?\\.ord => "\\"
89
+ }.freeze
90
+
91
+ REGEXP_META_CHARACTERS = Regexp.union(*"\\$()*+.<>?[]^{|}".chars).freeze
92
+
93
+ attr_reader :source_buffer
94
+
95
+ attr_accessor :diagnostics
96
+ attr_accessor :static_env
97
+ attr_accessor :force_utf32
98
+
99
+ attr_accessor :cond, :cmdarg, :in_kwarg, :context, :command_start
100
+
101
+ attr_accessor :tokens, :comments
102
+
103
+ def initialize(version)
104
+ @version = version
105
+ @static_env = nil
106
+ @context = nil
107
+
108
+ @tokens = nil
109
+ @comments = nil
110
+
111
+ reset
112
+ end
113
+
114
+ def reset(reset_state=true)
115
+ # Ragel state:
116
+ if reset_state
117
+ # Unit tests set state prior to resetting lexer.
118
+ @cs = self.class.lex_en_line_begin
119
+
120
+ @cond = StackState.new('cond')
121
+ @cmdarg = StackState.new('cmdarg')
122
+ @cond_stack = []
123
+ @cmdarg_stack = []
124
+ end
125
+
126
+ @force_utf32 = false # Set to true by some tests
127
+
128
+ @source_pts = nil # @source as a codepoint array
129
+
130
+ @p = 0 # stream position (saved manually in #advance)
131
+ @ts = nil # token start
132
+ @te = nil # token end
133
+ @act = 0 # next action
134
+
135
+ @stack = [] # state stack
136
+ @top = 0 # state stack top pointer
137
+
138
+ # Lexer state:
139
+ @token_queue = []
140
+ @literal_stack = []
141
+
142
+ @eq_begin_s = nil # location of last encountered =begin
143
+ @sharp_s = nil # location of last encountered #
144
+
145
+ @newline_s = nil # location of last encountered newline
146
+
147
+ @num_base = nil # last numeric base
148
+ @num_digits_s = nil # starting position of numeric digits
149
+ @num_suffix_s = nil # starting position of numeric suffix
150
+ @num_xfrm = nil # numeric suffix-induced transformation
151
+
152
+ @escape_s = nil # starting position of current sequence
153
+ @escape = nil # last escaped sequence, as string
154
+
155
+ @herebody_s = nil # starting position of current heredoc line
156
+
157
+ # Ruby 1.9 ->() lambdas emit a distinct token if do/{ is
158
+ # encountered after a matching closing parenthesis.
159
+ @paren_nest = 0
160
+ @lambda_stack = []
161
+
162
+ # After encountering the closing line of <<~SQUIGGLY_HEREDOC,
163
+ # we store the indentation level and give it out to the parser
164
+ # on request. It is not possible to infer indentation level just
165
+ # from the AST because escape sequences such as `\ ` or `\t` are
166
+ # expanded inside the lexer, but count as non-whitespace for
167
+ # indentation purposes.
168
+ @dedent_level = nil
169
+
170
+ # If the lexer is in `command state' (aka expr_value)
171
+ # at the entry to #advance, it will transition to expr_cmdarg
172
+ # instead of expr_arg at certain points.
173
+ @command_start = true
174
+
175
+ # True at the end of "def foo a:"
176
+ @in_kwarg = false
177
+
178
+ # State before =begin / =end block comment
179
+ @cs_before_block_comment = self.class.lex_en_line_begin
180
+ end
181
+
182
+ def source_buffer=(source_buffer)
183
+ @source_buffer = source_buffer
184
+
185
+ if @source_buffer
186
+ source = @source_buffer.source
187
+
188
+ if source.encoding == Encoding::UTF_8
189
+ @source_pts = source.unpack('U*')
190
+ else
191
+ @source_pts = source.unpack('C*')
192
+ end
193
+
194
+ if @source_pts[0] == 0xfeff
195
+ # Skip byte order mark.
196
+ @p = 1
197
+ end
198
+ else
199
+ @source_pts = nil
200
+ end
201
+ end
202
+
203
+ def encoding
204
+ @source_buffer.source.encoding
205
+ end
206
+
207
+ LEX_STATES = {
208
+ :line_begin => lex_en_line_begin,
209
+ :expr_dot => lex_en_expr_dot,
210
+ :expr_fname => lex_en_expr_fname,
211
+ :expr_value => lex_en_expr_value,
212
+ :expr_beg => lex_en_expr_beg,
213
+ :expr_mid => lex_en_expr_mid,
214
+ :expr_arg => lex_en_expr_arg,
215
+ :expr_cmdarg => lex_en_expr_cmdarg,
216
+ :expr_end => lex_en_expr_end,
217
+ :expr_endarg => lex_en_expr_endarg,
218
+ :expr_endfn => lex_en_expr_endfn,
219
+ :expr_labelarg => lex_en_expr_labelarg,
220
+
221
+ :interp_string => lex_en_interp_string,
222
+ :interp_words => lex_en_interp_words,
223
+ :plain_string => lex_en_plain_string,
224
+ :plain_words => lex_en_plain_string,
225
+ }
226
+
227
+ def state
228
+ LEX_STATES.invert.fetch(@cs, @cs)
229
+ end
230
+
231
+ def state=(state)
232
+ @cs = LEX_STATES.fetch(state)
233
+ end
234
+
235
+ def push_cmdarg
236
+ @cmdarg_stack.push(@cmdarg)
237
+ @cmdarg = StackState.new("cmdarg.#{@cmdarg_stack.count}")
238
+ end
239
+
240
+ def pop_cmdarg
241
+ @cmdarg = @cmdarg_stack.pop
242
+ end
243
+
244
+ def push_cond
245
+ @cond_stack.push(@cond)
246
+ @cond = StackState.new("cond.#{@cond_stack.count}")
247
+ end
248
+
249
+ def pop_cond
250
+ @cond = @cond_stack.pop
251
+ end
252
+
253
+ def dedent_level
254
+ # We erase @dedent_level as a precaution to avoid accidentally
255
+ # using a stale value.
256
+ dedent_level, @dedent_level = @dedent_level, nil
257
+ dedent_level
258
+ end
259
+
260
+ # Return next token: [type, value].
261
+ def advance
262
+ if @token_queue.any?
263
+ return @token_queue.shift
264
+ end
265
+
266
+ # Ugly, but dependent on Ragel output. Consider refactoring it somehow.
267
+ klass = self.class
268
+ _lex_trans_keys = klass.send :_lex_trans_keys
269
+ _lex_key_spans = klass.send :_lex_key_spans
270
+ _lex_index_offsets = klass.send :_lex_index_offsets
271
+ _lex_indicies = klass.send :_lex_indicies
272
+ _lex_trans_targs = klass.send :_lex_trans_targs
273
+ _lex_trans_actions = klass.send :_lex_trans_actions
274
+ _lex_to_state_actions = klass.send :_lex_to_state_actions
275
+ _lex_from_state_actions = klass.send :_lex_from_state_actions
276
+ _lex_eof_trans = klass.send :_lex_eof_trans
277
+
278
+ pe = @source_pts.size + 2
279
+ p, eof = @p, pe
280
+
281
+ cmd_state = @command_start
282
+ @command_start = false
283
+
284
+ %% write exec;
285
+ # %
286
+
287
+ @p = p
288
+
289
+ if @token_queue.any?
290
+ @token_queue.shift
291
+ elsif @cs == klass.lex_error
292
+ [ false, [ '$error'.freeze, range(p - 1, p) ] ]
293
+ else
294
+ eof = @source_pts.size
295
+ [ false, [ '$eof'.freeze, range(eof, eof) ] ]
296
+ end
297
+ end
298
+
299
+ protected
300
+
301
+ def eof_codepoint?(point)
302
+ [0x04, 0x1a, 0x00].include? point
303
+ end
304
+
305
+ def version?(*versions)
306
+ versions.include?(@version)
307
+ end
308
+
309
+ def stack_pop
310
+ @top -= 1
311
+ @stack[@top]
312
+ end
313
+
314
+ def encode_escape(ord)
315
+ ord.chr.force_encoding(@source_buffer.source.encoding)
316
+ end
317
+
318
+ def tok(s = @ts, e = @te)
319
+ @source_buffer.slice(s...e)
320
+ end
321
+
322
+ def range(s = @ts, e = @te)
323
+ Parser::Source::Range.new(@source_buffer, s, e)
324
+ end
325
+
326
+ def emit(type, value = tok, s = @ts, e = @te)
327
+ token = [ type, [ value, range(s, e) ] ]
328
+
329
+ @token_queue.push(token)
330
+
331
+ @tokens.push(token) if @tokens
332
+
333
+ token
334
+ end
335
+
336
+ def emit_table(table, s = @ts, e = @te)
337
+ value = tok(s, e)
338
+
339
+ emit(table[value], value, s, e)
340
+ end
341
+
342
+ def emit_do(do_block=false)
343
+ if @cond.active?
344
+ emit(:kDO_COND, 'do'.freeze)
345
+ elsif @cmdarg.active? || do_block
346
+ emit(:kDO_BLOCK, 'do'.freeze)
347
+ else
348
+ emit(:kDO, 'do'.freeze)
349
+ end
350
+ end
351
+
352
+ def arg_or_cmdarg(cmd_state)
353
+ if cmd_state
354
+ self.class.lex_en_expr_cmdarg
355
+ else
356
+ self.class.lex_en_expr_arg
357
+ end
358
+ end
359
+
360
+ def emit_comment(s = @ts, e = @te)
361
+ if @comments
362
+ @comments.push(Parser::Source::Comment.new(range(s, e)))
363
+ end
364
+
365
+ if @tokens
366
+ @tokens.push([ :tCOMMENT, [ tok(s, e), range(s, e) ] ])
367
+ end
368
+
369
+ nil
370
+ end
371
+
372
+ def diagnostic(type, reason, arguments=nil, location=range, highlights=[])
373
+ @diagnostics.process(
374
+ Parser::Diagnostic.new(type, reason, arguments, location, highlights))
375
+ end
376
+
377
+ #
378
+ # === LITERAL STACK ===
379
+ #
380
+
381
+ def push_literal(*args)
382
+ new_literal = Literal.new(self, *args)
383
+ @literal_stack.push(new_literal)
384
+ next_state_for_literal(new_literal)
385
+ end
386
+
387
+ def next_state_for_literal(literal)
388
+ if literal.words? && literal.backslash_delimited?
389
+ if literal.interpolate?
390
+ self.class.lex_en_interp_backslash_delimited_words
391
+ else
392
+ self.class.lex_en_plain_backslash_delimited_words
393
+ end
394
+ elsif literal.words? && !literal.backslash_delimited?
395
+ if literal.interpolate?
396
+ self.class.lex_en_interp_words
397
+ else
398
+ self.class.lex_en_plain_words
399
+ end
400
+ elsif !literal.words? && literal.backslash_delimited?
401
+ if literal.interpolate?
402
+ self.class.lex_en_interp_backslash_delimited
403
+ else
404
+ self.class.lex_en_plain_backslash_delimited
405
+ end
406
+ else
407
+ if literal.interpolate?
408
+ self.class.lex_en_interp_string
409
+ else
410
+ self.class.lex_en_plain_string
411
+ end
412
+ end
413
+ end
414
+
415
+ def literal
416
+ @literal_stack.last
417
+ end
418
+
419
+ def pop_literal
420
+ old_literal = @literal_stack.pop
421
+
422
+ @dedent_level = old_literal.dedent_level
423
+
424
+ if old_literal.type == :tREGEXP_BEG
425
+ # Fetch modifiers.
426
+ self.class.lex_en_regexp_modifiers
427
+ else
428
+ self.class.lex_en_expr_end
429
+ end
430
+ end
431
+
432
+ # Mapping of strings to parser tokens.
433
+
434
+ PUNCTUATION = {
435
+ '=' => :tEQL, '&' => :tAMPER2, '|' => :tPIPE,
436
+ '!' => :tBANG, '^' => :tCARET, '+' => :tPLUS,
437
+ '-' => :tMINUS, '*' => :tSTAR2, '/' => :tDIVIDE,
438
+ '%' => :tPERCENT, '~' => :tTILDE, ',' => :tCOMMA,
439
+ ';' => :tSEMI, '.' => :tDOT, '..' => :tDOT2,
440
+ '...' => :tDOT3, '[' => :tLBRACK2, ']' => :tRBRACK,
441
+ '(' => :tLPAREN2, ')' => :tRPAREN, '?' => :tEH,
442
+ ':' => :tCOLON, '&&' => :tANDOP, '||' => :tOROP,
443
+ '-@' => :tUMINUS, '+@' => :tUPLUS, '~@' => :tTILDE,
444
+ '**' => :tPOW, '->' => :tLAMBDA, '=~' => :tMATCH,
445
+ '!~' => :tNMATCH, '==' => :tEQ, '!=' => :tNEQ,
446
+ '>' => :tGT, '>>' => :tRSHFT, '>=' => :tGEQ,
447
+ '<' => :tLT, '<<' => :tLSHFT, '<=' => :tLEQ,
448
+ '=>' => :tASSOC, '::' => :tCOLON2, '===' => :tEQQ,
449
+ '<=>' => :tCMP, '[]' => :tAREF, '[]=' => :tASET,
450
+ '{' => :tLCURLY, '}' => :tRCURLY, '`' => :tBACK_REF2,
451
+ '!@' => :tBANG, '&.' => :tANDDOT, '.:' => :tMETHREF
452
+ }
453
+
454
+ PUNCTUATION_BEGIN = {
455
+ '&' => :tAMPER, '*' => :tSTAR, '**' => :tDSTAR,
456
+ '+' => :tUPLUS, '-' => :tUMINUS, '::' => :tCOLON3,
457
+ '(' => :tLPAREN, '{' => :tLBRACE, '[' => :tLBRACK,
458
+ }
459
+
460
+ KEYWORDS = {
461
+ 'if' => :kIF_MOD, 'unless' => :kUNLESS_MOD,
462
+ 'while' => :kWHILE_MOD, 'until' => :kUNTIL_MOD,
463
+ 'rescue' => :kRESCUE_MOD, 'defined?' => :kDEFINED,
464
+ 'BEGIN' => :klBEGIN, 'END' => :klEND,
465
+ }
466
+
467
+ KEYWORDS_BEGIN = {
468
+ 'if' => :kIF, 'unless' => :kUNLESS,
469
+ 'while' => :kWHILE, 'until' => :kUNTIL,
470
+ 'rescue' => :kRESCUE, 'defined?' => :kDEFINED,
471
+ 'BEGIN' => :klBEGIN, 'END' => :klEND,
472
+ }
473
+
474
+ %w(class module def undef begin end then elsif else ensure case when
475
+ for break next redo retry in do return yield super self nil true
476
+ false and or not alias __FILE__ __LINE__ __ENCODING__).each do |keyword|
477
+ KEYWORDS_BEGIN[keyword] = KEYWORDS[keyword] = :"k#{keyword.upcase}"
478
+ end
479
+
480
+ %%{
481
+ # %
482
+
483
+ access @;
484
+ getkey (@source_pts[p] || 0);
485
+
486
+ # === CHARACTER CLASSES ===
487
+ #
488
+ # Pay close attention to the differences between c_any and any.
489
+ # c_any does not include EOF and so will cause incorrect behavior
490
+ # for machine subtraction (any-except rules) and default transitions
491
+ # for scanners.
492
+
493
+ action do_nl {
494
+ # Record position of a newline for precise location reporting on tNL
495
+ # tokens.
496
+ #
497
+ # This action is embedded directly into c_nl, as it is idempotent and
498
+ # there are no cases when we need to skip it.
499
+ @newline_s = p
500
+ }
501
+
502
+ c_nl = '\n' $ do_nl;
503
+ c_space = [ \t\r\f\v];
504
+ c_space_nl = c_space | c_nl;
505
+
506
+ c_eof = 0x04 | 0x1a | 0 | zlen; # ^D, ^Z, \0, EOF
507
+ c_eol = c_nl | c_eof;
508
+ c_any = any - c_eof;
509
+
510
+ c_nl_zlen = c_nl | zlen;
511
+ c_line = any - c_nl_zlen;
512
+
513
+ c_unicode = c_any - 0x00..0x7f;
514
+ c_upper = [A-Z];
515
+ c_lower = [a-z_] | c_unicode;
516
+ c_alpha = c_lower | c_upper;
517
+ c_alnum = c_alpha | [0-9];
518
+
519
+ action do_eof {
520
+ # Sit at EOF indefinitely. #advance would return $eof each time.
521
+ # This allows feeding the lexer more data if needed; this is only used
522
+ # in tests.
523
+ #
524
+ # Note that this action is not embedded into e_eof like e_heredoc_nl and e_bs
525
+ # below. This is due to the fact that scanner state at EOF is observed
526
+ # by tests, and encapsulating it in a rule would break the introspection.
527
+ fhold; fbreak;
528
+ }
529
+
530
+ #
531
+ # === TOKEN DEFINITIONS ===
532
+ #
533
+
534
+ # All operators are punctuation. There is more to punctuation
535
+ # than just operators. Operators can be overridden by user;
536
+ # punctuation can not.
537
+
538
+ # A list of operators which are valid in the function name context, but
539
+ # have different semantics in others.
540
+ operator_fname = '[]' | '[]=' | '`' | '-@' | '+@' | '~@' | '!@' ;
541
+
542
+ # A list of operators which can occur within an assignment shortcut (+ → +=).
543
+ operator_arithmetic = '&' | '|' | '&&' | '||' | '^' | '+' | '-' |
544
+ '*' | '/' | '**' | '~' | '<<' | '>>' | '%' ;
545
+
546
+ # A list of all user-definable operators not covered by groups above.
547
+ operator_rest = '=~' | '!~' | '==' | '!=' | '!' | '===' |
548
+ '<' | '<=' | '>' | '>=' | '<=>' | '=>' ;
549
+
550
+ # Note that `{` and `}` need to be referred to as e_lbrace and e_rbrace,
551
+ # as they are ambiguous with interpolation `#{}` and should be counted.
552
+ # These braces are not present in punctuation lists.
553
+
554
+ # A list of punctuation which has different meaning when used at the
555
+ # beginning of expression.
556
+ punctuation_begin = '-' | '+' | '::' | '(' | '[' |
557
+ '*' | '**' | '&' ;
558
+
559
+ # A list of all punctuation except punctuation_begin.
560
+ punctuation_end = ',' | '=' | '->' | '(' | '[' | ']' |
561
+ '::' | '?' | ':' | '.' | '..' | '...' ;
562
+
563
+ # A list of keywords which have different meaning at the beginning of expression.
564
+ keyword_modifier = 'if' | 'unless' | 'while' | 'until' | 'rescue' ;
565
+
566
+ # A list of keywords which accept an argument-like expression, i.e. have the
567
+ # same post-processing as method calls or commands. Example: `yield 1`,
568
+ # `yield (1)`, `yield(1)`, are interpreted as if `yield` was a function.
569
+ keyword_with_arg = 'yield' | 'super' | 'not' | 'defined?' ;
570
+
571
+ # A list of keywords which accept a literal function name as an argument.
572
+ keyword_with_fname = 'def' | 'undef' | 'alias' ;
573
+
574
+ # A list of keywords which accept an expression after them.
575
+ keyword_with_value = 'else' | 'case' | 'ensure' | 'module' | 'elsif' | 'then' |
576
+ 'for' | 'in' | 'do' | 'when' | 'begin' | 'class' |
577
+ 'and' | 'or' ;
578
+
579
+ # A list of keywords which accept a value, and treat the keywords from
580
+ # `keyword_modifier` list as modifiers.
581
+ keyword_with_mid = 'rescue' | 'return' | 'break' | 'next' ;
582
+
583
+ # A list of keywords which do not accept an expression after them.
584
+ keyword_with_end = 'end' | 'self' | 'true' | 'false' | 'retry' |
585
+ 'redo' | 'nil' | 'BEGIN' | 'END' | '__FILE__' |
586
+ '__LINE__' | '__ENCODING__';
587
+
588
+ # All keywords.
589
+ keyword = keyword_with_value | keyword_with_mid |
590
+ keyword_with_end | keyword_with_arg |
591
+ keyword_with_fname | keyword_modifier ;
592
+
593
+ constant = c_upper c_alnum*;
594
+ bareword = c_alpha c_alnum*;
595
+
596
+ call_or_var = c_lower c_alnum*;
597
+ class_var = '@@' bareword;
598
+ instance_var = '@' bareword;
599
+ global_var = '$'
600
+ ( bareword | digit+
601
+ | [`'+~*$&?!@/\\;,.=:<>"] # `
602
+ | '-' c_alnum
603
+ )
604
+ ;
605
+
606
+ # Ruby accepts (and fails on) variables with leading digit
607
+ # in literal context, but not in unquoted symbol body.
608
+ class_var_v = '@@' c_alnum+;
609
+ instance_var_v = '@' c_alnum+;
610
+
611
+ label = bareword [?!]? ':';
612
+
613
+ #
614
+ # === NUMERIC PARSING ===
615
+ #
616
+
617
+ int_hex = ( xdigit+ '_' )* xdigit* '_'? ;
618
+ int_dec = ( digit+ '_' )* digit* '_'? ;
619
+ int_bin = ( [01]+ '_' )* [01]* '_'? ;
620
+
621
+ flo_int = [1-9] [0-9]* ( '_' digit+ )* | '0';
622
+ flo_frac = '.' ( digit+ '_' )* digit+;
623
+ flo_pow = [eE] [+\-]? ( digit+ '_' )* digit+;
624
+
625
+ int_suffix =
626
+ '' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars) } }
627
+ | 'r' % { @num_xfrm = lambda { |chars| emit(:tRATIONAL, Rational(chars)) } }
628
+ | 'i' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, chars)) } }
629
+ | 'ri' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, Rational(chars))) } }
630
+ | 're' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars, @ts, @te - 2); p -= 2 } }
631
+ | 'if' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars, @ts, @te - 2); p -= 2 } }
632
+ | 'rescue' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars, @ts, @te - 6); p -= 6 } };
633
+
634
+ flo_pow_suffix =
635
+ '' % { @num_xfrm = lambda { |chars| emit(:tFLOAT, Float(chars)) } }
636
+ | 'i' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, Float(chars))) } }
637
+ | 'if' % { @num_xfrm = lambda { |chars| emit(:tFLOAT, Float(chars), @ts, @te - 2); p -= 2 } };
638
+
639
+ flo_suffix =
640
+ flo_pow_suffix
641
+ | 'r' % { @num_xfrm = lambda { |chars| emit(:tRATIONAL, Rational(chars)) } }
642
+ | 'ri' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, Rational(chars))) } }
643
+ | 'rescue' % { @num_xfrm = lambda { |chars| emit(:tFLOAT, Float(chars), @ts, @te - 6); p -= 6 } };
644
+
645
+ #
646
+ # === ESCAPE SEQUENCE PARSING ===
647
+ #
648
+
649
+ # Escape parsing code is a Ragel pattern, not a scanner, and therefore
650
+ # it shouldn't directly raise errors or perform other actions with side effects.
651
+ # In reality this would probably just mess up error reporting in pathological
652
+ # cases, though.
653
+
654
+ # The amount of code required to parse \M\C stuff correctly is ridiculous.
655
+
656
+ escaped_nl = "\\" c_nl;
657
+
658
+ action unicode_points {
659
+ @escape = ""
660
+
661
+ codepoints = tok(@escape_s + 2, p - 1)
662
+ codepoint_s = @escape_s + 2
663
+
664
+ if @version < 24
665
+ if codepoints.start_with?(" ") || codepoints.start_with?("\t")
666
+ diagnostic :fatal, :invalid_unicode_escape, nil,
667
+ range(@escape_s + 2, @escape_s + 3)
668
+ end
669
+
670
+ if spaces_p = codepoints.index(/[ \t]{2}/)
671
+ diagnostic :fatal, :invalid_unicode_escape, nil,
672
+ range(codepoint_s + spaces_p + 1, codepoint_s + spaces_p + 2)
673
+ end
674
+
675
+ if codepoints.end_with?(" ") || codepoints.end_with?("\t")
676
+ diagnostic :fatal, :invalid_unicode_escape, nil, range(p - 1, p)
677
+ end
678
+ end
679
+
680
+ codepoints.scan(/([0-9a-fA-F]+)|([ \t]+)/).each do |(codepoint_str, spaces)|
681
+ if spaces
682
+ codepoint_s += spaces.length
683
+ else
684
+ codepoint = codepoint_str.to_i(16)
685
+
686
+ if codepoint >= 0x110000
687
+ diagnostic :error, :unicode_point_too_large, nil,
688
+ range(codepoint_s, codepoint_s + codepoint_str.length)
689
+ break
690
+ end
691
+
692
+ @escape += codepoint.chr(Encoding::UTF_8)
693
+ codepoint_s += codepoint_str.length
694
+ end
695
+ end
696
+ }
697
+
698
+ action unescape_char {
699
+ codepoint = @source_pts[p - 1]
700
+ if (@escape = ESCAPES[codepoint]).nil?
701
+ @escape = encode_escape(@source_buffer.slice(p - 1))
702
+ end
703
+ }
704
+
705
+ action invalid_complex_escape {
706
+ diagnostic :fatal, :invalid_escape
707
+ }
708
+
709
+ action read_post_meta_or_ctrl_char {
710
+ @escape = @source_buffer.slice(p - 1).chr
711
+
712
+ if @version >= 27 && ((0..8).include?(@escape.ord) || (14..31).include?(@escape.ord))
713
+ diagnostic :fatal, :invalid_escape
714
+ end
715
+ }
716
+
717
+ action slash_c_char {
718
+ @escape = encode_escape(@escape[0].ord & 0x9f)
719
+ }
720
+
721
+ action slash_m_char {
722
+ @escape = encode_escape(@escape[0].ord | 0x80)
723
+ }
724
+
725
+ maybe_escaped_char = (
726
+ '\\' c_any %unescape_char
727
+ | ( c_any - [\\] ) %read_post_meta_or_ctrl_char
728
+ );
729
+
730
+ maybe_escaped_ctrl_char = ( # why?!
731
+ '\\' c_any %unescape_char %slash_c_char
732
+ | '?' % { @escape = "\x7f" }
733
+ | ( c_any - [\\?] ) %read_post_meta_or_ctrl_char %slash_c_char
734
+ );
735
+
736
+ escape = (
737
+ # \377
738
+ [0-7]{1,3}
739
+ % { @escape = encode_escape(tok(@escape_s, p).to_i(8) % 0x100) }
740
+
741
+ # \xff
742
+ | 'x' xdigit{1,2}
743
+ % { @escape = encode_escape(tok(@escape_s + 1, p).to_i(16)) }
744
+
745
+ # %q[\x]
746
+ | 'x' ( c_any - xdigit )
747
+ % {
748
+ diagnostic :fatal, :invalid_hex_escape, nil, range(@escape_s - 1, p + 2)
749
+ }
750
+
751
+ # \u263a
752
+ | 'u' xdigit{4}
753
+ % { @escape = tok(@escape_s + 1, p).to_i(16).chr(Encoding::UTF_8) }
754
+
755
+ # \u123
756
+ | 'u' xdigit{0,3}
757
+ % {
758
+ diagnostic :fatal, :invalid_unicode_escape, nil, range(@escape_s - 1, p)
759
+ }
760
+
761
+ # u{not hex} or u{}
762
+ | 'u{' ( c_any - xdigit - [ \t}] )* '}'
763
+ % {
764
+ diagnostic :fatal, :invalid_unicode_escape, nil, range(@escape_s - 1, p)
765
+ }
766
+
767
+ # \u{ \t 123 \t 456 \t\t }
768
+ | 'u{' [ \t]* ( xdigit{1,6} [ \t]+ )*
769
+ (
770
+ ( xdigit{1,6} [ \t]* '}'
771
+ %unicode_points
772
+ )
773
+ |
774
+ ( xdigit* ( c_any - xdigit - [ \t}] )+ '}'
775
+ | ( c_any - [ \t}] )* c_eof
776
+ | xdigit{7,}
777
+ ) % {
778
+ diagnostic :fatal, :unterminated_unicode, nil, range(p - 1, p)
779
+ }
780
+ )
781
+
782
+ # \C-\a \cx
783
+ | ( 'C-' | 'c' ) escaped_nl?
784
+ maybe_escaped_ctrl_char
785
+
786
+ # \M-a
787
+ | 'M-' escaped_nl?
788
+ maybe_escaped_char
789
+ %slash_m_char
790
+
791
+ # \C-\M-f \M-\cf \c\M-f
792
+ | ( ( 'C-' | 'c' ) escaped_nl? '\\M-'
793
+ | 'M-\\' escaped_nl? ( 'C-' | 'c' ) ) escaped_nl?
794
+ maybe_escaped_ctrl_char
795
+ %slash_m_char
796
+
797
+ | 'C' c_any %invalid_complex_escape
798
+ | 'M' c_any %invalid_complex_escape
799
+ | ( 'M-\\C' | 'C-\\M' ) c_any %invalid_complex_escape
800
+
801
+ | ( c_any - [0-7xuCMc] ) %unescape_char
802
+
803
+ | c_eof % {
804
+ diagnostic :fatal, :escape_eof, nil, range(p - 1, p)
805
+ }
806
+ );
807
+
808
+ # Use rules in form of `e_bs escape' when you need to parse a sequence.
809
+ e_bs = '\\' % {
810
+ @escape_s = p
811
+ @escape = nil
812
+ };
813
+
814
+ #
815
+ # === STRING AND HEREDOC PARSING ===
816
+ #
817
+
818
+ # Heredoc parsing is quite a complex topic. First, consider that heredocs
819
+ # can be arbitrarily nested. For example:
820
+ #
821
+ # puts <<CODE
822
+ # the result is: #{<<RESULT.inspect
823
+ # i am a heredoc
824
+ # RESULT
825
+ # }
826
+ # CODE
827
+ #
828
+ # which, incidentally, evaluates to:
829
+ #
830
+ # the result is: " i am a heredoc\n"
831
+ #
832
+ # To parse them, lexer refers to two kinds (remember, nested heredocs)
833
+ # of positions in the input stream, namely heredoc_e
834
+ # (HEREDOC declaration End) and @herebody_s (HEREdoc BODY line Start).
835
+ #
836
+ # heredoc_e is simply contained inside the corresponding Literal, and
837
+ # when the heredoc is closed, the lexing is restarted from that position.
838
+ #
839
+ # @herebody_s is quite more complex. First, @herebody_s changes after each
840
+ # heredoc line is lexed. This way, at '\n' tok(@herebody_s, @te) always
841
+ # contains the current line, and also when a heredoc is started, @herebody_s
842
+ # contains the position from which the heredoc will be lexed.
843
+ #
844
+ # Second, as (insanity) there are nested heredocs, we need to maintain a
845
+ # stack of these positions. Each time #push_literal is called, it saves current
846
+ # @herebody_s to literal.saved_herebody_s, and after an interpolation (possibly
847
+ # containing other heredocs) is closed, the previous value is restored.
848
+
849
+ e_heredoc_nl = c_nl % {
850
+ # After every heredoc was parsed, @herebody_s contains the
851
+ # position of next token after all heredocs.
852
+ if @herebody_s
853
+ p = @herebody_s
854
+ @herebody_s = nil
855
+ end
856
+ };
857
+
858
+ action extend_string {
859
+ string = tok
860
+
861
+ # tLABEL_END is only possible in non-cond context on >= 2.2
862
+ if @version >= 22 && !@cond.active?
863
+ lookahead = @source_buffer.slice(@te...@te+2)
864
+ end
865
+
866
+ current_literal = literal
867
+ if !current_literal.heredoc? &&
868
+ (token = current_literal.nest_and_try_closing(string, @ts, @te, lookahead))
869
+ if token[0] == :tLABEL_END
870
+ p += 1
871
+ pop_literal
872
+ fnext expr_labelarg;
873
+ else
874
+ fnext *pop_literal;
875
+ end
876
+ fbreak;
877
+ else
878
+ current_literal.extend_string(string, @ts, @te)
879
+ end
880
+ }
881
+
882
+ action extend_string_escaped {
883
+ current_literal = literal
884
+ # Get the first character after the backslash.
885
+ escaped_char = @source_buffer.slice(@escape_s).chr
886
+
887
+ if current_literal.munge_escape? escaped_char
888
+ # If this particular literal uses this character as an opening
889
+ # or closing delimiter, it is an escape sequence for that
890
+ # particular character. Write it without the backslash.
891
+
892
+ if current_literal.regexp? && REGEXP_META_CHARACTERS.match(escaped_char)
893
+ # Regular expressions should include escaped delimiters in their
894
+ # escaped form, except when the escaped character is
895
+ # a closing delimiter but not a regexp metacharacter.
896
+ #
897
+ # The backslash itself cannot be used as a closing delimiter
898
+ # at the same time as an escape symbol, but it is always munged,
899
+ # so this branch also executes for the non-closing-delimiter case
900
+ # for the backslash.
901
+ current_literal.extend_string(tok, @ts, @te)
902
+ else
903
+ current_literal.extend_string(escaped_char, @ts, @te)
904
+ end
905
+ else
906
+ # It does not. So this is an actual escape sequence, yay!
907
+ if current_literal.squiggly_heredoc? && escaped_char == "\n".freeze
908
+ # Squiggly heredocs like
909
+ # <<~-HERE
910
+ # 1\
911
+ # 2
912
+ # HERE
913
+ # treat '\' as a line continuation, but still dedent the body, so the heredoc above becomes "12\n".
914
+ # This information is emitted as is, without escaping,
915
+ # later this escape sequence (\\\n) gets handled manually in the Lexer::Dedenter
916
+ current_literal.extend_string(tok, @ts, @te)
917
+ elsif current_literal.supports_line_continuation_via_slash? && escaped_char == "\n".freeze
918
+ # Heredocs, regexp and a few other types of literals support line
919
+ # continuation via \\\n sequence. The code like
920
+ # "a\
921
+ # b"
922
+ # must be parsed as "ab"
923
+ current_literal.extend_string(tok.gsub("\\\n".freeze, ''.freeze), @ts, @te)
924
+ elsif current_literal.regexp?
925
+ # Regular expressions should include escape sequences in their
926
+ # escaped form. On the other hand, escaped newlines are removed (in cases like "\\C-\\\n\\M-x")
927
+ current_literal.extend_string(tok.gsub("\\\n".freeze, ''.freeze), @ts, @te)
928
+ else
929
+ current_literal.extend_string(@escape || tok, @ts, @te)
930
+ end
931
+ end
932
+ }
933
+
934
+ # Extend a string with a newline or an EOF character.
935
+ # As heredoc closing line can immediately precede EOF, this action
936
+ # has to handle such case specially.
937
+ action extend_string_eol {
938
+ current_literal = literal
939
+ if @te == pe # The match ends exactly at the end of the input buffer.
940
+ diagnostic :fatal, :string_eof, nil,
941
+ range(current_literal.str_s, current_literal.str_s + 1)
942
+ end
943
+
944
+ if current_literal.heredoc?
945
+ line = tok(@herebody_s, @ts).gsub(/\r+$/, ''.freeze)
946
+
947
+ if version?(18, 19, 20)
948
+ # Also drop everything from a \r up to the end of the line (see ruby:c48b4209c).
949
+ line = line.gsub(/\r.*$/, ''.freeze)
950
+ end
951
+
952
+ # Try ending the heredoc with the complete most recently
953
+ # scanned line. @herebody_s always refers to the start of such line.
954
+ if current_literal.nest_and_try_closing(line, @herebody_s, @ts)
955
+ # Adjust @herebody_s to point to the next line.
956
+ @herebody_s = @te
957
+
958
+ # Continue regular lexing after the heredoc reference (<<END).
959
+ p = current_literal.heredoc_e - 1
960
+ fnext *pop_literal; fbreak;
961
+ else
962
+ # Calculate indentation level for <<~HEREDOCs.
963
+ current_literal.infer_indent_level(line)
964
+
965
+ # Not closed yet: @herebody_s now points to the start of the next line.
966
+ @herebody_s = @te
967
+ end
968
+ else
969
+ # Try ending the literal with a newline.
970
+ if current_literal.nest_and_try_closing(tok, @ts, @te)
971
+ fnext *pop_literal; fbreak;
972
+ end
973
+
974
+ if @herebody_s
975
+ # This is a regular literal intertwined with a heredoc. Like:
976
+ #
977
+ # p <<-foo+"1
978
+ # bar
979
+ # foo
980
+ # 2"
981
+ #
982
+ # which, incidentally, evaluates to "bar\n1\n2".
983
+ p = @herebody_s - 1 # Resume scanning at the position stored in @herebody_s.
984
+ @herebody_s = nil
985
+ end
986
+ end
987
+
988
+ if current_literal.words? && !eof_codepoint?(@source_pts[p])
989
+ current_literal.extend_space @ts, @te # Same call as extend_string_space makes for c_space+.
990
+ else
991
+ # A literal newline is appended if the heredoc was _not_ closed
992
+ # this time (see fbreak above). See also Literal#nest_and_try_closing
993
+ # for rationale of calling #flush_string here.
994
+ current_literal.extend_string tok, @ts, @te
995
+ current_literal.flush_string
996
+ end
997
+ }
998
+
999
+ action extend_string_space { # Whitespace run matched inside a words-style literal.
1000
+ literal.extend_space(@ts, @te)
1001
+ }
1002
+
1003
+ #
1004
+ # === INTERPOLATION PARSING ===
1005
+ #
1006
+
1007
+ # Interpolations with immediate variable names simply call into
1008
+ # the corresponding machine.
1009
+
1010
+ interp_var = '#' ( global_var | class_var_v | instance_var_v );
1011
+
1012
+ action extend_interp_var { # Handles an interp_var match: a hash sign followed by a variable.
1013
+ current_literal = literal
1014
+ current_literal.flush_string
1015
+ current_literal.extend_content
1016
+
1017
+ emit(:tSTRING_DVAR, nil, @ts, @ts + 1) # The token spans only the leading hash sign.
1018
+
1019
+ p = @ts # Rewind so that scanning resumes right after the hash sign.
1020
+ fcall expr_variable; # expr_variable emits the variable token and returns via stack_pop.
1021
+ }
1022
+
1023
+ # Special case for Ruby >= 2.7
1024
+ # If interpolated instance/class variable starts with a digit we parse it as a plain substring
1025
+ # However, "#$1" is still a regular interpolation
1026
+ interp_digit_var = '#' ('@' | '@@') digit c_alpha*;
1027
+
1028
+ action extend_interp_digit_var {
1029
+ if @version < 27
1030
+ reason = tok.start_with?('#@@') ? :cvar_name : :ivar_name
1031
+ diagnostic(:error, reason, { :name => tok(@ts + 1, @te) }, range(@ts + 1, @te))
1032
+ else # 2.7 and later keep the matched text as plain string content.
1033
+ literal.extend_string(tok, @ts, @te)
1034
+ end
1035
+ }
1036
+
1037
+ # Interpolations with code blocks must match nested curly braces, as
1038
+ # interpolation ending is ambiguous with a block ending. So, every
1039
+ # opening and closing brace should be matched with e_[lr]brace rules,
1040
+ # which automatically perform the counting.
1041
+ #
1042
+ # Note that interpolations can themselves be nested, so brace balance
1043
+ # is tied to the innermost literal.
1044
+ #
1045
+ # Also note that literals themselves should not use e_[lr]brace rules
1046
+ # when matching their opening and closing delimiters, as the amount of
1047
+ # braces inside the characters of a string literal is independent.
1048
+
1049
+ interp_code = '#{';
1050
+
1051
+ e_lbrace = '{' % {
1052
+ @cond.push(false)
1053
+ @cmdarg.push(false)
1054
+
1055
+ # Brace counting only applies while a literal is being lexed.
1056
+ enclosing = literal
1057
+ enclosing.start_interp_brace if enclosing
1058
+ };
1059
+
1060
+ e_rbrace = '}' % {
1061
+ current_literal = literal
1062
+ if current_literal
1063
+ if current_literal.end_interp_brace_and_try_closing # Presumably true when this brace ends the interpolation; confirm in Literal.
1064
+ if version?(18, 19) # 1.8 and 1.9 modes emit a plain tRCURLY and pop @cond and @cmdarg here.
1065
+ emit(:tRCURLY, '}'.freeze, p - 1, p)
1066
+ @cond.lexpop
1067
+ @cmdarg.lexpop
1068
+ else
1069
+ emit(:tSTRING_DEND, '}'.freeze, p - 1, p)
1070
+ end
1071
+
1072
+ if current_literal.saved_herebody_s
1073
+ @herebody_s = current_literal.saved_herebody_s # Stored by extend_interp_code.
1074
+ end
1075
+
1076
+
1077
+ fhold;
1078
+ fnext *next_state_for_literal(current_literal); # Go back to lexing the enclosing literal.
1079
+ fbreak;
1080
+ end
1081
+ end
1082
+
1083
+ @paren_nest -= 1 # Counterpart of the += 1 done by the rules that emit an opening brace.
1084
+ };
1085
+
1086
+ action extend_interp_code { # Handles an interp_code match, the opening of a code interpolation.
1087
+ current_literal = literal
1088
+ current_literal.flush_string
1089
+ current_literal.extend_content
1090
+
1091
+ emit(:tSTRING_DBEG, '#{'.freeze)
1092
+
1093
+ if current_literal.heredoc?
1094
+ current_literal.saved_herebody_s = @herebody_s # Restored by e_rbrace when the interpolation closes.
1095
+ @herebody_s = nil
1096
+ end
1097
+
1098
+ current_literal.start_interp_brace
1099
+ @command_start = true
1100
+ fnext expr_value; # The interpolated code is lexed starting in expr_value.
1101
+ fbreak;
1102
+ }
1103
+
1104
+ # Actual string parsers are simply combined from the primitives defined
1105
+ # above.
1106
+
1107
+ interp_words := |*
1108
+ interp_code => extend_interp_code;
1109
+ interp_digit_var => extend_interp_digit_var;
1110
+ interp_var => extend_interp_var;
1111
+ e_bs escape => extend_string_escaped;
1112
+ c_space+ => extend_string_space;
1113
+ c_eol => extend_string_eol;
1114
+ c_any => extend_string;
1115
+ *|;
1116
+
1117
+ interp_string := |*
1118
+ interp_code => extend_interp_code;
1119
+ interp_digit_var => extend_interp_digit_var;
1120
+ interp_var => extend_interp_var;
1121
+ e_bs escape => extend_string_escaped;
1122
+ c_eol => extend_string_eol;
1123
+ c_any => extend_string;
1124
+ *|;
1125
+
1126
+ plain_words := |*
1127
+ e_bs c_any => extend_string_escaped;
1128
+ c_space+ => extend_string_space;
1129
+ c_eol => extend_string_eol;
1130
+ c_any => extend_string;
1131
+ *|;
1132
+
1133
+ plain_string := |* # Scanner without interpolation rules (no interp_code or interp_var).
1134
+ '\\' c_nl => extend_string_eol; # Backslash plus newline goes through the end-of-line action.
1135
+ e_bs c_any => extend_string_escaped; # Any other backslash pair.
1136
+ c_eol => extend_string_eol;
1137
+ c_any => extend_string;
1138
+ *|;
1139
+
1140
+ interp_backslash_delimited := |*
1141
+ interp_code => extend_interp_code;
1142
+ interp_digit_var => extend_interp_digit_var;
1143
+ interp_var => extend_interp_var;
1144
+ c_eol => extend_string_eol;
1145
+ c_any => extend_string;
1146
+ *|;
1147
+
1148
+ plain_backslash_delimited := |*
1149
+ c_eol => extend_string_eol;
1150
+ c_any => extend_string;
1151
+ *|;
1152
+
1153
+ interp_backslash_delimited_words := |*
1154
+ interp_code => extend_interp_code;
1155
+ interp_digit_var => extend_interp_digit_var;
1156
+ interp_var => extend_interp_var;
1157
+ c_space+ => extend_string_space;
1158
+ c_eol => extend_string_eol;
1159
+ c_any => extend_string;
1160
+ *|;
1161
+
1162
+ plain_backslash_delimited_words := |*
1163
+ c_space+ => extend_string_space;
1164
+ c_eol => extend_string_eol;
1165
+ c_any => extend_string;
1166
+ *|;
1167
+
1168
+ regexp_modifiers := |* # Emits exactly one tREGEXP_OPT, then continues in expr_end.
1169
+ [A-Za-z]+ # Candidate option letters.
1170
+ => {
1171
+ unknown_options = tok.scan(/[^imxouesn]/)
1172
+ if unknown_options.any? # Letters outside imxouesn are reported together.
1173
+ diagnostic :error, :regexp_options,
1174
+ { :options => unknown_options.join }
1175
+ end
1176
+
1177
+ emit(:tREGEXP_OPT)
1178
+ fnext expr_end;
1179
+ fbreak;
1180
+ };
1181
+
1182
+ any # No option letters follow.
1183
+ => {
1184
+ emit(:tREGEXP_OPT, tok(@ts, @te - 1), @ts, @te - 1) # Empty token placed just before the matched character.
1185
+ fhold; # Give the character back so that expr_end scans it.
1186
+ fgoto expr_end;
1187
+ };
1188
+ *|;
1189
+
1190
+ #
1191
+ # === WHITESPACE HANDLING ===
1192
+ #
1193
+
1194
+ # Various contexts in Ruby allow various kinds of whitespace
1195
+ # to be used. They are grouped to clarify the lexing machines
1196
+ # and ease collection of comments.
1197
+
1198
+ # A line of code with inline #comment at end is always equivalent
1199
+ # to a line of code ending with just a newline, so an inline
1200
+ # comment is deemed equivalent to non-newline whitespace
1201
+ # (c_space character class).
1202
+
1203
+ w_space =
1204
+ c_space+
1205
+ | '\\' e_heredoc_nl
1206
+ ;
1207
+
1208
+ w_comment =
1209
+ '#' %{ @sharp_s = p - 1 }
1210
+ # The (p == pe) condition compensates for added "\0" and
1211
+ # the way Ragel handles EOF.
1212
+ c_line* %{ emit_comment(@sharp_s, p == pe ? p - 2 : p) }
1213
+ ;
1214
+
1215
+ w_space_comment =
1216
+ w_space
1217
+ | w_comment
1218
+ ;
1219
+
1220
+ # A newline in non-literal context always interoperates with
1221
+ # here document logic and can always be escaped by a backslash,
1222
+ # still interoperating with here document logic in the same way,
1223
+ # yet being invisible to anything else.
1224
+ #
1225
+ # To demonstrate:
1226
+ #
1227
+ # foo = <<FOO \
1228
+ # bar
1229
+ # FOO
1230
+ # + 2
1231
+ #
1232
+ # is equivalent to `foo = "bar\n" + 2`.
1233
+
1234
+ w_newline =
1235
+ e_heredoc_nl;
1236
+
1237
+ w_any =
1238
+ w_space
1239
+ | w_comment
1240
+ | w_newline
1241
+ ;
1242
+
1243
+
1244
+ #
1245
+ # === EXPRESSION PARSING ===
1246
+ #
1247
+
1248
+ # These rules implement a form of manually defined lookahead.
1249
+ # The default longest-match scanning does not work here due
1250
+ # to sheer ambiguity.
1251
+
1252
+ ambiguous_fid_suffix = # actual parsed
1253
+ [?!] %{ tm = p } | # a? a?
1254
+ [?!]'=' %{ tm = p - 2 } # a!=b a != b
1255
+ ;
1256
+
1257
+ ambiguous_ident_suffix = # actual parsed
1258
+ ambiguous_fid_suffix |
1259
+ '=' %{ tm = p } | # a= a=
1260
+ '==' %{ tm = p - 2 } | # a==b a == b
1261
+ '=~' %{ tm = p - 2 } | # a=~b a =~ b
1262
+ '=>' %{ tm = p - 2 } | # a=>b a => b
1263
+ '===' %{ tm = p - 3 } # a===b a === b
1264
+ ;
1265
+
1266
+ ambiguous_symbol_suffix = # actual parsed
1267
+ ambiguous_ident_suffix |
1268
+ '==>' %{ tm = p - 2 } # :a==>b :a= => b
1269
+ ;
1270
+
1271
+ # Ambiguous with 1.9 hash labels.
1272
+ ambiguous_const_suffix = # actual parsed
1273
+ '::' %{ tm = p - 2 } # A::B A :: B
1274
+ ;
1275
+
1276
+ # Resolving kDO/kDO_COND/kDO_BLOCK ambiguity requires embedding
1277
+ # @cond/@cmdarg-related code to e_lbrack, e_lparen and e_lbrace.
1278
+
1279
+ e_lbrack = '[' % {
1280
+ @cond.push(false); @cmdarg.push(false)
1281
+
1282
+ @paren_nest += 1
1283
+ };
1284
+
1285
+ e_rbrack = ']' % {
1286
+ @paren_nest -= 1
1287
+ };
1288
+
1289
+ # Ruby 1.9 lambdas require parentheses counting in order to
1290
+ # emit correct opening kDO/tLBRACE.
1291
+
1292
+ e_lparen = '(' % {
1293
+ @cond.push(false)
1294
+ @cmdarg.push(false)
1295
+
1296
+ @paren_nest += 1
1297
+
1298
+ # In 1.8 mode an opening parenthesis also sets @command_start.
1299
+ @command_start = true if version?(18)
1300
+ };
1301
+
1302
+ e_rparen = ')' % {
1303
+ @paren_nest -= 1
1304
+ };
1305
+
1306
+ # Ruby is context-sensitive wrt/ local identifiers.
1307
+ action local_ident { # Emits tIDENTIFIER and picks the next state from the static environment.
1308
+ emit(:tIDENTIFIER)
1309
+
1310
+ if !@static_env.nil? && @static_env.declared?(tok) # The name is declared in @static_env.
1311
+ fnext expr_endfn; fbreak;
1312
+ else # No static environment, or the name is not declared in it.
1313
+ fnext *arg_or_cmdarg(cmd_state); fbreak;
1314
+ end
1315
+ }
1316
+
1317
+ # Variable lexing code is accessed from both expressions and
1318
+ # string interpolation related code.
1319
+ #
1320
+ expr_variable := |* # Every rule emits one token and returns to the caller via stack_pop.
1321
+ global_var
1322
+ => {
1323
+ if tok =~ /^\$([1-9][0-9]*)$/
1324
+ emit(:tNTH_REF, tok(@ts + 1).to_i) # Value is the number after the dollar sign.
1325
+ elsif tok =~ /^\$([&`'+])$/
1326
+ emit(:tBACK_REF)
1327
+ else
1328
+ emit(:tGVAR)
1329
+ end
1330
+
1331
+ fnext *stack_pop; fbreak;
1332
+ };
1333
+
1334
+ class_var_v
1335
+ => {
1336
+ if tok =~ /^@@[0-9]/
1337
+ diagnostic :error, :cvar_name, { :name => tok } # Name starts with a digit.
1338
+ end
1339
+
1340
+ emit(:tCVAR)
1341
+ fnext *stack_pop; fbreak;
1342
+ };
1343
+
1344
+ instance_var_v
1345
+ => {
1346
+ if tok =~ /^@[0-9]/
1347
+ diagnostic :error, :ivar_name, { :name => tok } # Name starts with a digit.
1348
+ end
1349
+
1350
+ emit(:tIVAR)
1351
+ fnext *stack_pop; fbreak;
1352
+ };
1353
+ *|;
1354
+
1355
+ # Literal function name in definition (e.g. `def class`).
1356
+ # Keywords are returned as their respective tokens; this is used
1357
+ # to support singleton def `def self.foo`. Global variables are
1358
+ # returned as `tGVAR`; this is used in global variable alias
1359
+ # statements `alias $a $b`. Symbols are returned verbatim; this
1360
+ # is used in `alias :a :"b#{foo}"` and `undef :a`.
1361
+ #
1362
+ # Transitions to `expr_endfn` afterwards.
1363
+ #
1364
+ expr_fname := |*
1365
+ keyword # A keyword used as a name; emitted from the KEYWORDS_BEGIN table.
1366
+ => { emit_table(KEYWORDS_BEGIN);
1367
+ fnext expr_endfn; fbreak; };
1368
+
1369
+ constant
1370
+ => { emit(:tCONSTANT)
1371
+ fnext expr_endfn; fbreak; };
1372
+
1373
+ bareword [?=!]? # The optional suffix character is part of the identifier token.
1374
+ => { emit(:tIDENTIFIER)
1375
+ fnext expr_endfn; fbreak; };
1376
+
1377
+ global_var
1378
+ => { p = @ts - 1
1379
+ fnext expr_end; fcall expr_variable; }; # Re-scan the variable in expr_variable, then continue in expr_end.
1380
+
1381
+ # If the handling was to be delegated to expr_end,
1382
+ # these cases would transition to something else than
1383
+ # expr_endfn, which is incorrect.
1384
+ operator_fname |
1385
+ operator_arithmetic |
1386
+ operator_rest
1387
+ => { emit_table(PUNCTUATION)
1388
+ fnext expr_endfn; fbreak; };
1389
+
1390
+ '::'
1391
+ => { fhold; fhold; fgoto expr_end; }; # Give back both characters; expr_end lexes them.
1392
+
1393
+ ':'
1394
+ => { fhold; fgoto expr_beg; }; # Give the colon back; expr_beg lexes it.
1395
+
1396
+ '%s' c_any
1397
+ => {
1398
+ if version?(23) # Only this version check lexes %s as a literal here.
1399
+ type, delimiter = tok[0..-2], tok[-1].chr
1400
+ fgoto *push_literal(type, delimiter, @ts);
1401
+ else
1402
+ p = @ts - 1 # Re-scan the whole match in expr_end.
1403
+ fgoto expr_end;
1404
+ end
1405
+ };
1406
+
1407
+ w_any;
1408
+
1409
+ c_any
1410
+ => { fhold; fgoto expr_end; };
1411
+
1412
+ c_eof => do_eof;
1413
+ *|;
1414
+
1415
+ # After literal function name in definition. Behaves like `expr_end`,
1416
+ # but allows a tLABEL.
1417
+ #
1418
+ # Transitions to `expr_end` afterwards.
1419
+ #
1420
+ expr_endfn := |*
1421
+ label ( any - ':' ) # One extra character is matched to make sure it is not a colon.
1422
+ => { emit(:tLABEL, tok(@ts, @te - 2), @ts, @te - 1)
1423
+ fhold; fnext expr_labelarg; fbreak; }; # fhold gives the extra character back.
1424
+
1425
+ w_space_comment;
1426
+
1427
+ c_any
1428
+ => { fhold; fgoto expr_end; }; # Everything else is re-scanned by expr_end.
1429
+
1430
+ c_eof => do_eof;
1431
+ *|;
1432
+
1433
+ # Literal function name in method call (e.g. `a.class`).
1434
+ #
1435
+ # Transitions to `expr_arg` afterwards.
1436
+ #
1437
+ expr_dot := |*
1438
+ constant
1439
+ => { emit(:tCONSTANT)
1440
+ fnext *arg_or_cmdarg(cmd_state); fbreak; };
1441
+
1442
+ call_or_var
1443
+ => { emit(:tIDENTIFIER)
1444
+ fnext *arg_or_cmdarg(cmd_state); fbreak; };
1445
+
1446
+ bareword ambiguous_fid_suffix # tm, set by ambiguous_fid_suffix, marks where the tFID ends.
1447
+ => { emit(:tFID, tok(@ts, tm), @ts, tm)
1448
+ fnext *arg_or_cmdarg(cmd_state); p = tm - 1; fbreak; }; # Resume right after the emitted token.
1449
+
1450
+ # Handled here so that the next state is expr_arg; see the comment in `expr_fname`.
1451
+ operator_fname |
1452
+ operator_arithmetic |
1453
+ operator_rest
1454
+ => { emit_table(PUNCTUATION)
1455
+ fnext expr_arg; fbreak; };
1456
+
1457
+ w_any;
1458
+
1459
+ c_any
1460
+ => { fhold; fgoto expr_end; }; # Everything else is re-scanned by expr_end.
1461
+
1462
+ c_eof => do_eof;
1463
+ *|;
1464
+
1465
+ # The previous token emitted was a `tIDENTIFIER` or `tFID`; no space
1466
+ # is consumed; the current expression is a command or method call.
1467
+ #
1468
+ expr_arg := |*
1469
+ #
1470
+ # COMMAND MODE SPECIFIC TOKENS
1471
+ #
1472
+
1473
+ # cmd (1 + 2)
1474
+ # See below the rationale about expr_endarg.
1475
+ w_space+ e_lparen
1476
+ => {
1477
+ if version?(18)
1478
+ emit(:tLPAREN2, '('.freeze, @te - 1, @te)
1479
+ fnext expr_value; fbreak;
1480
+ else
1481
+ emit(:tLPAREN_ARG, '('.freeze, @te - 1, @te)
1482
+ fnext expr_beg; fbreak;
1483
+ end
1484
+ };
1485
+
1486
+ # meth(1 + 2)
1487
+ # Regular method call.
1488
+ e_lparen
1489
+ => { emit(:tLPAREN2, '('.freeze)
1490
+ fnext expr_beg; fbreak; };
1491
+
1492
+ # meth [...]
1493
+ # Array argument. Compare with indexing `meth[...]`.
1494
+ w_space+ e_lbrack
1495
+ => { emit(:tLBRACK, '['.freeze, @te - 1, @te)
1496
+ fnext expr_beg; fbreak; };
1497
+
1498
+ # cmd {}
1499
+ # Command: method call without parentheses.
1500
+ w_space* e_lbrace
1501
+ => {
1502
+ if @lambda_stack.last == @paren_nest
1503
+ @lambda_stack.pop
1504
+ emit(:tLAMBEG, '{'.freeze, @te - 1, @te)
1505
+ else
1506
+ emit(:tLCURLY, '{'.freeze, @te - 1, @te)
1507
+ end
1508
+ @command_start = true
1509
+ @paren_nest += 1
1510
+ fnext expr_value; fbreak;
1511
+ };
1512
+
1513
+ #
1514
+ # AMBIGUOUS TOKENS RESOLVED VIA EXPR_BEG
1515
+ #
1516
+
1517
+ # a??
1518
+ # Ternary operator
1519
+ '?' c_space_nl
1520
+ => {
1521
+ # Unlike expr_beg as invoked in the next rule, do not warn
1522
+ p = @ts - 1
1523
+ fgoto expr_end;
1524
+ };
1525
+
1526
+ # a ?b, a? ?
1527
+ # Character literal or ternary operator
1528
+ w_space* '?'
1529
+ => { fhold; fgoto expr_beg; };
1530
+
1531
+ # a %{1}, a %[1] (but not "a %=1=" or "a % foo")
1532
+ # a /foo/ (but not "a / foo" or "a /=foo")
1533
+ # a <<HEREDOC
1534
+ w_space+ %{ tm = p }
1535
+ ( [%/] ( c_any - c_space_nl - '=' ) # /
1536
+ | '<<'
1537
+ )
1538
+ => {
1539
+ if tok(tm, tm + 1) == '/'.freeze
1540
+ # Ambiguous regexp literal.
1541
+ diagnostic :warning, :ambiguous_literal, nil, range(tm, tm + 1)
1542
+ end
1543
+
1544
+ p = tm - 1
1545
+ fgoto expr_beg;
1546
+ };
1547
+
1548
+ # x *1
1549
+ # Ambiguous splat, kwsplat or block-pass.
1550
+ w_space+ %{ tm = p } ( '+' | '-' | '*' | '&' | '**' )
1551
+ => {
1552
+ diagnostic :warning, :ambiguous_prefix, { :prefix => tok(tm, @te) },
1553
+ range(tm, @te)
1554
+
1555
+ p = tm - 1
1556
+ fgoto expr_beg;
1557
+ };
1558
+
1559
+ # x ::Foo
1560
+ # Ambiguous toplevel constant access.
1561
+ w_space+ '::'
1562
+ => { fhold; fhold; fgoto expr_beg; };
1563
+
1564
+ # x:b
1565
+ # Symbol.
1566
+ w_space* ':'
1567
+ => { fhold; fgoto expr_beg; };
1568
+
1569
+ w_space+ label
1570
+ => { p = @ts - 1; fgoto expr_beg; };
1571
+
1572
+ #
1573
+ # AMBIGUOUS TOKENS RESOLVED VIA EXPR_END
1574
+ #
1575
+
1576
+ # a ? b
1577
+ # Ternary operator.
1578
+ w_space+ %{ tm = p } '?' c_space_nl
1579
+ => { p = tm - 1; fgoto expr_end; };
1580
+
1581
+ # x + 1: Binary operator or operator-assignment.
1582
+ w_space* operator_arithmetic
1583
+ ( '=' | c_space_nl )? |
1584
+ # x rescue y: Modifier keyword.
1585
+ w_space* keyword_modifier |
1586
+ # a &. b: Safe navigation operator.
1587
+ w_space* '&.' |
1588
+ # Miscellanea.
1589
+ w_space* punctuation_end
1590
+ => {
1591
+ p = @ts - 1
1592
+ fgoto expr_end;
1593
+ };
1594
+
1595
+ w_space;
1596
+
1597
+ w_comment
1598
+ => { fgoto expr_end; };
1599
+
1600
+ w_newline
1601
+ => { fhold; fgoto expr_end; };
1602
+
1603
+ c_any
1604
+ => { fhold; fgoto expr_beg; };
1605
+
1606
+ c_eof => do_eof;
1607
+ *|;
1608
+
1609
+ # The previous token was an identifier which was seen while in the
1610
+ # command mode (that is, the state at the beginning of #advance was
1611
+ # expr_value). This state is very similar to expr_arg, but disambiguates
1612
+ # two very rare and specific conditions:
1613
+ # * In 1.8 mode, "foo (lambda do end)".
1614
+ # * In 1.9+ mode, "f x: -> do foo do end end".
1615
+ expr_cmdarg := |*
1616
+ w_space+ e_lparen # Whitespace before the paren: always emitted as tLPAREN_ARG here.
1617
+ => {
1618
+ emit(:tLPAREN_ARG, '('.freeze, @te - 1, @te)
1619
+ if version?(18)
1620
+ fnext expr_value; fbreak;
1621
+ else
1622
+ fnext expr_beg; fbreak;
1623
+ end
1624
+ };
1625
+
1626
+ w_space* 'do'
1627
+ => {
1628
+ if @cond.active? # kDO_COND while @cond is active, plain kDO otherwise.
1629
+ emit(:kDO_COND, 'do'.freeze, @te - 2, @te)
1630
+ else
1631
+ emit(:kDO, 'do'.freeze, @te - 2, @te)
1632
+ end
1633
+ fnext expr_value; fbreak;
1634
+ };
1635
+
1636
+ c_any |
1637
+ # A longer bareword starting with `do' outmatches the `do' rule above (longest match wins).
1638
+ w_space* bareword |
1639
+ w_space* label
1640
+ => { p = @ts - 1
1641
+ fgoto expr_arg; }; # Re-scan the whole match in expr_arg.
1642
+
1643
+ c_eof => do_eof;
1644
+ *|;
1645
+
1646
+ # The rationale for this state is pretty complex. Normally, if an argument
1647
+ # is passed to a command and then there is a block (tLCURLY...tRCURLY),
1648
+ # the block is attached to the innermost argument (`f` in `m f {}`), or it
1649
+ # is a parse error (`m 1 {}`). But there is a special case for passing a single
1650
+ # primary expression grouped with parentheses: if you write `m (1) {}` or
1651
+ # (2.0 only) `m () {}`, then the block is attached to `m`.
1652
+ #
1653
+ # Thus, we recognize the opening `(` of a command (remember, a command is
1654
+ # a method call without parens) as a tLPAREN_ARG; then, in parser, we recognize
1655
+ # `tLPAREN_ARG expr rparen` as a `primary_expr` and before rparen, set the
1656
+ # lexer's state to `expr_endarg`, which makes it emit the possibly following
1657
+ # `{` as `tLBRACE_ARG`.
1658
+ #
1659
+ # The default post-`expr_endarg` state is `expr_end`, so this state also handles
1660
+ # `do` (as `kDO_BLOCK` in `expr_beg`).
1661
+ expr_endarg := |*
1662
+ e_lbrace
1663
+ => {
1664
+ if @lambda_stack.last == @paren_nest # A depth pushed by the -> rule of expr_end matches the current one.
1665
+ @lambda_stack.pop
1666
+ emit(:tLAMBEG, '{'.freeze)
1667
+ else
1668
+ emit(:tLBRACE_ARG, '{'.freeze)
1669
+ end
1670
+ @paren_nest += 1 # e_rbrace decrements it again.
1671
+ @command_start = true
1672
+ fnext expr_value; fbreak;
1673
+ };
1674
+
1675
+ 'do'
1676
+ => { emit_do(true)
1677
+ fnext expr_value; fbreak; };
1678
+
1679
+ w_space_comment;
1680
+
1681
+ c_any
1682
+ => { fhold; fgoto expr_end; }; # Everything else is re-scanned by expr_end.
1683
+
1684
+ c_eof => do_eof;
1685
+ *|;
1686
+
1687
+ # The rationale for this state is that several keywords accept value
1688
+ # (i.e. should transition to `expr_beg`), do not accept it like a command
1689
+ # (i.e. not an `expr_arg`), and must behave like a statement, that is,
1690
+ # accept a modifier if/while/etc.
1691
+ #
1692
+ expr_mid := |*
1693
+ keyword_modifier # A modifier keyword directly after the statement keyword.
1694
+ => { emit_table(KEYWORDS)
1695
+ fnext expr_beg; fbreak; };
1696
+
1697
+ bareword
1698
+ => { p = @ts - 1; fgoto expr_beg; }; # Re-scan the word in expr_beg.
1699
+
1700
+ w_space_comment;
1701
+
1702
+ w_newline
1703
+ => { fhold; fgoto expr_end; }; # The newline itself is handled by expr_end.
1704
+
1705
+ c_any
1706
+ => { fhold; fgoto expr_beg; };
1707
+
1708
+ c_eof => do_eof;
1709
+ *|;
1710
+
1711
+ # Beginning of an expression.
1712
+ #
1713
+ # Don't fall through to this state from `c_any`; make sure to handle
1714
+ # `c_space* c_nl` and let `expr_end` handle the newline.
1715
+ # Otherwise code like `f\ndef x` gets glued together and the parser
1716
+ # explodes.
1717
+ #
1718
+ expr_beg := |*
1719
+ # +5, -5, - 5
1720
+ [+\-] w_any* [0-9]
1721
+ => {
1722
+ emit(:tUNARY_NUM, tok(@ts, @ts + 1), @ts, @ts + 1)
1723
+ fhold; fnext expr_end; fbreak;
1724
+ };
1725
+
1726
+ # splat *a
1727
+ '*'
1728
+ => { emit(:tSTAR, '*'.freeze)
1729
+ fbreak; };
1730
+
1731
+ #
1732
+ # STRING AND REGEXP LITERALS
1733
+ #
1734
+
1735
+ # /regexp/oui
1736
+ # /=/ (disambiguation with /=)
1737
+ '/' c_any
1738
+ => {
1739
+ type = delimiter = tok[0].chr
1740
+ fhold; fgoto *push_literal(type, delimiter, @ts);
1741
+ };
1742
+
1743
+ # %<string>
1744
+ '%' ( any - [A-Za-z] )
1745
+ => {
1746
+ type, delimiter = @source_buffer.slice(@ts).chr, tok[-1].chr
1747
+ fgoto *push_literal(type, delimiter, @ts);
1748
+ };
1749
+
1750
+ # %w(we are the people)
1751
+ '%' [A-Za-z]+ c_any
1752
+ => {
1753
+ type, delimiter = tok[0..-2], tok[-1].chr
1754
+ fgoto *push_literal(type, delimiter, @ts);
1755
+ };
1756
+
1757
+ '%' c_eof
1758
+ => {
1759
+ diagnostic :fatal, :string_eof, nil, range(@ts, @ts + 1)
1760
+ };
1761
+
1762
+ # Heredoc start.
1763
+ # <<END | <<'END' | <<"END" | <<`END` |
1764
+ # <<-END | <<-'END' | <<-"END" | <<-`END` |
1765
+ # <<~END | <<~'END' | <<~"END" | <<~`END`
1766
+ '<<' [~\-]?
1767
+ ( '"' ( any - '"' )* '"'
1768
+ | "'" ( any - "'" )* "'"
1769
+ | "`" ( any - "`" )* "`"
1770
+ | bareword ) % { heredoc_e = p }
1771
+ c_line* c_nl % { new_herebody_s = p }
1772
+ => {
1773
+ tok(@ts, heredoc_e) =~ /^<<(-?)(~?)(["'`]?)(.*)\3$/m
1774
+
1775
+ indent = !$1.empty? || !$2.empty?
1776
+ dedent_body = !$2.empty?
1777
+ type = $3.empty? ? '<<"'.freeze : ('<<'.freeze + $3)
1778
+ delimiter = $4
1779
+
1780
+ if @version >= 27
1781
+ if delimiter.count("\n") > 0 || delimiter.count("\r") > 0
1782
+ diagnostic :error, :unterminated_heredoc_id, nil, range(@ts, @ts + 1)
1783
+ end
1784
+ elsif @version >= 24
1785
+ if delimiter.count("\n") > 0
1786
+ if delimiter.end_with?("\n")
1787
+ diagnostic :warning, :heredoc_id_ends_with_nl, nil, range(@ts, @ts + 1)
1788
+ delimiter = delimiter.rstrip
1789
+ else
1790
+ diagnostic :fatal, :heredoc_id_has_newline, nil, range(@ts, @ts + 1)
1791
+ end
1792
+ end
1793
+ end
1794
+
1795
+ if dedent_body && version?(18, 19, 20, 21, 22)
1796
+ emit(:tLSHFT, '<<'.freeze, @ts, @ts + 2)
1797
+ p = @ts + 1
1798
+ fnext expr_beg; fbreak;
1799
+ else
1800
+ fnext *push_literal(type, delimiter, @ts, heredoc_e, indent, dedent_body);
1801
+
1802
+ @herebody_s ||= new_herebody_s
1803
+ p = @herebody_s - 1
1804
+ end
1805
+ };
1806
+
1807
+ # Quoted heredoc start whose closing quote is missing on the same line:
1808
+ # <<'END | <<"END | <<`END |
1809
+ # <<-'END | <<-"END | <<-`END |
1810
+ # <<~'END | <<~"END | <<~`END
1811
+ # Each alternative takes zero or more characters other than a newline or its quote.
1812
+ # If the heredoc identifier is terminated, the longer match of the rule above wins.
1813
+ '<<' [~\-]?
1814
+ ('"' (any - c_nl - '"')*
1815
+ |"'" (any - c_nl - "'")*
1816
+ |"`" (any - c_nl - "`")*
1817
+ )
1818
+ => {
1819
+ diagnostic :error, :unterminated_heredoc_id, nil, range(@ts, @ts + 1)
1820
+ };
1821
+
1822
+ #
1823
+ # SYMBOL LITERALS
1824
+ #
1825
+
1826
+ # :&&, :||
1827
+ ':' ('&&' | '||') => {
1828
+ fhold; fhold;
1829
+ emit(:tSYMBEG, tok(@ts, @ts + 1), @ts, @ts + 1)
1830
+ fgoto expr_fname;
1831
+ };
1832
+
1833
+ # :"bar", :'baz'
1834
+ ':' ['"] # '
1835
+ => {
1836
+ type, delimiter = tok, tok[-1].chr
1837
+ fgoto *push_literal(type, delimiter, @ts);
1838
+ };
1839
+
1840
+ # :!@ is :!
1841
+ # :~@ is :~
1842
+ ':' [!~] '@'
1843
+ => {
1844
+ emit(:tSYMBOL, tok(@ts + 1, @ts + 2))
1845
+ fnext expr_end; fbreak;
1846
+ };
1847
+
1848
+ ':' bareword ambiguous_symbol_suffix
1849
+ => {
1850
+ emit(:tSYMBOL, tok(@ts + 1, tm), @ts, tm)
1851
+ p = tm - 1
1852
+ fnext expr_end; fbreak;
1853
+ };
1854
+
1855
+ ':' ( bareword | global_var | class_var | instance_var |
1856
+ operator_fname | operator_arithmetic | operator_rest )
1857
+ => {
1858
+ emit(:tSYMBOL, tok(@ts + 1), @ts)
1859
+ fnext expr_end; fbreak;
1860
+ };
1861
+
1862
+ ':' ( '@' %{ tm = p - 1; diag_msg = :ivar_name }
1863
+ | '@@' %{ tm = p - 2; diag_msg = :cvar_name }
1864
+ ) [0-9]*
1865
+ => {
1866
+ if @version >= 27
1867
+ diagnostic :error, diag_msg, { name: tok(tm, @te) }, range(tm, @te)
1868
+ else
1869
+ emit(:tCOLON, tok(@ts, @ts + 1), @ts, @ts + 1)
1870
+ p = @ts
1871
+ end
1872
+
1873
+ fnext expr_end; fbreak;
1874
+ };
1875
+
1876
+ #
1877
+ # AMBIGUOUS TERNARY OPERATOR
1878
+ #
1879
+
1880
+ # Character constant, like ?a, ?\n, ?\u1000, and so on
1881
+ # Don't accept \u escape with multiple codepoints, like \u{1 2 3}
1882
+ '?' ( e_bs ( escape - ( '\u{' (xdigit+ [ \t]+)+ xdigit+ '}' ))
1883
+ | (c_any - c_space_nl - e_bs) % { @escape = nil }
1884
+ )
1885
+ => {
1886
+ value = @escape || tok(@ts + 1)
1887
+
1888
+ if version?(18)
1889
+ emit(:tINTEGER, value.getbyte(0))
1890
+ else
1891
+ emit(:tCHARACTER, value)
1892
+ end
1893
+
1894
+ fnext expr_end; fbreak;
1895
+ };
1896
+
1897
+ '?' c_space_nl
1898
+ => {
1899
+ escape = { " " => '\s', "\r" => '\r', "\n" => '\n', "\t" => '\t',
1900
+ "\v" => '\v', "\f" => '\f' }[@source_buffer.slice(@ts + 1)]
1901
+ diagnostic :warning, :invalid_escape_use, { :escape => escape }, range
1902
+
1903
+ p = @ts - 1
1904
+ fgoto expr_end;
1905
+ };
1906
+
1907
+ '?' c_eof
1908
+ => {
1909
+ diagnostic :fatal, :incomplete_escape, nil, range(@ts, @ts + 1)
1910
+ };
1911
+
1912
+ # f ?aa : b: Disambiguate with a character literal.
1913
+ '?' [A-Za-z_] bareword
1914
+ => {
1915
+ p = @ts - 1
1916
+ fgoto expr_end;
1917
+ };
1918
+
1919
+ #
1920
+ # AMBIGUOUS EMPTY BLOCK ARGUMENTS
1921
+ #
1922
+
1923
+ # Ruby >= 2.7 emits it as two tPIPE terminals
1924
+ # while Ruby < 2.7 as a single tOROP (like in `a || b`)
1925
+ '||'
1926
+ => {
1927
+ if @version >= 27
1928
+ emit(:tPIPE, tok(@ts, @ts + 1), @ts, @ts + 1)
1929
+ fhold;
1930
+ fnext expr_beg; fbreak;
1931
+ else
1932
+ p -= 2
1933
+ fgoto expr_end;
1934
+ end
1935
+ };
1936
+
1937
+ #
1938
+ # KEYWORDS AND PUNCTUATION
1939
+ #
1940
+
1941
+ # a({b=>c})
1942
+ e_lbrace
1943
+ => {
1944
+ if @lambda_stack.last == @paren_nest
1945
+ @lambda_stack.pop
1946
+ @command_start = true
1947
+ emit(:tLAMBEG, '{'.freeze)
1948
+ else
1949
+ emit(:tLBRACE, '{'.freeze)
1950
+ end
1951
+ @paren_nest += 1
1952
+ fbreak;
1953
+ };
1954
+
1955
+ # a([1, 2])
1956
+ e_lbrack
1957
+ => { emit(:tLBRACK, '['.freeze)
1958
+ fbreak; };
1959
+
1960
+ # a()
1961
+ e_lparen
1962
+ => { emit(:tLPAREN, '('.freeze)
1963
+ fbreak; };
1964
+
1965
+ # a(+b)
1966
+ punctuation_begin
1967
+ => { emit_table(PUNCTUATION_BEGIN)
1968
+ fbreak; };
1969
+
1970
+ # rescue Exception => e: Block rescue.
1971
+ # Special because it should transition to expr_mid.
1972
+ 'rescue' %{ tm = p } '=>'?
1973
+ => { emit(:kRESCUE, 'rescue'.freeze, @ts, tm)
1974
+ p = tm - 1
1975
+ fnext expr_mid; fbreak; };
1976
+
1977
+ # if a: Statement if.
1978
+ keyword_modifier
1979
+ => { emit_table(KEYWORDS_BEGIN)
1980
+ @command_start = true
1981
+ fnext expr_value; fbreak; };
1982
+
1983
+ #
1984
+ # RUBY 1.9 HASH LABELS
1985
+ #
1986
+
1987
+ label ( any - ':' )
1988
+ => {
1989
+ fhold;
1990
+
1991
+ if version?(18)
1992
+ ident = tok(@ts, @te - 2)
1993
+
1994
+ emit((@source_buffer.slice(@ts) =~ /[A-Z]/) ? :tCONSTANT : :tIDENTIFIER,
1995
+ ident, @ts, @te - 2)
1996
+ fhold; # continue as a symbol
1997
+
1998
+ if !@static_env.nil? && @static_env.declared?(ident)
1999
+ fnext expr_end;
2000
+ else
2001
+ fnext *arg_or_cmdarg(cmd_state);
2002
+ end
2003
+ else
2004
+ emit(:tLABEL, tok(@ts, @te - 2), @ts, @te - 1)
2005
+ fnext expr_labelarg;
2006
+ end
2007
+
2008
+ fbreak;
2009
+ };
2010
+
2011
+ #
2012
+ # RUBY 2.7 BEGINLESS RANGE
2013
+
2014
+ '..'
2015
+ => {
2016
+ if @version >= 27
2017
+ emit(:tBDOT2)
2018
+ else
2019
+ emit(:tDOT2)
2020
+ end
2021
+
2022
+ fnext expr_beg; fbreak;
2023
+ };
2024
+
2025
+ '...'
2026
+ => {
2027
+ if @version >= 27
2028
+ emit(:tBDOT3)
2029
+ else
2030
+ emit(:tDOT3)
2031
+ end
2032
+
2033
+ fnext expr_beg; fbreak;
2034
+ };
2035
+
2036
+ #
2037
+ # CONTEXT-DEPENDENT VARIABLE LOOKUP OR COMMAND INVOCATION
2038
+ #
2039
+
2040
+ # foo= bar: Disambiguate with bareword rule below.
2041
+ bareword ambiguous_ident_suffix |
2042
+ # def foo: Disambiguate with bareword rule below.
2043
+ keyword
2044
+ => { p = @ts - 1
2045
+ fgoto expr_end; };
2046
+
2047
+ # a = 42; a [42]: Indexing.
2048
+ # def a; end; a [42]: Array argument.
2049
+ call_or_var
2050
+ => local_ident;
2051
+
2052
+ (call_or_var - keyword)
2053
+ % { ident_tok = tok; ident_ts = @ts; ident_te = @te; }
2054
+ w_space+ '('
2055
+ => {
2056
+ emit(:tIDENTIFIER, ident_tok, ident_ts, ident_te)
2057
+ p = ident_te - 1
2058
+
2059
+ if !@static_env.nil? && @static_env.declared?(ident_tok) && @version < 25
2060
+ fnext expr_endfn;
2061
+ else
2062
+ fnext expr_cmdarg;
2063
+ end
2064
+ fbreak;
2065
+ };
2066
+
2067
+ #
2068
+ # WHITESPACE
2069
+ #
2070
+
2071
+ w_any;
2072
+
2073
+ e_heredoc_nl '=begin' ( c_space | c_nl_zlen )
2074
+ => {
2075
+ p = @ts - 1
2076
+ @cs_before_block_comment = @cs
2077
+ fgoto line_begin;
2078
+ };
2079
+
2080
+ #
2081
+ # DEFAULT TRANSITION
2082
+ #
2083
+
2084
+ # The following rules match most binary and all unary operators.
2085
+ # Rules for binary operators provide better error reporting.
2086
+ operator_arithmetic '=' |
2087
+ operator_rest |
2088
+ punctuation_end |
2089
+ c_any
2090
+ => { p = @ts - 1; fgoto expr_end; };
2091
+
2092
+ c_eof => do_eof;
2093
+ *|;
2094
+
2095
+ # Special newline handling for "def a b:"
2096
+ #
2097
+ expr_labelarg := |*
2098
+ w_space_comment;
2099
+
2100
+ w_newline
2101
+ => {
2102
+ if @in_kwarg # With @in_kwarg set, expr_end gets to handle the newline.
2103
+ fhold; fgoto expr_end;
2104
+ else # Otherwise the newline is consumed and lexing restarts in line_begin.
2105
+ fgoto line_begin;
2106
+ end
2107
+ };
2108
+
2109
+ c_any
2110
+ => { fhold; fgoto expr_beg; }; # Everything else is re-scanned by expr_beg.
2111
+
2112
+ c_eof => do_eof;
2113
+ *|;
2114
+
2115
+ # Like expr_beg, but no 1.9 label or 2.2 quoted label possible.
2116
+ #
2117
+ expr_value := |*
2118
+ # No label is recognized here, so a:b is re-scanned by expr_end: a(:b), a::B, A::B
2119
+ label (any - ':')
2120
+ => { p = @ts - 1
2121
+ fgoto expr_end; };
2122
+
2123
+ # "bar", 'baz'
2124
+ ['"] # '
2125
+ => {
2126
+ fgoto *push_literal(tok, tok, @ts); # The quote is passed as both literal type and delimiter.
2127
+ };
2128
+
2129
+ w_space_comment;
2130
+
2131
+ w_newline
2132
+ => { fgoto line_begin; };
2133
+
2134
+ c_any
2135
+ => { fhold; fgoto expr_beg; }; # Everything else is re-scanned by expr_beg.
2136
+
2137
+ c_eof => do_eof;
2138
+ *|;
2139
+
2140
+ expr_end := |* # Sections below: lambda/block openers, keywords, numbers, strings, constants/variables, calls, operators, whitespace.
2141
+ #
2142
+ # STABBY LAMBDA
2143
+ #
2144
+
2145
+ '->'
2146
+ => {
2147
+ emit(:tLAMBDA, '->'.freeze, @ts, @ts + 2)
2148
+
2149
+ @lambda_stack.push @paren_nest # record the paren depth at which this lambda was opened
2150
+ fnext expr_endfn; fbreak;
2151
+ };
2152
+
2153
+ e_lbrace | 'do'
2154
+ => {
2155
+ if @lambda_stack.last == @paren_nest # same depth as the most recently pushed lambda: this opens its body
2156
+ @lambda_stack.pop
2157
+
2158
+ if tok == '{'.freeze
2159
+ emit(:tLAMBEG, '{'.freeze)
2160
+ else # 'do'
2161
+ emit(:kDO_LAMBDA, 'do'.freeze)
2162
+ end
2163
+ else
2164
+ if tok == '{'.freeze
2165
+ emit(:tLCURLY, '{'.freeze)
2166
+ else # 'do'
2167
+ emit_do
2168
+ end
2169
+ end
2170
+ if tok == '{'.freeze # only an opening brace deepens @paren_nest; 'do' does not
2171
+ @paren_nest += 1
2172
+ end
2173
+ @command_start = true
2174
+
2175
+ fnext expr_value; fbreak;
2176
+ };
2177
+
2178
+ #
2179
+ # KEYWORDS
2180
+ #
2181
+
2182
+ keyword_with_fname
2183
+ => { emit_table(KEYWORDS)
2184
+ fnext expr_fname; fbreak; };
2185
+
2186
+ 'class' w_any* '<<'
2187
+ => { emit(:kCLASS, 'class'.freeze, @ts, @ts + 5)
2188
+ emit(:tLSHFT, '<<'.freeze, @te - 2, @te)
2189
+ fnext expr_value; fbreak; };
2190
+
2191
+ # a if b:c: Syntax error.
2192
+ keyword_modifier
2193
+ => { emit_table(KEYWORDS)
2194
+ fnext expr_beg; fbreak; };
2195
+
2196
+ # elsif b:c: elsif b(:c)
2197
+ keyword_with_value
2198
+ => { emit_table(KEYWORDS)
2199
+ @command_start = true
2200
+ fnext expr_value; fbreak; };
2201
+
2202
+ keyword_with_mid
2203
+ => { emit_table(KEYWORDS)
2204
+ fnext expr_mid; fbreak; };
2205
+
2206
+ keyword_with_arg
2207
+ => {
2208
+ emit_table(KEYWORDS)
2209
+
2210
+ if version?(18) && tok == 'not'.freeze # on 1.8, 'not' is followed by expr_beg rather than expr_arg
2211
+ fnext expr_beg; fbreak;
2212
+ else
2213
+ fnext expr_arg; fbreak;
2214
+ end
2215
+ };
2216
+
2217
+ '__ENCODING__'
2218
+ => {
2219
+ if version?(18) # on 1.8 this is emitted as a plain identifier, not as a keyword
2220
+ emit(:tIDENTIFIER)
2221
+
2222
+ unless !@static_env.nil? && @static_env.declared?(tok)
2223
+ fnext *arg_or_cmdarg(cmd_state); # not a declared local: continue in the state picked by arg_or_cmdarg
2224
+ end
2225
+ else
2226
+ emit(:k__ENCODING__, '__ENCODING__'.freeze)
2227
+ end
2228
+ fbreak;
2229
+ };
2230
+
2231
+ keyword_with_end
2232
+ => { emit_table(KEYWORDS)
2233
+ fbreak; };
2234
+
2235
+ #
2236
+ # NUMERIC LITERALS
2237
+ #
2238
+
2239
+ ( '0' [Xx] %{ @num_base = 16; @num_digits_s = p } int_hex
2240
+ | '0' [Dd] %{ @num_base = 10; @num_digits_s = p } int_dec
2241
+ | '0' [Oo] %{ @num_base = 8; @num_digits_s = p } int_dec
2242
+ | '0' [Bb] %{ @num_base = 2; @num_digits_s = p } int_bin
2243
+ | [1-9] digit* '_'? %{ @num_base = 10; @num_digits_s = @ts } int_dec
2244
+ | '0' digit* '_'? %{ @num_base = 8; @num_digits_s = @ts } int_dec
2245
+ ) %{ @num_suffix_s = p } int_suffix
2246
+ => {
2247
+ digits = tok(@num_digits_s, @num_suffix_s) # digit text from where the digits start up to where the suffix starts
2248
+
2249
+ if digits.end_with? '_'.freeze
2250
+ diagnostic :error, :trailing_in_number, { :character => '_'.freeze },
2251
+ range(@te - 1, @te)
2252
+ elsif digits.empty? && @num_base == 8 && version?(18)
2253
+ # 1.8 did not raise an error on 0o.
2254
+ digits = '0'.freeze
2255
+ elsif digits.empty?
2256
+ diagnostic :error, :empty_numeric
2257
+ elsif @num_base == 8 && (invalid_idx = digits.index(/[89]/))
2258
+ invalid_s = @num_digits_s + invalid_idx
2259
+ diagnostic :error, :invalid_octal, nil,
2260
+ range(invalid_s, invalid_s + 1)
2261
+ end
2262
+
2263
+ if version?(18, 19, 20) # these versions emit tINTEGER here and resume scanning at the suffix position
2264
+ emit(:tINTEGER, digits.to_i(@num_base), @ts, @num_suffix_s)
2265
+ p = @num_suffix_s - 1
2266
+ else
2267
+ @num_xfrm.call(digits.to_i(@num_base)) # NOTE(review): @num_xfrm is assigned outside this chunk, presumably by the suffix rule -- confirm
2268
+ end
2269
+ fbreak;
2270
+ };
2271
+
2272
+ flo_frac flo_pow?
2273
+ => {
2274
+ diagnostic :error, :no_dot_digit_literal
2275
+ };
2276
+
2277
+ flo_int [eE]
2278
+ => {
2279
+ if version?(18, 19, 20)
2280
+ diagnostic :error,
2281
+ :trailing_in_number, { :character => tok(@te - 1, @te) },
2282
+ range(@te - 1, @te)
2283
+ else
2284
+ emit(:tINTEGER, tok(@ts, @te - 1).to_i, @ts, @te - 1)
2285
+ fhold; fbreak;
2286
+ end
2287
+ };
2288
+
2289
+ flo_int flo_frac [eE]
2290
+ => {
2291
+ if version?(18, 19, 20)
2292
+ diagnostic :error,
2293
+ :trailing_in_number, { :character => tok(@te - 1, @te) },
2294
+ range(@te - 1, @te)
2295
+ else
2296
+ emit(:tFLOAT, tok(@ts, @te - 1).to_f, @ts, @te - 1)
2297
+ fhold; fbreak;
2298
+ end
2299
+ };
2300
+
2301
+ flo_int
2302
+ ( flo_frac? flo_pow %{ @num_suffix_s = p } flo_pow_suffix
2303
+ | flo_frac %{ @num_suffix_s = p } flo_suffix
2304
+ )
2305
+ => {
2306
+ digits = tok(@ts, @num_suffix_s)
2307
+
2308
+ if version?(18, 19, 20)
2309
+ emit(:tFLOAT, Float(digits), @ts, @num_suffix_s)
2310
+ p = @num_suffix_s - 1
2311
+ else
2312
+ @num_xfrm.call(digits)
2313
+ end
2314
+ fbreak;
2315
+ };
2316
+
2317
+ #
2318
+ # STRING AND XSTRING LITERALS
2319
+ #
2320
+
2321
+ # `echo foo`, "bar", 'baz'
2322
+ '`' | ['"] # '
2323
+ => {
2324
+ type, delimiter = tok, tok[-1].chr
2325
+ fgoto *push_literal(type, delimiter, @ts, nil, false, false, true);
2326
+ };
2327
+
2328
+ #
2329
+ # CONSTANTS AND VARIABLES
2330
+ #
2331
+
2332
+ constant
2333
+ => { emit(:tCONSTANT)
2334
+ fnext *arg_or_cmdarg(cmd_state); fbreak; };
2335
+
2336
+ constant ambiguous_const_suffix
2337
+ => { emit(:tCONSTANT, tok(@ts, tm), @ts, tm)
2338
+ p = tm - 1; fbreak; }; # resume scanning at tm (marker assigned outside this chunk)
2339
+
2340
+ global_var | class_var_v | instance_var_v
2341
+ => { p = @ts - 1; fcall expr_variable; };
2342
+
2343
+ #
2344
+ # METHOD CALLS
2345
+ #
2346
+
2347
+ '.:' w_space+
2348
+ => { emit(:tDOT, '.', @ts, @ts + 1)
2349
+ emit(:tCOLON, ':', @ts + 1, @ts + 2)
2350
+ p = p - tok.length + 2 # rewind so that scanning resumes right after the two emitted characters
2351
+ fnext expr_dot; fbreak; };
2352
+
2353
+ '.:'
2354
+ => {
2355
+ if @version >= 27 # 2.7+: emit '.:' via the PUNCTUATION table; older: emit just the dot and push ':' back
2356
+ emit_table(PUNCTUATION)
2357
+ else
2358
+ emit(:tDOT, tok(@ts, @ts + 1), @ts, @ts + 1)
2359
+ fhold;
2360
+ end
2361
+
2362
+ fnext expr_dot; fbreak;
2363
+ };
2364
+
2365
+ '.' | '&.' | '::'
2366
+ => { emit_table(PUNCTUATION)
2367
+ fnext expr_dot; fbreak; };
2368
+
2369
+ call_or_var
2370
+ => local_ident;
2371
+
2372
+ bareword ambiguous_fid_suffix
2373
+ => {
2374
+ if tm == @te
2375
+ # Suffix was consumed, e.g. foo!
2376
+ emit(:tFID)
2377
+ else
2378
+ # Suffix was not consumed, e.g. foo!=
2379
+ emit(:tIDENTIFIER, tok(@ts, tm), @ts, tm)
2380
+ p = tm - 1
2381
+ end
2382
+ fnext expr_arg; fbreak;
2383
+ };
2384
+
2385
+ #
2386
+ # OPERATORS
2387
+ #
2388
+
2389
+ '*' | '=>'
2390
+ => {
2391
+ emit_table(PUNCTUATION)
2392
+ fgoto expr_value; # NOTE(review): emits a token but uses fgoto rather than fnext/fbreak, unlike the sibling rules -- confirm this is intentional
2393
+ };
2394
+
2395
+ # When '|', '~', '!', '=>' are used as operators
2396
+ # they do not accept any symbols (or quoted labels) after.
2397
+ # Other binary operators accept it.
2398
+ ( operator_arithmetic | operator_rest ) - ( '|' | '~' | '!' | '*' )
2399
+ => {
2400
+ emit_table(PUNCTUATION);
2401
+ fnext expr_value; fbreak;
2402
+ };
2403
+
2404
+ ( e_lparen | '|' | '~' | '!' )
2405
+ => { emit_table(PUNCTUATION)
2406
+ fnext expr_beg; fbreak; };
2407
+
2408
+ e_rbrace | e_rparen | e_rbrack
2409
+ => {
2410
+ emit_table(PUNCTUATION)
2411
+
2412
+ if @version < 24 # before 2.4 @cond and @cmdarg are unwound with lexpop, from 2.4 on with pop
2413
+ @cond.lexpop
2414
+ @cmdarg.lexpop
2415
+ else
2416
+ @cond.pop
2417
+ @cmdarg.pop
2418
+ end
2419
+
2420
+ if tok == '}'.freeze || tok == ']'.freeze
2421
+ if @version >= 25
2422
+ fnext expr_end;
2423
+ else
2424
+ fnext expr_endarg;
2425
+ end
2426
+ else # )
2427
+ # fnext expr_endfn; ?
2428
+ end
2429
+
2430
+ fbreak;
2431
+ };
2432
+
2433
+ operator_arithmetic '='
2434
+ => { emit(:tOP_ASGN, tok(@ts, @te - 1)) # token value is the operator text without the trailing '='
2435
+ fnext expr_beg; fbreak; };
2436
+
2437
+ '?'
2438
+ => { emit(:tEH, '?'.freeze)
2439
+ fnext expr_value; fbreak; };
2440
+
2441
+ e_lbrack
2442
+ => { emit(:tLBRACK2, '['.freeze)
2443
+ fnext expr_beg; fbreak; };
2444
+
2445
+ '...' c_nl
2446
+ => {
2447
+ if @paren_nest == 0
2448
+ diagnostic :warning, :triple_dot_at_eol, nil, range(@ts, @te - 1)
2449
+ end
2450
+
2451
+ emit(:tDOT3, '...'.freeze, @ts, @te - 1) # the token range excludes the newline that was matched
2452
+ fhold;
2453
+ fnext expr_beg; fbreak;
2454
+ };
2455
+
2456
+ punctuation_end
2457
+ => { emit_table(PUNCTUATION)
2458
+ fnext expr_beg; fbreak; };
2459
+
2460
+ #
2461
+ # WHITESPACE
2462
+ #
2463
+
2464
+ w_space_comment;
2465
+
2466
+ w_newline
2467
+ => { fgoto leading_dot; }; # no token is emitted here; leading_dot decides whether the newline becomes tNL
2468
+
2469
+ ';'
2470
+ => { emit(:tSEMI, ';'.freeze)
2471
+ @command_start = true
2472
+ fnext expr_value; fbreak; };
2473
+
2474
+ '\\' c_line {
2475
+ diagnostic :error, :bare_backslash, nil, range(@ts, @ts + 1)
2476
+ fhold;
2477
+ };
2478
+
2479
+ c_any
2480
+ => {
2481
+ diagnostic :fatal, :unexpected, { :character => tok.inspect[1..-2] }
2482
+ };
2483
+
2484
+ c_eof => do_eof;
2485
+ *|;
2486
+
2487
+ leading_dot := |* # entered from expr_end after a newline; decides whether that newline is emitted as tNL
2488
+ # Insane leading dots:
2489
+ # a #comment
2490
+ # # post-2.7 comment
2491
+ # .b: a.b
2492
+
2493
+ # Here we use '\n' instead of w_newline to not modify @newline_s
2494
+ # and eventually properly emit tNL
2495
+ (c_space* w_space_comment '\n')+
2496
+ => {
2497
+ if @version < 27
2498
+ # Ruby before 2.7 doesn't support comments before leading dot.
2499
+ # If a line after "a" starts with a comment then "a" is a self-contained statement.
2500
+ # So in that case we emit a special tNL token and start reading the
2501
+ # next line as a separate statement.
2502
+ #
2503
+ # Note: block comments before leading dot are not supported on any version of Ruby.
2504
+ emit(:tNL, nil, @newline_s, @newline_s + 1)
2505
+ fhold; fnext line_begin; fbreak;
2506
+ end # on 2.7+ nothing is emitted and scanning stays in leading_dot
2507
+ };
2508
+
2509
+ c_space* %{ tm = p } ('.' | '&.')
2510
+ => { p = tm - 1; fgoto expr_end; }; # continuation line: resume at the dot in expr_end without emitting tNL
2511
+
2512
+ any
2513
+ => { emit(:tNL, nil, @newline_s, @newline_s + 1) # anything else: emit the newline recorded at @newline_s as tNL
2514
+ fhold; fnext line_begin; fbreak; };
2515
+ *|;
2516
+
2517
+ #
2518
+ # === EMBEDDED DOCUMENT (aka BLOCK COMMENT) PARSING ===
2519
+ #
2520
+
2521
+ line_comment := |* # scans the body of an embedded document; line_begin jumps here after '=begin'
2522
+ '=end' c_line* c_nl_zlen
2523
+ => {
2524
+ emit_comment(@eq_begin_s, @te) # one comment spanning from the '=begin' start to the end of the '=end' line
2525
+ fgoto *@cs_before_block_comment; # jump to the state stored in @cs_before_block_comment (assigned outside this chunk)
2526
+ };
2527
+
2528
+ c_line* c_nl; # any other complete line: no action, keep scanning
2529
+
2530
+ c_line* zlen # NOTE(review): presumably reached when input ends before '=end' -- confirm zlen semantics
2531
+ => {
2532
+ diagnostic :fatal, :embedded_document, nil,
2533
+ range(@eq_begin_s, @eq_begin_s + '=begin'.length)
2534
+ };
2535
+ *|;
2536
+
2537
+ line_begin := |* # scanner entered after a newline (see the jumps from expr_value and leading_dot)
2538
+ w_any; # no action: the matched input is discarded
2539
+
2540
+ '=begin' ( c_space | c_nl_zlen )
2541
+ => { @eq_begin_s = @ts # remembered for emit_comment and the :embedded_document diagnostic in line_comment
2542
+ fgoto line_comment; };
2543
+
2544
+ '__END__' ( c_eol - zlen )
2545
+ => { p = pe - 3 }; # NOTE(review): moves p close to pe, presumably so the rest of the input is not lexed -- confirm the offset
2546
+
2547
+ c_any
2548
+ => { cmd_state = true; fhold; fgoto expr_value; }; # push the character back and lex it in expr_value with cmd_state set
2549
+
2550
+ c_eof => do_eof;
2551
+ *|;
2552
+
2553
+ }%%
2554
+ # %
2555
+ end
2556
+ end