ruby-next-parser 3.1.1.3 → 3.4.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,2641 +0,0 @@
1
- %%machine lex; # % fix highlighting
2
-
3
- #
4
- # === BEFORE YOU START ===
5
- #
6
- # Read the Ruby Hacking Guide chapter 11, available in English at
7
- # http://whitequark.org/blog/2013/04/01/ruby-hacking-guide-ch-11-finite-state-lexer/
8
- #
9
- # Remember two things about Ragel scanners:
10
- #
11
- # 1) Longest match wins.
12
- #
13
- # 2) If two matches have the same length, the first
14
- # in source code wins.
15
- #
16
- # General rules of making Ragel and Bison happy:
17
- #
18
- # * `p` (position) and `@te` contain the index of the character
19
- # they're pointing to ("current"), plus one. `@ts` contains the index
20
- # of the corresponding character. The code for extracting matched token is:
21
- #
22
- # @source_buffer.slice(@ts...@te)
23
- #
24
- # * If your input is `foooooooobar` and the rule is:
25
- #
26
- # 'f' 'o'+
27
- #
28
- # the result will be:
29
- #
30
- # foooooooobar
31
- # ^ ts=0   ^ p=te=9
32
- #
33
- # * A Ragel lexer action should not emit more than one token, unless
34
- # you know what you are doing.
35
- #
36
- # * All Ragel commands (fnext, fgoto, ...) end with a semicolon.
37
- #
38
- # * If an action emits the token and transitions to another state, use
39
- # these Ragel commands:
40
- #
41
- # emit($whatever)
42
- # fnext $next_state; fbreak;
43
- #
44
- # If you perform `fgoto` in an action which neither emits a token nor
45
- # rewinds the stream pointer, the parser's side-effectful,
46
- # context-sensitive lookahead actions will break in a hard to detect
47
- # and debug way.
48
- #
49
- # * If an action does not emit a token:
50
- #
51
- # fgoto $next_state;
52
- #
53
- # * If an action features lookbehind, i.e. matches characters with the
54
- # intent of passing them to another action:
55
- #
56
- # p = @ts - 1
57
- # fgoto $next_state;
58
- #
59
- # or, if the lookbehind consists of a single character:
60
- #
61
- # fhold; fgoto $next_state;
62
- #
63
- # * Ragel merges actions. So, if you have `e_lparen = '(' %act` and
64
- # `c_lparen = '('` and a lexer action `e_lparen | c_lparen`, the result
65
- # _will_ invoke the action `act`.
66
- #
67
- # e_something stands for "something with **e**mbedded action".
68
- #
69
- # * EOF is explicit and is matched by `c_eof`. If you want to introspect
70
- # the state of the lexer, add this rule to the state:
71
- #
72
- # c_eof => do_eof;
73
- #
74
- # * If you proceed past EOF, the lexer will complain:
75
- #
76
- # NoMethodError: undefined method `ord' for nil:NilClass
77
- #
78
-
79
- class Parser::Lexer
80
- class Next
81
-
82
- %% write data nofinal;
83
- # %
84
-
85
# Single-character backslash escapes, keyed by the codepoint of the
# character that follows the backslash.
ESCAPES = Hash[
  [
    ['a', "\a"], ['b', "\b"], ['e', "\e"], ['f', "\f"],
    ['n', "\n"], ['r', "\r"], ['s', "\s"], ['t', "\t"],
    ['v', "\v"], ['\\', "\\"]
  ].map { |char, replacement| [char.ord, replacement] }
].freeze
90
-
91
# Matches any one of the characters that are regexp metacharacters.
REGEXP_META_CHARACTERS = Regexp.union("\\$()*+.<>?[]^{|}".each_char.to_a).freeze
92
-
93
- attr_reader :source_buffer
94
-
95
- attr_accessor :diagnostics
96
- attr_accessor :static_env
97
- attr_accessor :force_utf32
98
-
99
- attr_accessor :cond, :cmdarg, :context, :command_start
100
-
101
- attr_accessor :tokens, :comments
102
-
103
- attr_reader :paren_nest, :cmdarg_stack, :cond_stack, :lambda_stack
104
-
105
# Builds a lexer for the given Ruby version. The static environment,
# context, token list and comment list start out unset; everything else
# is initialized by #reset.
def initialize(version)
  @version    = version
  @static_env = @context = nil
  @tokens     = @comments = nil

  reset
end
115
-
116
# Puts the lexer back into its initial state.
#
# reset_state - when true (the default) the Ragel entry state and the
#               cond/cmdarg state stacks are reinitialized too. Unit tests
#               set a state before resetting and pass false to keep it.
def reset(reset_state=true)
  if reset_state
    # Ragel state
    @cs = self.class.lex_en_line_begin

    @cond, @cmdarg = StackState.new('cond'), StackState.new('cmdarg')
    @cond_stack    = []
    @cmdarg_stack  = []
  end

  @force_utf32 = false # some tests switch this on
  @source_pts  = nil   # the source as an array of codepoints

  # Scanner registers
  @p   = 0             # stream position (saved manually in #advance)
  @ts  = @te = nil     # token start / token end
  @act = 0             # next action

  @stack = []          # state stack
  @top   = 0           # state stack top pointer

  # Lexer state
  @token_queue   = []
  @literal_stack = []

  # Locations of the last =begin, the last # and the last newline seen
  @eq_begin_s = @sharp_s = @newline_s = nil

  # Numeric literal bookkeeping: last base, where the digits start, where
  # the suffix starts, and the transformation induced by the suffix
  @num_base = @num_digits_s = @num_suffix_s = @num_xfrm = nil

  @escape_s = nil      # start of the current escape sequence
  @escape   = nil      # last escape sequence, as a string

  @herebody_s = nil    # start of the current heredoc line

  # ->() lambdas (Ruby 1.9) emit a distinct token when do/{ follows the
  # matching closing parenthesis, so parenthesis nesting is tracked.
  @paren_nest   = 0
  @lambda_stack = []

  # Indentation level of the last closed <<~SQUIGGLY_HEREDOC, handed to
  # the parser on request. It cannot be inferred from the AST alone:
  # escapes such as `\ ` or `\t` are expanded inside the lexer but count
  # as non-whitespace for indentation purposes.
  @dedent_level = nil

  # When the lexer is in `command state' (aka expr_value) on entry to
  # #advance, it moves to expr_cmdarg rather than expr_arg at certain
  # points.
  @command_start = true

  # True at the end of "def foo a:"
  @in_kwarg = false

  # State to return to after a =begin / =end block comment
  @cs_before_block_comment = self.class.lex_en_line_begin
end
183
-
184
# Attaches a source buffer (or detaches it when given nil) and caches the
# source as an array of integers: codepoints for UTF-8 sources, bytes for
# everything else. A leading byte order mark is skipped by starting the
# stream position after it.
def source_buffer=(source_buffer)
  @source_buffer = source_buffer

  unless @source_buffer
    @source_pts = nil
    return
  end

  text   = @source_buffer.source
  format = text.encoding == Encoding::UTF_8 ? 'U*' : 'C*'
  @source_pts = text.unpack(format)

  # Skip byte order mark.
  @p = 1 if @source_pts[0] == 0xfeff
end
204
-
205
# Encoding of the source text held by the current source buffer.
def encoding
  source_text = @source_buffer.source
  source_text.encoding
end
208
-
209
# Maps symbolic lexer state names to the Ragel entry points generated for
# them. #state and #state= translate between the two through this table.
LEX_STATES = {
  :line_begin    => lex_en_line_begin,
  :expr_dot      => lex_en_expr_dot,
  :expr_fname    => lex_en_expr_fname,
  :expr_value    => lex_en_expr_value,
  :expr_beg      => lex_en_expr_beg,
  :expr_mid      => lex_en_expr_mid,
  :expr_arg      => lex_en_expr_arg,
  :expr_cmdarg   => lex_en_expr_cmdarg,
  :expr_end      => lex_en_expr_end,
  :expr_endarg   => lex_en_expr_endarg,
  :expr_endfn    => lex_en_expr_endfn,
  :expr_labelarg => lex_en_expr_labelarg,

  :interp_string => lex_en_interp_string,
  :interp_words  => lex_en_interp_words,
  :plain_string  => lex_en_plain_string,
  # Every name needs its own entry point: #state inverts this table, so
  # two names sharing one value would make one of them unreportable.
  :plain_words   => lex_en_plain_words,
}
228
-
229
# Symbolic name of the current Ragel state, or the raw state number when
# the state has no entry in LEX_STATES.
def state
  name_by_entry = LEX_STATES.invert
  name_by_entry.key?(@cs) ? name_by_entry[@cs] : @cs
end
232
-
233
# Switches the scanner to the state with the given symbolic name.
# Raises KeyError when the name is not listed in LEX_STATES.
def state=(state)
  entry_point = LEX_STATES.fetch(state)
  @cs = entry_point
end
236
-
237
# Saves the current cmdarg state on its stack and starts a fresh one,
# named after the resulting stack depth.
def push_cmdarg
  @cmdarg_stack << @cmdarg
  depth = @cmdarg_stack.count
  @cmdarg = StackState.new("cmdarg.#{depth}")
end
241
-
242
# Restores the most recently saved cmdarg state.
def pop_cmdarg
  saved = @cmdarg_stack.pop
  @cmdarg = saved
end
245
-
246
# Saves the current cond state on its stack and starts a fresh one,
# named after the resulting stack depth.
def push_cond
  @cond_stack << @cond
  depth = @cond_stack.count
  @cond = StackState.new("cond.#{depth}")
end
250
-
251
# Restores the most recently saved cond state.
def pop_cond
  saved = @cond_stack.pop
  @cond = saved
end
254
-
255
# Returns the stored dedent level and clears it, so that a stale value
# cannot be picked up by accident later.
def dedent_level
  level = @dedent_level
  @dedent_level = nil
  level
end
261
-
262
- # Return next token: [type, value]. A token already waiting in @token_queue is returned without running the scanner.
263
- def advance
264
- if @token_queue.any?
265
- return @token_queue.shift
266
- end
267
-
268
- # Ugly, but dependent on Ragel output. Consider refactoring it somehow. The locals below copy the class-level tables once per call.
269
- klass = self.class
270
- _lex_trans_keys = klass.send :_lex_trans_keys
271
- _lex_key_spans = klass.send :_lex_key_spans
272
- _lex_index_offsets = klass.send :_lex_index_offsets
273
- _lex_indicies = klass.send :_lex_indicies
274
- _lex_trans_targs = klass.send :_lex_trans_targs
275
- _lex_trans_actions = klass.send :_lex_trans_actions
276
- _lex_to_state_actions = klass.send :_lex_to_state_actions
277
- _lex_from_state_actions = klass.send :_lex_from_state_actions
278
- _lex_eof_trans = klass.send :_lex_eof_trans
279
-
280
- pe = @source_pts.size + 2 # NOTE(review): presumably the +2 lets the scanner step past the last codepoint and observe EOF - confirm
281
- p, eof = @p, pe
282
-
283
- cmd_state = @command_start # snapshot taken before the flag is cleared; presumably read by the generated scanner code - confirm
284
- @command_start = false
285
-
286
- %% write exec;
287
- # %
288
-
289
- # Ragel creates a local variable called `testEof` but it doesn't use
290
- # it in any assignment. This dead code is here to swallow the warning.
291
- # It has no runtime cost because Ruby doesn't produce any instructions from it.
292
- if false
293
- testEof
294
- end
295
-
296
- @p = p # save the stream position for the next call
297
-
298
- if @token_queue.any?
299
- @token_queue.shift
300
- elsif @cs == klass.lex_error
301
- [ false, [ '$error'.freeze, range(p - 1, p) ] ] # scanner is in its error state: report the character just before p
302
- else
303
- eof = @source_pts.size
304
- [ false, [ '$eof'.freeze, range(eof, eof) ] ] # nothing queued and no error: zero-width $eof at the end of the source
305
- end
306
- end
307
-
308
- protected
309
-
310
# True when the codepoint is one of the characters treated as end of
# input: ^D (0x04), ^Z (0x1a) or NUL (0x00).
def eof_codepoint?(point)
  point == 0x04 || point == 0x1a || point == 0x00
end
313
-
314
# True when the lexer's Ruby version is one of the given versions.
def version?(*versions)
  versions.any? { |candidate| candidate == @version }
end
317
-
318
# Lowers the state stack top pointer by one and returns the state stored
# at the new top. The entry itself is left in @stack.
def stack_pop
  @top = @top - 1
  @stack.at(@top)
end
322
-
323
# Converts `ord` to a one-character string via #chr and tags it with the
# encoding of the source text.
def encode_escape(ord)
  char = ord.chr
  char.force_encoding(@source_buffer.source.encoding)
end
326
-
327
# Text of the source between positions s (inclusive) and e (exclusive);
# defaults to the token currently being matched.
def tok(s = @ts, e = @te)
  span = s...e
  @source_buffer.slice(span)
end
330
-
331
# Source range over the current buffer from s to e; defaults to the span
# of the token currently being matched.
def range(s = @ts, e = @te)
  buffer = @source_buffer
  Parser::Source::Range.new(buffer, s, e)
end
334
-
335
# Builds the token [type, [value, range]], appends it to the token queue
# (and to @tokens when token recording is enabled) and returns it.
def emit(type, value = tok, s = @ts, e = @te)
  location = range(s, e)
  token    = [ type, [ value, location ] ]

  @token_queue.push(token)
  @tokens.push(token) if @tokens

  token
end
344
-
345
# Emits the source text between s and e as a token whose type is looked
# up in `table` under that text.
def emit_table(table, s = @ts, e = @te)
  text = tok(s, e)
  type = table[text]

  emit(type, text, s, e)
end
350
-
351
# Emits the `do` keyword with the token type its context calls for:
# kDO_COND while the cond state is active, kDO_BLOCK while the cmdarg
# state is active or when do_block is set, and kDO otherwise.
def emit_do(do_block=false)
  type =
    if @cond.active?
      :kDO_COND
    elsif @cmdarg.active? || do_block
      :kDO_BLOCK
    else
      :kDO
    end

  emit(type, 'do'.freeze)
end
360
-
361
# Entry point of expr_cmdarg when cmd_state is truthy, of expr_arg
# otherwise.
def arg_or_cmdarg(cmd_state)
  return self.class.lex_en_expr_cmdarg if cmd_state

  self.class.lex_en_expr_arg
end
368
-
369
# Records the comment spanning s...e: as a Comment object when comment
# collection is enabled, and as a :tCOMMENT token when token recording is
# enabled. Nothing is queued for the parser. Always returns nil.
def emit_comment(s = @ts, e = @te)
  @comments.push(Parser::Source::Comment.new(range(s, e))) if @comments
  @tokens.push([ :tCOMMENT, [ tok(s, e), range(s, e) ] ]) if @tokens

  nil
end
380
-
381
# Builds a diagnostic and hands it to the diagnostics engine. The
# location defaults to the span of the token currently being matched.
def diagnostic(type, reason, arguments=nil, location=range, highlights=[])
  report = Parser::Diagnostic.new(type, reason, arguments, location, highlights)
  @diagnostics.process(report)
end
385
-
386
- #
387
- # === LITERAL STACK ===
388
- #
389
-
390
# Creates a literal from the given arguments, makes it the innermost
# literal, and returns the scanner state in which its body is lexed.
def push_literal(*args)
  created = Literal.new(self, *args)
  @literal_stack << created

  next_state_for_literal(created)
end
395
-
396
# Selects the Ragel entry point used to scan the body of `literal`. The
# choice depends on three properties: whether it is a word list, whether
# its delimiter is a backslash, and whether it interpolates.
def next_state_for_literal(literal)
  lexer = self.class

  if literal.words?
    if literal.backslash_delimited?
      return lexer.lex_en_interp_backslash_delimited_words if literal.interpolate?

      lexer.lex_en_plain_backslash_delimited_words
    else
      return lexer.lex_en_interp_words if literal.interpolate?

      lexer.lex_en_plain_words
    end
  elsif literal.backslash_delimited?
    return lexer.lex_en_interp_backslash_delimited if literal.interpolate?

    lexer.lex_en_plain_backslash_delimited
  else
    return lexer.lex_en_interp_string if literal.interpolate?

    lexer.lex_en_plain_string
  end
end
423
-
424
# The innermost literal currently being lexed, or nil when there is none.
def literal
  @literal_stack[-1]
end
427
-
428
# Removes the innermost literal, remembers its dedent level, and returns
# the scanner state to continue in: the regexp modifier scanner after a
# regexp literal, expr_end after anything else.
def pop_literal
  finished = @literal_stack.pop

  @dedent_level = finished.dedent_level

  # A regexp may be followed by modifiers, which are fetched next.
  return self.class.lex_en_regexp_modifiers if finished.type == :tREGEXP_BEG

  self.class.lex_en_expr_end
end
440
-
441
- # Mapping of strings to parser tokens.
442
-
443
# Token type for each piece of punctuation.
PUNCTUATION = {
  '='   => :tEQL,
  '&'   => :tAMPER2,
  '|'   => :tPIPE,
  '!'   => :tBANG,
  '^'   => :tCARET,
  '+'   => :tPLUS,
  '-'   => :tMINUS,
  '*'   => :tSTAR2,
  '/'   => :tDIVIDE,
  '%'   => :tPERCENT,
  '~'   => :tTILDE,
  ','   => :tCOMMA,
  ';'   => :tSEMI,
  '.'   => :tDOT,
  '..'  => :tDOT2,
  '...' => :tDOT3,
  '['   => :tLBRACK2,
  ']'   => :tRBRACK,
  '('   => :tLPAREN2,
  ')'   => :tRPAREN,
  '?'   => :tEH,
  ':'   => :tCOLON,
  '&&'  => :tANDOP,
  '||'  => :tOROP,
  '-@'  => :tUMINUS,
  '+@'  => :tUPLUS,
  '~@'  => :tTILDE,
  '**'  => :tPOW,
  '->'  => :tLAMBDA,
  '=~'  => :tMATCH,
  '!~'  => :tNMATCH,
  '=='  => :tEQ,
  '!='  => :tNEQ,
  '>'   => :tGT,
  '>>'  => :tRSHFT,
  '>='  => :tGEQ,
  '<'   => :tLT,
  '<<'  => :tLSHFT,
  '<='  => :tLEQ,
  '=>'  => :tASSOC,
  '::'  => :tCOLON2,
  '===' => :tEQQ,
  '<=>' => :tCMP,
  '[]'  => :tAREF,
  '[]=' => :tASET,
  '{'   => :tLCURLY,
  '}'   => :tRCURLY,
  '`'   => :tBACK_REF2,
  '!@'  => :tBANG,
  '&.'  => :tANDDOT,
  '.:'  => :tMETHREF
}
462
-
463
# Token type for punctuation that means something different at the
# beginning of an expression.
PUNCTUATION_BEGIN = {
  '&'  => :tAMPER,
  '*'  => :tSTAR,
  '**' => :tDSTAR,
  '+'  => :tUPLUS,
  '-'  => :tUMINUS,
  '::' => :tCOLON3,
  '('  => :tLPAREN,
  '{'  => :tLBRACE,
  '['  => :tLBRACK,
}
468
-
469
# Keyword token types away from the beginning of an expression: `if`,
# `unless`, `while`, `until` and `rescue` are modifiers here.
KEYWORDS = {
  'if'       => :kIF_MOD,
  'unless'   => :kUNLESS_MOD,
  'while'    => :kWHILE_MOD,
  'until'    => :kUNTIL_MOD,
  'rescue'   => :kRESCUE_MOD,
  'defined?' => :kDEFINED,
  'BEGIN'    => :klBEGIN,
  'END'      => :klEND,
}

# Keyword token types at the beginning of an expression.
KEYWORDS_BEGIN = {
  'if'       => :kIF,
  'unless'   => :kUNLESS,
  'while'    => :kWHILE,
  'until'    => :kUNTIL,
  'rescue'   => :kRESCUE,
  'defined?' => :kDEFINED,
  'BEGIN'    => :klBEGIN,
  'END'      => :klEND,
}

# The remaining keywords map to the same token in both tables: "k"
# followed by the upcased keyword.
%w(class module def undef begin end then elsif else ensure case when
   for break next redo retry in do return yield super self nil true
   false and or not alias __FILE__ __LINE__ __ENCODING__).each do |keyword|
  token = :"k#{keyword.upcase}"

  KEYWORDS[keyword]       = token
  KEYWORDS_BEGIN[keyword] = token
end
488
-
489
- %%{
490
- # %
491
-
492
- access @;
493
- getkey (@source_pts[p] || 0);
494
-
495
- # === CHARACTER CLASSES ===
496
- #
497
- # Pay close attention to the differences between c_any and any.
498
- # c_any does not include EOF and so will cause incorrect behavior
499
- # for machine subtraction (any-except rules) and default transitions
500
- # for scanners.
501
-
502
- action do_nl {
503
- # Record position of a newline for precise location reporting on tNL
504
- # tokens.
505
- #
506
- # This action is embedded directly into c_nl, as it is idempotent and
507
- # there are no cases when we need to skip it.
508
- @newline_s = p
509
- }
510
-
511
- c_nl = '\n' $ do_nl;
512
- c_space = [ \t\r\f\v];
513
- c_space_nl = c_space | c_nl;
514
-
515
- c_eof = 0x04 | 0x1a | 0 | zlen; # ^D, ^Z, \0, EOF
516
- c_eol = c_nl | c_eof;
517
- c_any = any - c_eof;
518
-
519
- c_nl_zlen = c_nl | zlen;
520
- c_line = any - c_nl_zlen;
521
-
522
- c_ascii = 0x00..0x7f;
523
- c_unicode = c_any - c_ascii;
524
- c_upper = [A-Z];
525
- c_lower = [a-z_] | c_unicode;
526
- c_alpha = c_lower | c_upper;
527
- c_alnum = c_alpha | [0-9];
528
-
529
- action do_eof {
530
- # Sit at EOF indefinitely. #advance would return $eof each time.
531
- # This makes it possible to feed the lexer more data if needed; this is only used
532
- # in tests.
533
- #
534
- # Note that this action is not embedded into e_eof like e_heredoc_nl and e_bs
535
- # below. This is due to the fact that scanner state at EOF is observed
536
- # by tests, and encapsulating it in a rule would break the introspection.
537
- fhold; fbreak;
538
- }
539
-
540
- #
541
- # === TOKEN DEFINITIONS ===
542
- #
543
-
544
- # All operators are punctuation. There is more to punctuation
545
- # than just operators. Operators can be overridden by user;
546
- # punctuation can not.
547
-
548
- # A list of operators which are valid in the function name context, but
549
- # have different semantics in others.
550
- operator_fname = '[]' | '[]=' | '`' | '-@' | '+@' | '~@' | '!@' ;
551
-
552
- # A list of operators which can occur within an assignment shortcut (+ → +=).
553
- operator_arithmetic = '&' | '|' | '&&' | '||' | '^' | '+' | '-' |
554
- '*' | '/' | '**' | '~' | '<<' | '>>' | '%' ;
555
-
556
- # A list of all user-definable operators not covered by groups above.
557
- operator_rest = '=~' | '!~' | '==' | '!=' | '!' | '===' |
558
- '<' | '<=' | '>' | '>=' | '<=>' | '=>' ;
559
-
560
- # Note that `{` and `}` need to be referred to as e_lbrace and e_rbrace,
561
- # as they are ambiguous with interpolation `#{}` and should be counted.
562
- # These braces are not present in punctuation lists.
563
-
564
- # A list of punctuation which has different meaning when used at the
565
- # beginning of expression.
566
- punctuation_begin = '-' | '+' | '::' | '(' | '[' |
567
- '*' | '**' | '&' ;
568
-
569
- # A list of all punctuation except punctuation_begin.
570
- punctuation_end = ',' | '=' | '->' | '(' | '[' | ']' |
571
- '::' | '?' | ':' | '.' | '..' | '...' ;
572
-
573
- # A list of keywords which have different meaning at the beginning of expression.
574
- keyword_modifier = 'if' | 'unless' | 'while' | 'until' | 'rescue' ;
575
-
576
- # A list of keywords which accept an argument-like expression, i.e. have the
577
- # same post-processing as method calls or commands. Example: `yield 1`,
578
- # `yield (1)`, `yield(1)`, are interpreted as if `yield` was a function.
579
- keyword_with_arg = 'yield' | 'super' | 'not' | 'defined?' ;
580
-
581
- # A list of keywords which accept a literal function name as an argument.
582
- keyword_with_fname = 'def' | 'undef' | 'alias' ;
583
-
584
- # A list of keywords which accept an expression after them.
585
- keyword_with_value = 'else' | 'case' | 'ensure' | 'module' | 'elsif' | 'then' |
586
- 'for' | 'in' | 'do' | 'when' | 'begin' | 'class' |
587
- 'and' | 'or' ;
588
-
589
- # A list of keywords which accept a value, and treat the keywords from
590
- # `keyword_modifier` list as modifiers.
591
- keyword_with_mid = 'rescue' | 'return' | 'break' | 'next' ;
592
-
593
- # A list of keywords which do not accept an expression after them.
594
- keyword_with_end = 'end' | 'self' | 'true' | 'false' | 'retry' |
595
- 'redo' | 'nil' | 'BEGIN' | 'END' | '__FILE__' |
596
- '__LINE__' | '__ENCODING__';
597
-
598
- # All keywords.
599
- keyword = keyword_with_value | keyword_with_mid |
600
- keyword_with_end | keyword_with_arg |
601
- keyword_with_fname | keyword_modifier ;
602
-
603
- constant = c_upper c_alnum*;
604
- bareword = c_alpha c_alnum*;
605
-
606
- call_or_var = c_lower c_alnum*;
607
- class_var = '@@' bareword;
608
- instance_var = '@' bareword;
609
- global_var = '$'
610
- ( bareword | digit+
611
- | [`'+~*$&?!@/\\;,.=:<>"] # `
612
- | '-' c_alnum
613
- )
614
- ;
615
-
616
- # Ruby accepts (and fails on) variables with leading digit
617
- # in literal context, but not in unquoted symbol body.
618
- class_var_v = '@@' c_alnum+;
619
- instance_var_v = '@' c_alnum+;
620
-
621
- label = bareword [?!]? ':';
622
-
623
- #
624
- # === NUMERIC PARSING ===
625
- #
626
-
627
- int_hex = ( xdigit+ '_' )* xdigit* '_'? ;
628
- int_dec = ( digit+ '_' )* digit* '_'? ;
629
- int_bin = ( [01]+ '_' )* [01]* '_'? ;
630
-
631
- flo_int = [1-9] [0-9]* ( '_' digit+ )* | '0';
632
- flo_frac = '.' ( digit+ '_' )* digit+;
633
- flo_pow = [eE] [+\-]? ( digit+ '_' )* digit+;
634
-
635
- int_suffix =
636
- '' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars) } }
637
- | 'r' % { @num_xfrm = lambda { |chars| emit(:tRATIONAL, Rational(chars)) } }
638
- | 'i' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, chars)) } }
639
- | 'ri' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, Rational(chars))) } }
640
- | 're' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars, @ts, @te - 2); p -= 2 } }
641
- | 'if' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars, @ts, @te - 2); p -= 2 } }
642
- | 'rescue' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars, @ts, @te - 6); p -= 6 } };
643
-
644
- flo_pow_suffix =
645
- '' % { @num_xfrm = lambda { |chars| emit(:tFLOAT, Float(chars)) } }
646
- | 'i' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, Float(chars))) } }
647
- | 'if' % { @num_xfrm = lambda { |chars| emit(:tFLOAT, Float(chars), @ts, @te - 2); p -= 2 } };
648
-
649
- flo_suffix =
650
- flo_pow_suffix
651
- | 'r' % { @num_xfrm = lambda { |chars| emit(:tRATIONAL, Rational(chars)) } }
652
- | 'ri' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, Rational(chars))) } }
653
- | 'rescue' % { @num_xfrm = lambda { |chars| emit(:tFLOAT, Float(chars), @ts, @te - 6); p -= 6 } };
654
-
655
- #
656
- # === ESCAPE SEQUENCE PARSING ===
657
- #
658
-
659
- # Escape parsing code is a Ragel pattern, not a scanner, and therefore
660
- # it shouldn't directly raise errors or perform other actions with side effects.
661
- # In reality this would probably just mess up error reporting in pathological
662
- # cases, though.
663
-
664
- # The amount of code required to parse \M\C stuff correctly is ridiculous.
665
-
666
- escaped_nl = "\\" c_nl;
667
-
668
- action unicode_points {
669
- @escape = ""
670
-
671
- codepoints = tok(@escape_s + 2, p - 1)
672
- codepoint_s = @escape_s + 2
673
-
674
- if @version < 24
675
- if codepoints.start_with?(" ") || codepoints.start_with?("\t")
676
- diagnostic :fatal, :invalid_unicode_escape, nil,
677
- range(@escape_s + 2, @escape_s + 3)
678
- end
679
-
680
- if spaces_p = codepoints.index(/[ \t]{2}/)
681
- diagnostic :fatal, :invalid_unicode_escape, nil,
682
- range(codepoint_s + spaces_p + 1, codepoint_s + spaces_p + 2)
683
- end
684
-
685
- if codepoints.end_with?(" ") || codepoints.end_with?("\t")
686
- diagnostic :fatal, :invalid_unicode_escape, nil, range(p - 1, p)
687
- end
688
- end
689
-
690
- codepoints.scan(/([0-9a-fA-F]+)|([ \t]+)/).each do |(codepoint_str, spaces)|
691
- if spaces
692
- codepoint_s += spaces.length
693
- else
694
- codepoint = codepoint_str.to_i(16)
695
-
696
- if codepoint >= 0x110000
697
- diagnostic :error, :unicode_point_too_large, nil,
698
- range(codepoint_s, codepoint_s + codepoint_str.length)
699
- break
700
- end
701
-
702
- @escape += codepoint.chr(Encoding::UTF_8)
703
- codepoint_s += codepoint_str.length
704
- end
705
- end
706
- }
707
-
708
- action unescape_char {
709
- codepoint = @source_pts[p - 1]
710
-
711
- if @version >= 30 && (codepoint == 117 || codepoint == 85) # 'u' or 'U'
712
- diagnostic :fatal, :invalid_escape
713
- end
714
-
715
- if (@escape = ESCAPES[codepoint]).nil?
716
- @escape = encode_escape(@source_buffer.slice(p - 1))
717
- end
718
- }
719
-
720
- action invalid_complex_escape {
721
- diagnostic :fatal, :invalid_escape
722
- }
723
-
724
- action read_post_meta_or_ctrl_char {
725
- @escape = @source_buffer.slice(p - 1).chr
726
-
727
- if @version >= 27 && ((0..8).include?(@escape.ord) || (14..31).include?(@escape.ord))
728
- diagnostic :fatal, :invalid_escape
729
- end
730
- }
731
-
732
- action slash_c_char {
733
- @escape = encode_escape(@escape[0].ord & 0x9f)
734
- }
735
-
736
- action slash_m_char {
737
- @escape = encode_escape(@escape[0].ord | 0x80)
738
- }
739
-
740
- maybe_escaped_char = (
741
- '\\' c_any %unescape_char
742
- | '\\x' xdigit{1,2} % { @escape = encode_escape(tok(p - 2, p).to_i(16)) } %slash_c_char
743
- | ( c_any - [\\] ) %read_post_meta_or_ctrl_char
744
- );
745
-
746
- maybe_escaped_ctrl_char = ( # why?!
747
- '\\' c_any %unescape_char %slash_c_char
748
- | '?' % { @escape = "\x7f" }
749
- | '\\x' xdigit{1,2} % { @escape = encode_escape(tok(p - 2, p).to_i(16)) } %slash_c_char
750
- | ( c_any - [\\?] ) %read_post_meta_or_ctrl_char %slash_c_char
751
- );
752
-
753
- escape = (
754
- # \377
755
- [0-7]{1,3}
756
- % { @escape = encode_escape(tok(@escape_s, p).to_i(8) % 0x100) }
757
-
758
- # \xff
759
- | 'x' xdigit{1,2}
760
- % { @escape = encode_escape(tok(@escape_s + 1, p).to_i(16)) }
761
-
762
- # %q[\x]
763
- | 'x' ( c_any - xdigit )
764
- % {
765
- diagnostic :fatal, :invalid_hex_escape, nil, range(@escape_s - 1, p + 2)
766
- }
767
-
768
- # \u263a
769
- | 'u' xdigit{4}
770
- % { @escape = tok(@escape_s + 1, p).to_i(16).chr(Encoding::UTF_8) }
771
-
772
- # \u123
773
- | 'u' xdigit{0,3}
774
- % {
775
- diagnostic :fatal, :invalid_unicode_escape, nil, range(@escape_s - 1, p)
776
- }
777
-
778
- # u{not hex} or u{}
779
- | 'u{' ( c_any - xdigit - [ \t}] )* '}'
780
- % {
781
- diagnostic :fatal, :invalid_unicode_escape, nil, range(@escape_s - 1, p)
782
- }
783
-
784
- # \u{ \t 123 \t 456 \t\t }
785
- | 'u{' [ \t]* ( xdigit{1,6} [ \t]+ )*
786
- (
787
- ( xdigit{1,6} [ \t]* '}'
788
- %unicode_points
789
- )
790
- |
791
- ( xdigit* ( c_any - xdigit - [ \t}] )+ '}'
792
- | ( c_any - [ \t}] )* c_eof
793
- | xdigit{7,}
794
- ) % {
795
- diagnostic :fatal, :unterminated_unicode, nil, range(p - 1, p)
796
- }
797
- )
798
-
799
- # \C-\a \cx
800
- | ( 'C-' | 'c' ) escaped_nl?
801
- maybe_escaped_ctrl_char
802
-
803
- # \M-a
804
- | 'M-' escaped_nl?
805
- maybe_escaped_char
806
- %slash_m_char
807
-
808
- # \C-\M-f \M-\cf \c\M-f
809
- | ( ( 'C-' | 'c' ) escaped_nl? '\\M-'
810
- | 'M-\\' escaped_nl? ( 'C-' | 'c' ) ) escaped_nl?
811
- maybe_escaped_ctrl_char
812
- %slash_m_char
813
-
814
- | 'C' c_any %invalid_complex_escape
815
- | 'M' c_any %invalid_complex_escape
816
- | ( 'M-\\C' | 'C-\\M' ) c_any %invalid_complex_escape
817
-
818
- | ( c_any - [0-7xuCMc] ) %unescape_char
819
-
820
- | c_eof % {
821
- diagnostic :fatal, :escape_eof, nil, range(p - 1, p)
822
- }
823
- );
824
-
825
- # Use rules in form of `e_bs escape' when you need to parse a sequence.
826
- e_bs = '\\' % {
827
- @escape_s = p
828
- @escape = nil
829
- };
830
-
831
- #
832
- # === STRING AND HEREDOC PARSING ===
833
- #
834
-
835
- # Heredoc parsing is quite a complex topic. First, consider that heredocs
836
- # can be arbitrarily nested. For example:
837
- #
838
- # puts <<CODE
839
- # the result is: #{<<RESULT.inspect
840
- # i am a heredoc
841
- # RESULT
842
- # }
843
- # CODE
844
- #
845
- # which, incidentally, evaluates to:
846
- #
847
- # the result is: " i am a heredoc\n"
848
- #
849
- # To parse them, lexer refers to two kinds (remember, nested heredocs)
850
- # of positions in the input stream, namely heredoc_e
851
- # (HEREDOC declaration End) and @herebody_s (HEREdoc BODY line Start).
852
- #
853
- # heredoc_e is simply contained inside the corresponding Literal, and
854
- # when the heredoc is closed, the lexing is restarted from that position.
855
- #
856
- # @herebody_s is considerably more complex. First, @herebody_s changes after each
857
- # heredoc line is lexed. This way, at '\n' tok(@herebody_s, @te) always
858
- # contains the current line, and also when a heredoc is started, @herebody_s
859
- # contains the position from which the heredoc will be lexed.
860
- #
861
- # Second, as (insanity) there are nested heredocs, we need to maintain a
862
- # stack of these positions. Each time #push_literal is called, it saves current
863
- # @herebody_s to literal.saved_herebody_s, and after an interpolation (possibly
864
- # containing other heredocs) is closed, the previous value is restored.
865
-
866
- e_heredoc_nl = c_nl % {
867
- # After every heredoc was parsed, @herebody_s contains the
868
- # position of next token after all heredocs.
869
- if @herebody_s
870
- p = @herebody_s
871
- @herebody_s = nil
872
- end
873
- };
874
-
875
- action extend_string {
876
- string = tok
877
-
878
- # tLABEL_END is only possible in non-cond context on >= 2.2
879
- if @version >= 22 && !@cond.active?
880
- lookahead = @source_buffer.slice(@te...@te+2)
881
- end
882
-
883
- current_literal = literal
884
- if !current_literal.heredoc? &&
885
- (token = current_literal.nest_and_try_closing(string, @ts, @te, lookahead))
886
- if token[0] == :tLABEL_END
887
- p += 1
888
- pop_literal
889
- fnext expr_labelarg;
890
- else
891
- fnext *pop_literal;
892
- end
893
- fbreak;
894
- else
895
- current_literal.extend_string(string, @ts, @te)
896
- end
897
- }
898
-
899
- action extend_string_escaped {
900
- current_literal = literal
901
- # Get the first character after the backslash.
902
- escaped_char = @source_buffer.slice(@escape_s).chr
903
-
904
- if current_literal.munge_escape? escaped_char
905
- # If this particular literal uses this character as an opening
906
- # or closing delimiter, it is an escape sequence for that
907
- # particular character. Write it without the backslash.
908
-
909
- if current_literal.regexp? && REGEXP_META_CHARACTERS.match(escaped_char)
910
- # Regular expressions should include escaped delimiters in their
911
- # escaped form, except when the escaped character is
912
- # a closing delimiter but not a regexp metacharacter.
913
- #
914
- # The backslash itself cannot be used as a closing delimiter
915
- # at the same time as an escape symbol, but it is always munged,
916
- # so this branch also executes for the non-closing-delimiter case
917
- # for the backslash.
918
- current_literal.extend_string(tok, @ts, @te)
919
- else
920
- current_literal.extend_string(escaped_char, @ts, @te)
921
- end
922
- else
923
- # It does not. So this is an actual escape sequence, yay!
924
- if current_literal.squiggly_heredoc? && escaped_char == "\n".freeze
925
- # Squiggly heredocs like
926
- # <<~-HERE
927
- # 1\
928
- # 2
929
- # HERE
930
- # treat '\' as a line continuation, but still dedent the body, so the heredoc above becomes "12\n".
931
- # This information is emitted as is, without escaping,
932
- # later this escape sequence (\\\n) gets handled manually in the Lexer::Dedenter
933
- current_literal.extend_string(tok, @ts, @te)
934
- elsif current_literal.supports_line_continuation_via_slash? && escaped_char == "\n".freeze
935
- # Heredocs, regexp and a few other types of literals support line
936
- # continuation via \\\n sequence. The code like
937
- # "a\
938
- # b"
939
- # must be parsed as "ab"
940
- current_literal.extend_string(tok.gsub("\\\n".freeze, ''.freeze), @ts, @te)
941
- elsif current_literal.regexp? && @version >= 31 && %w[c C m M].include?(escaped_char)
942
- # Ruby >= 3.1 escapes \c- and \m chars, that's the only escape sequence
943
- # supported by regexes so far, so it needs a separate branch.
944
- current_literal.extend_string(@escape, @ts, @te)
945
- elsif current_literal.regexp?
946
- # Regular expressions should include escape sequences in their
947
- # escaped form. On the other hand, escaped newlines are removed (in cases like "\\C-\\\n\\M-x")
948
- current_literal.extend_string(tok.gsub("\\\n".freeze, ''.freeze), @ts, @te)
949
- else
950
- current_literal.extend_string(@escape || tok, @ts, @te)
951
- end
952
- end
953
- }
954
-
955
- # Extend a string with a newline or an EOF character.
956
- # As heredoc closing line can immediately precede EOF, this action
957
- # has to handle such case specially.
958
- action extend_string_eol {
959
- current_literal = literal
960
- if @te == pe
961
- diagnostic :fatal, :string_eof, nil,
962
- range(current_literal.str_s, current_literal.str_s + 1)
963
- end
964
-
965
- if current_literal.heredoc?
966
- line = tok(@herebody_s, @ts).gsub(/\r+$/, ''.freeze)
967
-
968
- if version?(18, 19, 20)
969
- # See ruby:c48b4209c
970
- line = line.gsub(/\r.*$/, ''.freeze)
971
- end
972
-
973
- # Try ending the heredoc with the complete most recently
974
- # scanned line. @herebody_s always refers to the start of such line.
975
- if current_literal.nest_and_try_closing(line, @herebody_s, @ts)
976
- # Adjust @herebody_s to point to the next line.
977
- @herebody_s = @te
978
-
979
- # Continue regular lexing after the heredoc reference (<<END).
980
- p = current_literal.heredoc_e - 1
981
- fnext *pop_literal; fbreak;
982
- else
983
- # Calculate indentation level for <<~HEREDOCs.
984
- current_literal.infer_indent_level(line)
985
-
986
- # Ditto.
987
- @herebody_s = @te
988
- end
989
- else
990
- # Try ending the literal with a newline.
991
- if current_literal.nest_and_try_closing(tok, @ts, @te)
992
- fnext *pop_literal; fbreak;
993
- end
994
-
995
- if @herebody_s
996
- # This is a regular literal intertwined with a heredoc. Like:
997
- #
998
- # p <<-foo+"1
999
- # bar
1000
- # foo
1001
- # 2"
1002
- #
1003
- # which, incidentally, evaluates to "bar\n1\n2".
1004
- p = @herebody_s - 1
1005
- @herebody_s = nil
1006
- end
1007
- end
1008
-
1009
- if current_literal.words? && !eof_codepoint?(@source_pts[p])
1010
- current_literal.extend_space @ts, @te
1011
- else
1012
- # A literal newline is appended if the heredoc was _not_ closed
1013
- # this time (see fbreak above). See also Literal#nest_and_try_closing
1014
- # for rationale of calling #flush_string here.
1015
- current_literal.extend_string tok, @ts, @te
1016
- current_literal.flush_string
1017
- end
1018
- }
1019
-
1020
- action extend_string_space {
1021
- literal.extend_space @ts, @te
1022
- }
1023
-
1024
- #
1025
- # === INTERPOLATION PARSING ===
1026
- #
1027
-
1028
- # Interpolations with immediate variable names simply call into
1029
- # the corresponding machine.
1030
-
1031
- interp_var = '#' ( global_var | class_var_v | instance_var_v );
1032
-
1033
- action extend_interp_var { # Starts a variable interpolation matched by interp_var.
1034
- current_literal = literal
1035
- current_literal.flush_string
1036
- current_literal.extend_content
1037
-
1038
- emit(:tSTRING_DVAR, nil, @ts, @ts + 1) # the token covers only the leading # character
1039
-
1040
- p = @ts # compare the `p = @ts - 1` rewind convention: scanning resumes at @ts + 1, on the variable sigil
1041
- fcall expr_variable; # expr_variable emits the variable token and returns through stack_pop
1042
- }
1043
-
1044
- # Special case for Ruby >= 2.7
1045
- # If interpolated instance/class variable starts with a digit we parse it as a plain substring
1046
- # However, "#$1" is still a regular interpolation
1047
- interp_digit_var = '#' ('@' | '@@') digit c_alpha*;
1048
-
1049
- action extend_interp_digit_var { # Handles text matched by interp_digit_var.
1050
- if @version >= 27 # 2.7 and later: the matched text is kept as ordinary string content
1051
- literal.extend_string(tok, @ts, @te)
1052
- else # earlier versions: report an invalid variable name
1053
- message = tok.start_with?('#@@') ? :cvar_name : :ivar_name # a double @ selects the class-variable message
1054
- diagnostic :error, message, { :name => tok(@ts + 1, @te) }, range(@ts + 1, @te) # name and range skip the leading # character
1055
- end
1056
- }
1057
-
1058
- # Interpolations with code blocks must match nested curly braces, as
1059
- # interpolation ending is ambiguous with a block ending. So, every
1060
- # opening and closing brace should be matched with e_[lr]brace rules,
1061
- # which automatically perform the counting.
1062
- #
1063
- # Note that interpolations can themselves be nested, so brace balance
1064
- # is tied to the innermost literal.
1065
- #
1066
- # Also note that literals themselves should not use e_[lr]brace rules
1067
- # when matching their opening and closing delimiters, as the amount of
1068
- # braces inside the characters of a string literal is independent.
1069
-
1070
- interp_code = '#{';
1071
-
1072
- e_lbrace = '{' % {
1073
- @cond.push(false); @cmdarg.push(false)
1074
-
1075
- current_literal = literal
1076
- if current_literal
1077
- current_literal.start_interp_brace
1078
- end
1079
- };
1080
-
1081
- e_rbrace = '}' % { # Runs for every closing brace matched through e_rbrace.
1082
- current_literal = literal
1083
- if current_literal
1084
- if current_literal.end_interp_brace_and_try_closing # presumably true when this brace ends the interpolation of the current literal; see Literal
1085
- if version?(18, 19) # 1.8 and 1.9: plain tRCURLY plus cond/cmdarg pops
1086
- emit(:tRCURLY, '}'.freeze, p - 1, p)
1087
- @cond.lexpop
1088
- @cmdarg.lexpop
1089
- else # other versions: dedicated tSTRING_DEND token
1090
- emit(:tSTRING_DEND, '}'.freeze, p - 1, p)
1091
- end
1092
-
1093
- if current_literal.saved_herebody_s
1094
- @herebody_s = current_literal.saved_herebody_s # restore the position stashed by extend_interp_code
1095
- end
1096
-
1097
-
1098
- fhold;
1099
- fnext *next_state_for_literal(current_literal); # continue lexing the rest of the literal
1100
- fbreak; # the action stops here, so the decrement below is skipped on this path
1101
- end
1102
- end
1103
-
1104
- @paren_nest -= 1 # balances the += 1 done by the rules that emit an opening brace token
1105
- };
1106
-
1107
- action extend_interp_code { # Starts a code interpolation matched by interp_code.
1108
- current_literal = literal
1109
- current_literal.flush_string
1110
- current_literal.extend_content
1111
-
1112
- emit(:tSTRING_DBEG, '#{'.freeze)
1113
-
1114
- if current_literal.heredoc?
1115
- current_literal.saved_herebody_s = @herebody_s # stashed here; e_rbrace restores it when the interpolation closes
1116
- @herebody_s = nil
1117
- end
1118
-
1119
- current_literal.start_interp_brace # counted directly: the opening brace is matched by interp_code, not by e_lbrace
1120
- @command_start = true
1121
- fnext expr_value; # the interpolated code is lexed starting in expr_value
1122
- fbreak;
1123
- }
1124
-
1125
- # Actual string parsers are simply combined from the primitives defined
1126
- # above.
1127
-
1128
- interp_words := |*
1129
- interp_code => extend_interp_code;
1130
- interp_digit_var => extend_interp_digit_var;
1131
- interp_var => extend_interp_var;
1132
- e_bs escape => extend_string_escaped;
1133
- c_space+ => extend_string_space;
1134
- c_eol => extend_string_eol;
1135
- c_any => extend_string;
1136
- *|;
1137
-
1138
- interp_string := |*
1139
- interp_code => extend_interp_code;
1140
- interp_digit_var => extend_interp_digit_var;
1141
- interp_var => extend_interp_var;
1142
- e_bs escape => extend_string_escaped;
1143
- c_eol => extend_string_eol;
1144
- c_any => extend_string;
1145
- *|;
1146
-
1147
- plain_words := |*
1148
- e_bs c_any => extend_string_escaped;
1149
- c_space+ => extend_string_space;
1150
- c_eol => extend_string_eol;
1151
- c_any => extend_string;
1152
- *|;
1153
-
1154
- plain_string := |*
1155
- '\\' c_nl => extend_string_eol;
1156
- e_bs c_any => extend_string_escaped;
1157
- c_eol => extend_string_eol;
1158
- c_any => extend_string;
1159
- *|;
1160
-
1161
- interp_backslash_delimited := |*
1162
- interp_code => extend_interp_code;
1163
- interp_digit_var => extend_interp_digit_var;
1164
- interp_var => extend_interp_var;
1165
- c_eol => extend_string_eol;
1166
- c_any => extend_string;
1167
- *|;
1168
-
1169
- plain_backslash_delimited := |*
1170
- c_eol => extend_string_eol;
1171
- c_any => extend_string;
1172
- *|;
1173
-
1174
- interp_backslash_delimited_words := |*
1175
- interp_code => extend_interp_code;
1176
- interp_digit_var => extend_interp_digit_var;
1177
- interp_var => extend_interp_var;
1178
- c_space+ => extend_string_space;
1179
- c_eol => extend_string_eol;
1180
- c_any => extend_string;
1181
- *|;
1182
-
1183
- plain_backslash_delimited_words := |*
1184
- c_space+ => extend_string_space;
1185
- c_eol => extend_string_eol;
1186
- c_any => extend_string;
1187
- *|;
1188
-
1189
- regexp_modifiers := |* # Emits a tREGEXP_OPT token, then continues in expr_end.
1190
- [A-Za-z]+
1191
- => {
1192
- unknown_options = tok.scan(/[^imxouesn]/) # every letter other than i, m, x, o, u, e, s, n counts as unknown
1193
- if unknown_options.any?
1194
- diagnostic :error, :regexp_options,
1195
- { :options => unknown_options.join }
1196
- end
1197
-
1198
- emit(:tREGEXP_OPT)
1199
- fnext expr_end;
1200
- fbreak;
1201
- };
1202
-
1203
- any # no option letters at this position
1204
- => {
1205
- emit(:tREGEXP_OPT, tok(@ts, @te - 1), @ts, @te - 1) # empty token: for a one-character match @te - 1 equals @ts
1206
- fhold; # give the character back so expr_end scans it
1207
- fgoto expr_end;
1208
- };
1209
- *|;
1210
-
1211
- #
1212
- # === WHITESPACE HANDLING ===
1213
- #
1214
-
1215
- # Various contexts in Ruby allow various kinds of whitespace
1216
- # to be used. They are grouped to clarify the lexing machines
1217
- # and ease collection of comments.
1218
-
1219
- # A line of code with inline #comment at end is always equivalent
1220
- # to a line of code ending with just a newline, so an inline
1221
- # comment is deemed equivalent to non-newline whitespace
1222
- # (c_space character class).
1223
-
1224
- w_space =
1225
- c_space+
1226
- | '\\' e_heredoc_nl
1227
- ;
1228
-
1229
- w_comment =
1230
- '#' %{ @sharp_s = p - 1 }
1231
- # The (p == pe) condition compensates for added "\0" and
1232
- # the way Ragel handles EOF.
1233
- c_line* %{ emit_comment(@sharp_s, p == pe ? p - 2 : p) }
1234
- ;
1235
-
1236
- w_space_comment =
1237
- w_space
1238
- | w_comment
1239
- ;
1240
-
1241
- # A newline in non-literal context always interoperates with
1242
- # here document logic and can always be escaped by a backslash,
1243
- # still interoperating with here document logic in the same way,
1244
- # yet being invisible to anything else.
1245
- #
1246
- # To demonstrate:
1247
- #
1248
- # foo = <<FOO \
1249
- # bar
1250
- # FOO
1251
- # + 2
1252
- #
1253
- # is equivalent to `foo = "bar\n" + 2`.
1254
-
1255
- w_newline =
1256
- e_heredoc_nl;
1257
-
1258
- w_any =
1259
- w_space
1260
- | w_comment
1261
- | w_newline
1262
- ;
1263
-
1264
-
1265
- #
1266
- # === EXPRESSION PARSING ===
1267
- #
1268
-
1269
- # These rules implement a form of manually defined lookahead.
1270
- # The default longest-match scanning does not work here due
1271
- # to sheer ambiguity.
1272
-
1273
- ambiguous_fid_suffix = # actual parsed
1274
- [?!] %{ tm = p } | # a? a?
1275
- [?!]'=' %{ tm = p - 2 } # a!=b a != b
1276
- ;
1277
-
1278
- ambiguous_ident_suffix = # actual parsed
1279
- ambiguous_fid_suffix |
1280
- '=' %{ tm = p } | # a= a=
1281
- '==' %{ tm = p - 2 } | # a==b a == b
1282
- '=~' %{ tm = p - 2 } | # a=~b a =~ b
1283
- '=>' %{ tm = p - 2 } | # a=>b a => b
1284
- '===' %{ tm = p - 3 } # a===b a === b
1285
- ;
1286
-
1287
- ambiguous_symbol_suffix = # actual parsed
1288
- ambiguous_ident_suffix |
1289
- '==>' %{ tm = p - 2 } # :a==>b :a= => b
1290
- ;
1291
-
1292
- # Ambiguous with 1.9 hash labels.
1293
- ambiguous_const_suffix = # actual parsed
1294
- '::' %{ tm = p - 2 } # A::B A :: B
1295
- ;
1296
-
1297
- # Resolving kDO/kDO_COND/kDO_BLOCK ambiguity requires embedding
1298
- # @cond/@cmdarg-related code to e_lbrack, e_lparen and e_lbrace.
1299
-
1300
- e_lbrack = '[' % {
1301
- @cond.push(false); @cmdarg.push(false)
1302
-
1303
- @paren_nest += 1
1304
- };
1305
-
1306
- e_rbrack = ']' % {
1307
- @paren_nest -= 1
1308
- };
1309
-
1310
- # Ruby 1.9 lambdas require parentheses counting in order to
1311
- # emit correct opening kDO/tLBRACE.
1312
-
1313
- e_lparen = '(' % {
1314
- @cond.push(false); @cmdarg.push(false)
1315
-
1316
- @paren_nest += 1
1317
-
1318
- if version?(18)
1319
- @command_start = true
1320
- end
1321
- };
1322
-
1323
- e_rparen = ')' % {
1324
- @paren_nest -= 1
1325
- };
1326
-
1327
- # Ruby is context-sensitive with respect to local identifiers.
1328
- action local_ident { # Emits tIDENTIFIER and picks the next state from the static environment.
1329
- emit(:tIDENTIFIER)
1330
-
1331
- if !@static_env.nil? && @static_env.declared?(tok) # the name is declared in the static environment
1332
- fnext expr_endfn; fbreak;
1333
- else # undeclared name: next state comes from arg_or_cmdarg
1334
- fnext *arg_or_cmdarg(cmd_state); fbreak;
1335
- end
1336
- }
1337
-
1338
- # Variable lexing code is accessed from both expressions and
1339
- # string interpolation related code.
1340
- #
1341
- expr_variable := |* # Entered with fcall; every rule returns through stack_pop.
1342
- global_var
1343
- => {
1344
- if tok =~ /^\$([1-9][0-9]*)$/ # dollar sign followed by a decimal number without a leading zero
1345
- emit(:tNTH_REF, tok(@ts + 1).to_i) # value is the text after the dollar sign converted with to_i
1346
- elsif tok =~ /^\$([&`'+])$/ # dollar sign followed by one of: ampersand, backtick, single quote, plus
1347
- emit(:tBACK_REF)
1348
- else # any other global variable
1349
- emit(:tGVAR)
1350
- end
1351
-
1352
- fnext *stack_pop; fbreak; # return to the state saved by the fcall that entered this machine
1353
- };
1354
-
1355
- class_var_v
1356
- => {
1357
- if tok =~ /^@@[0-9]/ # a digit right after @@ is reported as a bad class variable name
1358
- diagnostic :error, :cvar_name, { :name => tok }
1359
- end
1360
-
1361
- emit(:tCVAR)
1362
- fnext *stack_pop; fbreak;
1363
- };
1364
-
1365
- instance_var_v
1366
- => {
1367
- if tok =~ /^@[0-9]/ # a digit right after @ is reported as a bad instance variable name
1368
- diagnostic :error, :ivar_name, { :name => tok }
1369
- end
1370
-
1371
- emit(:tIVAR)
1372
- fnext *stack_pop; fbreak;
1373
- };
1374
- *|;
1375
-
1376
- # Literal function name in definition (e.g. `def class`).
1377
- # Keywords are returned as their respective tokens; this is used
1378
- # to support singleton def `def self.foo`. Global variables are
1379
- # returned as `tGVAR`; this is used in global variable alias
1380
- # statements `alias $a $b`. Symbols are returned verbatim; this
1381
- # is used in `alias :a :"b#{foo}"` and `undef :a`.
1382
- #
1383
- # Transitions to `expr_endfn` afterwards.
1384
- #
1385
- expr_fname := |*
1386
- keyword
1387
- => { emit_table(KEYWORDS_BEGIN);
1388
- fnext expr_endfn; fbreak; };
1389
-
1390
- constant
1391
- => { emit(:tCONSTANT)
1392
- fnext expr_endfn; fbreak; };
1393
-
1394
- bareword [?=!]?
1395
- => { emit(:tIDENTIFIER)
1396
- fnext expr_endfn; fbreak; };
1397
-
1398
- global_var
1399
- => { p = @ts - 1
1400
- fnext expr_end; fcall expr_variable; };
1401
-
1402
- # If the handling was to be delegated to expr_end,
1403
- # these cases would transition to something other than
1404
- # expr_endfn, which is incorrect.
1405
- operator_fname |
1406
- operator_arithmetic |
1407
- operator_rest
1408
- => { emit_table(PUNCTUATION)
1409
- fnext expr_endfn; fbreak; };
1410
-
1411
- '::'
1412
- => { fhold; fhold; fgoto expr_end; };
1413
-
1414
- ':'
1415
- => { fhold; fgoto expr_beg; };
1416
-
1417
- '%s' (c_ascii - [A-Za-z0-9])
1418
- => {
1419
- if version?(23)
1420
- type, delimiter = tok[0..-2], tok[-1].chr
1421
- fgoto *push_literal(type, delimiter, @ts);
1422
- else
1423
- p = @ts - 1
1424
- fgoto expr_end;
1425
- end
1426
- };
1427
-
1428
- w_any;
1429
-
1430
- c_any
1431
- => { fhold; fgoto expr_end; };
1432
-
1433
- c_eof => do_eof;
1434
- *|;
1435
-
1436
- # After literal function name in definition. Behaves like `expr_end`,
1437
- # but allows a tLABEL.
1438
- #
1439
- # Transitions to `expr_end` afterwards.
1440
- #
1441
- expr_endfn := |*
1442
- label ( any - ':' )
1443
- => { emit(:tLABEL, tok(@ts, @te - 2), @ts, @te - 1)
1444
- fhold; fnext expr_labelarg; fbreak; };
1445
-
1446
- '...'
1447
- => {
1448
- if @version >= 31 && @context.in_argdef
1449
- emit(:tBDOT3, '...'.freeze)
1450
- # emit(:tNL, "\n".freeze, @te - 1, @te)
1451
- fnext expr_end; fbreak;
1452
- else
1453
- p -= 3;
1454
- fgoto expr_end;
1455
- end
1456
- };
1457
-
1458
- w_space_comment;
1459
-
1460
- c_any
1461
- => { fhold; fgoto expr_end; };
1462
-
1463
- c_eof => do_eof;
1464
- *|;
1465
-
1466
- # Literal function name in method call (e.g. `a.class`).
1467
- #
1468
- # Transitions to `expr_arg` afterwards.
1469
- #
1470
- expr_dot := |*
1471
- constant
1472
- => { emit(:tCONSTANT)
1473
- fnext *arg_or_cmdarg(cmd_state); fbreak; };
1474
-
1475
- call_or_var
1476
- => { emit(:tIDENTIFIER)
1477
- fnext *arg_or_cmdarg(cmd_state); fbreak; };
1478
-
1479
- bareword ambiguous_fid_suffix
1480
- => { emit(:tFID, tok(@ts, tm), @ts, tm)
1481
- fnext *arg_or_cmdarg(cmd_state); p = tm - 1; fbreak; };
1482
-
1483
- # See the comment in `expr_fname`.
1484
- operator_fname |
1485
- operator_arithmetic |
1486
- operator_rest
1487
- => { emit_table(PUNCTUATION)
1488
- fnext expr_arg; fbreak; };
1489
-
1490
- w_any;
1491
-
1492
- c_any
1493
- => { fhold; fgoto expr_end; };
1494
-
1495
- c_eof => do_eof;
1496
- *|;
1497
-
1498
- # The previous token emitted was a `tIDENTIFIER` or `tFID`; no space
1499
- # is consumed; the current expression is a command or method call.
1500
- #
1501
- expr_arg := |*
1502
- #
1503
- # COMMAND MODE SPECIFIC TOKENS
1504
- #
1505
-
1506
- # cmd (1 + 2)
1507
- # See below the rationale about expr_endarg.
1508
- w_space+ e_lparen
1509
- => {
1510
- if version?(18)
1511
- emit(:tLPAREN2, '('.freeze, @te - 1, @te)
1512
- fnext expr_value; fbreak;
1513
- else
1514
- emit(:tLPAREN_ARG, '('.freeze, @te - 1, @te)
1515
- fnext expr_beg; fbreak;
1516
- end
1517
- };
1518
-
1519
- # meth(1 + 2)
1520
- # Regular method call.
1521
- e_lparen
1522
- => { emit(:tLPAREN2, '('.freeze)
1523
- fnext expr_beg; fbreak; };
1524
-
1525
- # meth [...]
1526
- # Array argument. Compare with indexing `meth[...]`.
1527
- w_space+ e_lbrack
1528
- => { emit(:tLBRACK, '['.freeze, @te - 1, @te)
1529
- fnext expr_beg; fbreak; };
1530
-
1531
- # cmd {}
1532
- # Command: method call without parentheses.
1533
- w_space* e_lbrace
1534
- => {
1535
- if @lambda_stack.last == @paren_nest
1536
- @lambda_stack.pop
1537
- emit(:tLAMBEG, '{'.freeze, @te - 1, @te)
1538
- else
1539
- emit(:tLCURLY, '{'.freeze, @te - 1, @te)
1540
- end
1541
- @command_start = true
1542
- @paren_nest += 1
1543
- fnext expr_value; fbreak;
1544
- };
1545
-
1546
- #
1547
- # AMBIGUOUS TOKENS RESOLVED VIA EXPR_BEG
1548
- #
1549
-
1550
- # a??
1551
- # Ternary operator
1552
- '?' c_space_nl
1553
- => {
1554
- # Unlike expr_beg as invoked in the next rule, do not warn
1555
- p = @ts - 1
1556
- fgoto expr_end;
1557
- };
1558
-
1559
- # a ?b, a? ?
1560
- # Character literal or ternary operator
1561
- w_space* '?'
1562
- => { fhold; fgoto expr_beg; };
1563
-
1564
- # a %{1}, a %[1] (but not "a %=1=" or "a % foo")
1565
- # a /foo/ (but not "a / foo" or "a /=foo")
1566
- # a <<HEREDOC
1567
- w_space+ %{ tm = p }
1568
- ( [%/] ( c_any - c_space_nl - '=' ) # /
1569
- | '<<'
1570
- )
1571
- => {
1572
- if tok(tm, tm + 1) == '/'.freeze
1573
- # Ambiguous regexp literal.
1574
- if @version < 30
1575
- diagnostic :warning, :ambiguous_literal, nil, range(tm, tm + 1)
1576
- else
1577
- diagnostic :warning, :ambiguous_regexp, nil, range(tm, tm + 1)
1578
- end
1579
- end
1580
-
1581
- p = tm - 1
1582
- fgoto expr_beg;
1583
- };
1584
-
1585
- # x *1
1586
- # Ambiguous unary plus/minus, splat, kwsplat or block-pass.
1587
- w_space+ %{ tm = p } ( '+' | '-' | '*' | '&' | '**' )
1588
- => {
1589
- diagnostic :warning, :ambiguous_prefix, { :prefix => tok(tm, @te) },
1590
- range(tm, @te)
1591
-
1592
- p = tm - 1
1593
- fgoto expr_beg;
1594
- };
1595
-
1596
- # x ::Foo
1597
- # Ambiguous toplevel constant access.
1598
- w_space+ '::'
1599
- => { fhold; fhold; fgoto expr_beg; };
1600
-
1601
- # x:b
1602
- # Symbol.
1603
- w_space* ':'
1604
- => { fhold; fgoto expr_beg; };
1605
-
1606
- w_space+ label
1607
- => { p = @ts - 1; fgoto expr_beg; };
1608
-
1609
- #
1610
- # AMBIGUOUS TOKENS RESOLVED VIA EXPR_END
1611
- #
1612
-
1613
- # a ? b
1614
- # Ternary operator.
1615
- w_space+ %{ tm = p } '?' c_space_nl
1616
- => { p = tm - 1; fgoto expr_end; };
1617
-
1618
- # x + 1: Binary operator or operator-assignment.
1619
- w_space* operator_arithmetic
1620
- ( '=' | c_space_nl )? |
1621
- # x rescue y: Modifier keyword.
1622
- w_space* keyword_modifier |
1623
- # a &. b: Safe navigation operator.
1624
- w_space* '&.' |
1625
- # Miscellanea.
1626
- w_space* punctuation_end
1627
- => {
1628
- p = @ts - 1
1629
- fgoto expr_end;
1630
- };
1631
-
1632
- w_space;
1633
-
1634
- w_comment
1635
- => { fgoto expr_end; };
1636
-
1637
- w_newline
1638
- => { fhold; fgoto expr_end; };
1639
-
1640
- c_any
1641
- => { fhold; fgoto expr_beg; };
1642
-
1643
- c_eof => do_eof;
1644
- *|;
1645
-
1646
- # The previous token was an identifier which was seen while in the
1647
- # command mode (that is, the state at the beginning of #advance was
1648
- # expr_value). This state is very similar to expr_arg, but disambiguates
1649
- # two very rare and specific conditions:
1650
- # * In 1.8 mode, "foo (lambda do end)".
1651
- # * In 1.9+ mode, "f x: -> do foo do end end".
1652
- expr_cmdarg := |*
1653
- w_space+ e_lparen
1654
- => {
1655
- emit(:tLPAREN_ARG, '('.freeze, @te - 1, @te)
1656
- if version?(18)
1657
- fnext expr_value; fbreak;
1658
- else
1659
- fnext expr_beg; fbreak;
1660
- end
1661
- };
1662
-
1663
- w_space* 'do'
1664
- => {
1665
- if @cond.active?
1666
- emit(:kDO_COND, 'do'.freeze, @te - 2, @te)
1667
- else
1668
- emit(:kDO, 'do'.freeze, @te - 2, @te)
1669
- end
1670
- fnext expr_value; fbreak;
1671
- };
1672
-
1673
- c_any |
1674
- # Disambiguate with the `do' rule above.
1675
- w_space* bareword |
1676
- w_space* label
1677
- => { p = @ts - 1
1678
- fgoto expr_arg; };
1679
-
1680
- c_eof => do_eof;
1681
- *|;
1682
-
1683
- # The rationale for this state is pretty complex. Normally, if an argument
1684
- # is passed to a command and then there is a block (tLCURLY...tRCURLY),
1685
- # the block is attached to the innermost argument (`f` in `m f {}`), or it
1686
- # is a parse error (`m 1 {}`). But there is a special case for passing a single
1687
- # primary expression grouped with parentheses: if you write `m (1) {}` or
1688
- # (2.0 only) `m () {}`, then the block is attached to `m`.
1689
- #
1690
- # Thus, we recognize the opening `(` of a command (remember, a command is
1691
- # a method call without parens) as a tLPAREN_ARG; then, in parser, we recognize
1692
- # `tLPAREN_ARG expr rparen` as a `primary_expr` and before rparen, set the
1693
- # lexer's state to `expr_endarg`, which makes it emit the possibly following
1694
- # `{` as `tLBRACE_ARG`.
1695
- #
1696
- # The default post-`expr_endarg` state is `expr_end`, so this state also handles
1697
- # `do` (as `kDO_BLOCK` in `expr_beg`).
1698
- expr_endarg := |*
1699
- e_lbrace
1700
- => {
1701
- if @lambda_stack.last == @paren_nest
1702
- @lambda_stack.pop
1703
- emit(:tLAMBEG, '{'.freeze)
1704
- else
1705
- emit(:tLBRACE_ARG, '{'.freeze)
1706
- end
1707
- @paren_nest += 1
1708
- @command_start = true
1709
- fnext expr_value; fbreak;
1710
- };
1711
-
1712
- 'do'
1713
- => { emit_do(true)
1714
- fnext expr_value; fbreak; };
1715
-
1716
- w_space_comment;
1717
-
1718
- c_any
1719
- => { fhold; fgoto expr_end; };
1720
-
1721
- c_eof => do_eof;
1722
- *|;
1723
-
1724
- # The rationale for this state is that several keywords accept a value
1725
- # (i.e. should transition to `expr_beg`), do not accept it like a command
1726
- # (i.e. not an `expr_arg`), and must behave like a statement, that is,
1727
- # accept a modifier if/while/etc.
1728
- #
1729
- expr_mid := |*
1730
- keyword_modifier
1731
- => { emit_table(KEYWORDS)
1732
- fnext expr_beg; fbreak; };
1733
-
1734
- bareword
1735
- => { p = @ts - 1; fgoto expr_beg; };
1736
-
1737
- w_space_comment;
1738
-
1739
- w_newline
1740
- => { fhold; fgoto expr_end; };
1741
-
1742
- c_any
1743
- => { fhold; fgoto expr_beg; };
1744
-
1745
- c_eof => do_eof;
1746
- *|;
1747
-
1748
- # Beginning of an expression.
1749
- #
1750
- # Don't fall through to this state from `c_any`; make sure to handle
1751
- # `c_space* c_nl` and let `expr_end` handle the newline.
1752
- # Otherwise code like `f\ndef x` gets glued together and the parser
1753
- # explodes.
1754
- #
1755
- expr_beg := |*
1756
- # +5, -5, - 5
1757
- [+\-] w_any* [0-9]
1758
- => {
1759
- emit(:tUNARY_NUM, tok(@ts, @ts + 1), @ts, @ts + 1)
1760
- fhold; fnext expr_end; fbreak;
1761
- };
1762
-
1763
- # splat *a
1764
- '*'
1765
- => { emit(:tSTAR, '*'.freeze)
1766
- fbreak; };
1767
-
1768
- #
1769
- # STRING AND REGEXP LITERALS
1770
- #
1771
-
1772
- # /regexp/oui
1773
- # /=/ (disambiguation with /=)
1774
- '/' c_any
1775
- => {
1776
- type = delimiter = tok[0].chr
1777
- fhold; fgoto *push_literal(type, delimiter, @ts);
1778
- };
1779
-
1780
- # %<string>
1781
- '%' ( c_ascii - [A-Za-z0-9] )
1782
- => {
1783
- type, delimiter = @source_buffer.slice(@ts).chr, tok[-1].chr
1784
- fgoto *push_literal(type, delimiter, @ts);
1785
- };
1786
-
1787
- # %w(we are the people)
1788
- '%' [A-Za-z] (c_ascii - [A-Za-z0-9])
1789
- => {
1790
- type, delimiter = tok[0..-2], tok[-1].chr
1791
- fgoto *push_literal(type, delimiter, @ts);
1792
- };
1793
-
1794
- '%' c_eof
1795
- => {
1796
- diagnostic :fatal, :string_eof, nil, range(@ts, @ts + 1)
1797
- };
1798
-
1799
- # Heredoc start.
1800
- # <<END | <<'END' | <<"END" | <<`END` |
1801
- # <<-END | <<-'END' | <<-"END" | <<-`END` |
1802
- # <<~END | <<~'END' | <<~"END" | <<~`END`
1803
- '<<' [~\-]?
1804
- ( '"' ( any - '"' )* '"'
1805
- | "'" ( any - "'" )* "'"
1806
- | "`" ( any - "`" )* "`"
1807
- | bareword ) % { heredoc_e = p }
1808
- c_line* c_nl % { new_herebody_s = p }
1809
- => {
1810
- tok(@ts, heredoc_e) =~ /^<<(-?)(~?)(["'`]?)(.*)\3$/m
1811
-
1812
- indent = !$1.empty? || !$2.empty?
1813
- dedent_body = !$2.empty?
1814
- type = $3.empty? ? '<<"'.freeze : ('<<'.freeze + $3)
1815
- delimiter = $4
1816
-
1817
- if @version >= 27
1818
- if delimiter.count("\n") > 0 || delimiter.count("\r") > 0
1819
- diagnostic :error, :unterminated_heredoc_id, nil, range(@ts, @ts + 1)
1820
- end
1821
- elsif @version >= 24
1822
- if delimiter.count("\n") > 0
1823
- if delimiter.end_with?("\n")
1824
- diagnostic :warning, :heredoc_id_ends_with_nl, nil, range(@ts, @ts + 1)
1825
- delimiter = delimiter.rstrip
1826
- else
1827
- diagnostic :fatal, :heredoc_id_has_newline, nil, range(@ts, @ts + 1)
1828
- end
1829
- end
1830
- end
1831
-
1832
- if dedent_body && version?(18, 19, 20, 21, 22)
1833
- emit(:tLSHFT, '<<'.freeze, @ts, @ts + 2)
1834
- p = @ts + 1
1835
- fnext expr_beg; fbreak;
1836
- else
1837
- fnext *push_literal(type, delimiter, @ts, heredoc_e, indent, dedent_body);
1838
-
1839
- @herebody_s ||= new_herebody_s
1840
- p = @herebody_s - 1
1841
- end
1842
- };
1843
-
1844
- # Quoted but unterminated heredoc start
1845
- # <<'END | <<"END | <<`END |
1846
- # <<-'END | <<-"END | <<-`END |
1847
- # <<~'END | <<~"END | <<~`END
1848
- #
1849
- # If the heredoc is terminated the rule above should handle it
1850
- '<<' [~\-]?
1851
- ('"' (any - c_nl - '"')*
1852
- |"'" (any - c_nl - "'")*
1853
- |"`" (any - c_nl - "`") # NOTE(review): unlike the two alternatives above there is no trailing *; confirm whether that is intended
1854
- )
1855
- => {
1856
- diagnostic :error, :unterminated_heredoc_id, nil, range(@ts, @ts + 1) # reported on the first character of the match
1857
- };
1858
-
1859
- #
1860
- # SYMBOL LITERALS
1861
- #
1862
-
1863
- # :&&, :||
1864
- ':' ('&&' | '||') => {
1865
- fhold; fhold;
1866
- emit(:tSYMBEG, tok(@ts, @ts + 1), @ts, @ts + 1)
1867
- fgoto expr_fname;
1868
- };
1869
-
1870
- # :"bar", :'baz'
1871
- ':' ['"] # '
1872
- => {
1873
- type, delimiter = tok, tok[-1].chr
1874
- fgoto *push_literal(type, delimiter, @ts);
1875
- };
1876
-
1877
- # :!@ is :!
1878
- # :~@ is :~
1879
- ':' [!~] '@'
1880
- => {
1881
- emit(:tSYMBOL, tok(@ts + 1, @ts + 2))
1882
- fnext expr_end; fbreak;
1883
- };
1884
-
1885
- ':' bareword ambiguous_symbol_suffix
1886
- => {
1887
- emit(:tSYMBOL, tok(@ts + 1, tm), @ts, tm)
1888
- p = tm - 1
1889
- fnext expr_end; fbreak;
1890
- };
1891
-
1892
- ':' ( bareword | global_var | class_var | instance_var |
1893
- operator_fname | operator_arithmetic | operator_rest )
1894
- => {
1895
- emit(:tSYMBOL, tok(@ts + 1), @ts)
1896
- fnext expr_end; fbreak;
1897
- };
1898
-
1899
- ':' ( '@' %{ tm = p - 1; diag_msg = :ivar_name }
1900
- | '@@' %{ tm = p - 2; diag_msg = :cvar_name }
1901
- ) [0-9]*
1902
- => {
1903
- if @version >= 27
1904
- diagnostic :error, diag_msg, { name: tok(tm, @te) }, range(tm, @te) # tm marks the first @, so name and range cover the sigil and any digits
1905
- else
1906
- emit(:tCOLON, tok(@ts, @ts + 1), @ts, @ts + 1) # before 2.7 only the colon is emitted here
1907
- p = @ts # same pattern as `p = tm - 1` before fbreak elsewhere: the next scan starts at @ts + 1, on the @ sign
1908
- end
1909
-
1910
- fnext expr_end; fbreak;
1911
- };
1912
-
1913
- #
1914
- # AMBIGUOUS TERNARY OPERATOR
1915
- #
1916
-
1917
- # Character constant, like ?a, ?\n, ?\u1000, and so on
1918
- # Don't accept \u escape with multiple codepoints, like \u{1 2 3}
1919
- '?' ( e_bs ( escape - ( '\u{' (xdigit+ [ \t]+)+ xdigit+ '}' ))
1920
- | (c_any - c_space_nl - e_bs) % { @escape = nil }
1921
- )
1922
- => {
1923
- value = @escape || tok(@ts + 1)
1924
-
1925
- if version?(18)
1926
- emit(:tINTEGER, value.getbyte(0))
1927
- else
1928
- emit(:tCHARACTER, value)
1929
- end
1930
-
1931
- fnext expr_end; fbreak;
1932
- };
1933
-
1934
- '?' c_space_nl
1935
- => {
1936
- escape = { " " => '\s', "\r" => '\r', "\n" => '\n', "\t" => '\t',
1937
- "\v" => '\v', "\f" => '\f' }[@source_buffer.slice(@ts + 1)]
1938
- diagnostic :warning, :invalid_escape_use, { :escape => escape }, range
1939
-
1940
- p = @ts - 1
1941
- fgoto expr_end;
1942
- };
1943
-
1944
- '?' c_eof
1945
- => {
1946
- diagnostic :fatal, :incomplete_escape, nil, range(@ts, @ts + 1)
1947
- };
1948
-
1949
- # f ?aa : b: Disambiguate with a character literal.
1950
- '?' [A-Za-z_] bareword
1951
- => {
1952
- p = @ts - 1
1953
- fgoto expr_end;
1954
- };
1955
-
1956
- #
1957
- # AMBIGUOUS EMPTY BLOCK ARGUMENTS
1958
- #
1959
-
1960
- # Ruby >= 2.7 emits it as two tPIPE terminals
1961
- # while Ruby < 2.7 as a single tOROP (like in `a || b`)
1962
- '||'
1963
- => {
1964
- if @version >= 27
1965
- emit(:tPIPE, tok(@ts, @ts + 1), @ts, @ts + 1) # token for the first pipe only
1966
- fhold; # give back the second pipe so it is scanned again
1967
- fnext expr_beg; fbreak;
1968
- else
1969
- p -= 2 # rewind over both characters and let expr_end lex them
1970
- fgoto expr_end;
1971
- end
1972
- };
1973
-
1974
- #
1975
- # KEYWORDS AND PUNCTUATION
1976
- #
1977
-
1978
- # a({b=>c})
1979
- e_lbrace
1980
- => {
1981
- if @lambda_stack.last == @paren_nest
1982
- @lambda_stack.pop
1983
- @command_start = true
1984
- emit(:tLAMBEG, '{'.freeze)
1985
- else
1986
- emit(:tLBRACE, '{'.freeze)
1987
- end
1988
- @paren_nest += 1
1989
- fbreak;
1990
- };
1991
-
1992
- # a([1, 2])
1993
- e_lbrack
1994
- => { emit(:tLBRACK, '['.freeze)
1995
- fbreak; };
1996
-
1997
- # a()
1998
- e_lparen
1999
- => { emit(:tLPAREN, '('.freeze)
2000
- fbreak; };
2001
-
2002
- # a(+b)
2003
- punctuation_begin
2004
- => { emit_table(PUNCTUATION_BEGIN)
2005
- fbreak; };
2006
-
2007
- # rescue Exception => e: Block rescue.
2008
- # Special because it should transition to expr_mid.
2009
- 'rescue' %{ tm = p } '=>'?
2010
- => { emit(:kRESCUE, 'rescue'.freeze, @ts, tm)
2011
- p = tm - 1
2012
- fnext expr_mid; fbreak; };
2013
-
2014
- # if a: Statement if.
2015
- keyword_modifier
2016
- => { emit_table(KEYWORDS_BEGIN)
2017
- @command_start = true
2018
- fnext expr_value; fbreak; };
2019
-
2020
- #
2021
- # RUBY 1.9 HASH LABELS
2022
- #
2023
-
2024
- label ( any - ':' )
2025
- => {
2026
- fhold;
2027
-
2028
- if version?(18)
2029
- ident = tok(@ts, @te - 2)
2030
-
2031
- emit((@source_buffer.slice(@ts) =~ /[A-Z]/) ? :tCONSTANT : :tIDENTIFIER,
2032
- ident, @ts, @te - 2)
2033
- fhold; # continue as a symbol
2034
-
2035
- if !@static_env.nil? && @static_env.declared?(ident)
2036
- fnext expr_end;
2037
- else
2038
- fnext *arg_or_cmdarg(cmd_state);
2039
- end
2040
- else
2041
- emit(:tLABEL, tok(@ts, @te - 2), @ts, @te - 1)
2042
- fnext expr_labelarg;
2043
- end
2044
-
2045
- fbreak;
2046
- };
2047
-
2048
- #
2049
- # RUBY 2.7 BEGINLESS RANGE
2050
-
2051
- '..'
2052
- => {
2053
- if @version >= 27
2054
- emit(:tBDOT2)
2055
- else
2056
- emit(:tDOT2)
2057
- end
2058
-
2059
- fnext expr_beg; fbreak;
2060
- };
2061
-
2062
- '...' c_nl?
2063
- => {
2064
- # Here we scan and conditionally emit "\n":
2065
- # + if "\n" is there:
2066
- #   - and it was emitted, we do nothing
2067
- #   - and it was not emitted, we return `p` to "\n" to process it on the next scan
2068
- # + if "\n" is not there, we do nothing
2069
- followed_by_nl = @te - 1 == @newline_s
2070
- nl_emitted = false
2071
- dots_te = followed_by_nl ? @te - 1 : @te # the dots token never includes the newline
2072
-
2073
- if @version >= 30
2074
- if @lambda_stack.any? && @lambda_stack.last + 1 == @paren_nest
2075
- # To reject `->(...)` like `->...`
2076
- emit(:tDOT3, '...'.freeze, @ts, dots_te)
2077
- else
2078
- emit(:tBDOT3, '...'.freeze, @ts, dots_te)
2079
-
2080
- if @version >= 31 && followed_by_nl && @context.in_argdef
2081
- emit(:tNL, @te - 1, @te) # NOTE(review): other positioned emit calls pass (type, value, start, end); here @te - 1 sits in the value slot. Confirm the intended value and range.
2082
- nl_emitted = true
2083
- end
2084
- end
2085
- elsif @version >= 27
2086
- emit(:tBDOT3, '...'.freeze, @ts, dots_te)
2087
- else
2088
- emit(:tDOT3, '...'.freeze, @ts, dots_te)
2089
- end
2090
-
2091
- if followed_by_nl && !nl_emitted
2092
- # return "\n" to process it on the next scan
2093
- fhold;
2094
- end
2095
-
2096
- fnext expr_beg; fbreak;
2097
- };
2098
-
2099
- #
2100
- # CONTEXT-DEPENDENT VARIABLE LOOKUP OR COMMAND INVOCATION
2101
- #
2102
-
2103
- # foo= bar: Disambiguate with bareword rule below.
2104
- bareword ambiguous_ident_suffix |
2105
- # def foo: Disambiguate with bareword rule below.
2106
- keyword
2107
- => { p = @ts - 1
2108
- fgoto expr_end; };
2109
-
2110
- # a = 42; a [42]: Indexing.
2111
- # def a; end; a [42]: Array argument.
2112
- call_or_var
2113
- => local_ident;
2114
-
2115
- (call_or_var - keyword)
2116
- % { ident_tok = tok; ident_ts = @ts; ident_te = @te; }
2117
- w_space+ '('
2118
- => {
2119
- emit(:tIDENTIFIER, ident_tok, ident_ts, ident_te)
2120
- p = ident_te - 1
2121
-
2122
- if !@static_env.nil? && @static_env.declared?(ident_tok) && @version < 25
2123
- fnext expr_endfn;
2124
- else
2125
- fnext expr_cmdarg;
2126
- end
2127
- fbreak;
2128
- };
2129
-
2130
- #
2131
- # WHITESPACE
2132
- #
2133
-
2134
- w_any;
2135
-
2136
- e_heredoc_nl '=begin' ( c_space | c_nl_zlen )
2137
- => {
2138
- p = @ts - 1
2139
- @cs_before_block_comment = @cs
2140
- fgoto line_begin;
2141
- };
2142
-
2143
- #
2144
- # DEFAULT TRANSITION
2145
- #
2146
-
2147
- # The following rules match most binary and all unary operators.
2148
- # Rules for binary operators provide better error reporting.
2149
- operator_arithmetic '=' |
2150
- operator_rest |
2151
- punctuation_end |
2152
- c_any
2153
- => { p = @ts - 1; fgoto expr_end; };
2154
-
2155
- c_eof => do_eof;
2156
- *|;
2157
-
2158
- # Special newline handling for "def a b:"
2159
- # A newline continues in expr_end when @context.in_kwarg is set and in line_begin otherwise; any other character is re-scanned in expr_beg.
2160
- expr_labelarg := |*
2161
- w_space_comment; # matched and skipped: no scanner action is attached to this pattern
2162
-
2163
- w_newline
2164
- => {
2165
- if @context.in_kwarg
2166
- fhold; fgoto expr_end; # fhold steps p back by one before control moves to expr_end
2167
- else
2168
- fgoto line_begin;
2169
- end
2170
- };
2171
-
2172
- c_any
2173
- => { fhold; fgoto expr_beg; }; # the character is not consumed here: expr_beg scans it again
2174
-
2175
- c_eof => do_eof;
2176
- *|;
2177
-
2178
- # Like expr_beg, but no 1.9 label or 2.2 quoted label possible.
2179
- # Input not claimed by an earlier rule of this scanner falls through to the c_any rule, which re-scans it in expr_beg.
2180
- expr_value := |*
2181
- # a:b: a(:b), a::B, A::B
2182
- label (any - ':')
2183
- => { p = @ts - 1 # rewind to the start of the match so that expr_end lexes the whole text again
2184
- fgoto expr_end; };
2185
-
2186
- # "bar", 'baz'
2187
- ['"] # '
2188
- => {
2189
- fgoto *push_literal(tok, tok, @ts); # the quote character is passed as both the first and the second argument; fgoto * jumps to the state the call returns
2190
- };
2191
-
2192
- w_space_comment;
2193
-
2194
- w_newline
2195
- => { fgoto line_begin; };
2196
-
2197
- c_any
2198
- => { fhold; fgoto expr_beg; }; # the character is not consumed here: expr_beg scans it again
2199
-
2200
- c_eof => do_eof;
2201
- *|;
2202
-
2203
- expr_end := |* # scanner with rules for lambdas, keywords, literals, constants/variables, method calls, operators and whitespace
2204
- #
2205
- # STABBY LAMBDA
2206
- #
2207
-
2208
- '->'
2209
- => {
2210
- emit(:tLAMBDA, '->'.freeze, @ts, @ts + 2)
2211
-
2212
- @lambda_stack.push @paren_nest # record the paren depth at which this lambda was seen
2213
- fnext expr_endfn; fbreak;
2214
- };
2215
-
2216
- e_lbrace | 'do'
2217
- => {
2218
- if @lambda_stack.last == @paren_nest # same depth as the innermost pending lambda: this opens its body
2219
- @lambda_stack.pop
2220
-
2221
- if tok == '{'.freeze
2222
- emit(:tLAMBEG, '{'.freeze)
2223
- else # 'do'
2224
- emit(:kDO_LAMBDA, 'do'.freeze)
2225
- end
2226
- else
2227
- if tok == '{'.freeze
2228
- emit(:tLCURLY, '{'.freeze)
2229
- else # 'do'
2230
- emit_do
2231
- end
2232
- end
2233
- if tok == '{'.freeze
2234
- @paren_nest += 1 # only the brace form changes the nesting depth
2235
- end
2236
- @command_start = true
2237
-
2238
- fnext expr_value; fbreak;
2239
- };
2240
-
2241
- #
2242
- # KEYWORDS
2243
- #
2244
-
2245
- keyword_with_fname
2246
- => { emit_table(KEYWORDS)
2247
- fnext expr_fname; fbreak; };
2248
-
2249
- 'class' w_any* '<<'
2250
- => { emit(:kCLASS, 'class'.freeze, @ts, @ts + 5) # first token: the five characters of the keyword
2251
- emit(:tLSHFT, '<<'.freeze, @te - 2, @te) # second token from the same match: its last two characters
2252
- fnext expr_value; fbreak; };
2253
-
2254
- # a if b:c: Syntax error.
2255
- keyword_modifier
2256
- => { emit_table(KEYWORDS)
2257
- fnext expr_beg; fbreak; };
2258
-
2259
- # elsif b:c: elsif b(:c)
2260
- keyword_with_value
2261
- => { emit_table(KEYWORDS)
2262
- @command_start = true
2263
- fnext expr_value; fbreak; };
2264
-
2265
- keyword_with_mid
2266
- => { emit_table(KEYWORDS)
2267
- fnext expr_mid; fbreak; };
2268
-
2269
- keyword_with_arg
2270
- => {
2271
- emit_table(KEYWORDS)
2272
-
2273
- if version?(18) && tok == 'not'.freeze # on 1.8 the not keyword continues in expr_beg rather than expr_arg
2274
- fnext expr_beg; fbreak;
2275
- else
2276
- fnext expr_arg; fbreak;
2277
- end
2278
- };
2279
-
2280
- '__ENCODING__'
2281
- => {
2282
- if version?(18)
2283
- emit(:tIDENTIFIER) # on 1.8 the text is lexed as an ordinary identifier
2284
-
2285
- unless !@static_env.nil? && @static_env.declared?(tok) # the next state is changed only when the name is not a declared local
2286
- fnext *arg_or_cmdarg(cmd_state);
2287
- end
2288
- else
2289
- emit(:k__ENCODING__, '__ENCODING__'.freeze)
2290
- end
2291
- fbreak;
2292
- };
2293
-
2294
- keyword_with_end
2295
- => { emit_table(KEYWORDS)
2296
- fbreak; };
2297
-
2298
- #
2299
- # NUMERIC LITERALS
2300
- #
2301
-
2302
- ( '0' [Xx] %{ @num_base = 16; @num_digits_s = p } int_hex # prefixed forms: the digits start at p, after the two-character prefix
2303
- | '0' [Dd] %{ @num_base = 10; @num_digits_s = p } int_dec
2304
- | '0' [Oo] %{ @num_base = 8; @num_digits_s = p } int_dec
2305
- | '0' [Bb] %{ @num_base = 2; @num_digits_s = p } int_bin
2306
- | [1-9] digit* '_'? %{ @num_base = 10; @num_digits_s = @ts } int_dec # unprefixed forms: the digits start at the token start
2307
- | '0' digit* '_'? %{ @num_base = 8; @num_digits_s = @ts } int_dec
2308
- ) %{ @num_suffix_s = p } int_suffix # @num_suffix_s marks where the digits end and the suffix begins
2309
- => {
2310
- digits = tok(@num_digits_s, @num_suffix_s)
2311
-
2312
- if digits.end_with? '_'.freeze
2313
- diagnostic :error, :trailing_in_number, { :character => '_'.freeze },
2314
- range(@te - 1, @te)
2315
- elsif digits.empty? && @num_base == 8 && version?(18)
2316
- # 1.8 did not raise an error on 0o.
2317
- digits = '0'.freeze
2318
- elsif digits.empty?
2319
- diagnostic :error, :empty_numeric
2320
- elsif @num_base == 8 && (invalid_idx = digits.index(/[89]/))
2321
- invalid_s = @num_digits_s + invalid_idx # absolute buffer offset of the first 8 or 9
2322
- diagnostic :error, :invalid_octal, nil,
2323
- range(invalid_s, invalid_s + 1)
2324
- end
2325
-
2326
- if version?(18, 19, 20)
2327
- emit(:tINTEGER, digits.to_i(@num_base), @ts, @num_suffix_s) # the token ends where the suffix starts
2328
- p = @num_suffix_s - 1 # scanning resumes at the suffix position
2329
- else
2330
- @num_xfrm.call(digits.to_i(@num_base)) # NOTE(review): @num_xfrm is assigned outside this chunk, presumably by int_suffix - confirm
2331
- end
2332
- fbreak;
2333
- };
2334
-
2335
- flo_frac flo_pow?
2336
- => {
2337
- diagnostic :error, :no_dot_digit_literal
2338
- };
2339
-
2340
- flo_int [eE]
2341
- => {
2342
- if version?(18, 19, 20)
2343
- diagnostic :error,
2344
- :trailing_in_number, { :character => tok(@te - 1, @te) },
2345
- range(@te - 1, @te)
2346
- else
2347
- emit(:tINTEGER, tok(@ts, @te - 1).to_i, @ts, @te - 1) # value and range exclude the trailing e or E
2348
- fhold; fbreak; # the e or E is left for the next scan
2349
- end
2350
- };
2351
-
2352
- flo_int flo_frac [eE]
2353
- => {
2354
- if version?(18, 19, 20)
2355
- diagnostic :error,
2356
- :trailing_in_number, { :character => tok(@te - 1, @te) },
2357
- range(@te - 1, @te)
2358
- else
2359
- emit(:tFLOAT, tok(@ts, @te - 1).to_f, @ts, @te - 1) # value and range exclude the trailing e or E
2360
- fhold; fbreak; # the e or E is left for the next scan
2361
- end
2362
- };
2363
-
2364
- flo_int
2365
- ( flo_frac? flo_pow %{ @num_suffix_s = p } flo_pow_suffix
2366
- | flo_frac %{ @num_suffix_s = p } flo_suffix
2367
- )
2368
- => {
2369
- digits = tok(@ts, @num_suffix_s)
2370
-
2371
- if version?(18, 19, 20)
2372
- emit(:tFLOAT, Float(digits), @ts, @num_suffix_s) # the token ends where the suffix starts
2373
- p = @num_suffix_s - 1 # scanning resumes at the suffix position
2374
- else
2375
- @num_xfrm.call(digits) # NOTE(review): @num_xfrm is assigned outside this chunk, presumably by the suffix machines - confirm
2376
- end
2377
- fbreak;
2378
- };
2379
-
2380
- #
2381
- # STRING AND XSTRING LITERALS
2382
- #
2383
-
2384
- # `echo foo`, "bar", 'baz'
2385
- '`' | ['"] # '
2386
- => {
2387
- type, delimiter = tok, tok[-1].chr
2388
- fgoto *push_literal(type, delimiter, @ts, nil, false, false, true); # NOTE(review): the meaning of the trailing positional arguments is defined by push_literal, outside this chunk
2389
- };
2390
-
2391
- #
2392
- # CONSTANTS AND VARIABLES
2393
- #
2394
-
2395
- constant
2396
- => { emit(:tCONSTANT)
2397
- fnext *arg_or_cmdarg(cmd_state); fbreak; };
2398
-
2399
- constant ambiguous_const_suffix
2400
- => { emit(:tCONSTANT, tok(@ts, tm), @ts, tm) # NOTE(review): tm is set outside this rule, presumably by ambiguous_const_suffix - confirm
2401
- p = tm - 1; fbreak; };
2402
-
2403
- global_var | class_var_v | instance_var_v
2404
- => { p = @ts - 1; fcall expr_variable; }; # rewind to the token start and let expr_variable lex the variable
2405
-
2406
- #
2407
- # METHOD CALLS
2408
- #
2409
-
2410
- '.:' w_space+
2411
- => { emit(:tDOT, '.', @ts, @ts + 1)
2412
- emit(:tCOLON, ':', @ts + 1, @ts + 2)
2413
- p = p - tok.length + 2 # move p back by the length of the matched whitespace (tok.length minus the two operator characters)
2414
- fnext expr_dot; fbreak; };
2415
-
2416
- '.:'
2417
- => {
2418
- if @version >= 27
2419
- emit_table(PUNCTUATION)
2420
- else
2421
- emit(:tDOT, tok(@ts, @ts + 1), @ts, @ts + 1) # before 2.7 only the dot becomes a token
2422
- fhold; # the colon is left for the next scan
2423
- end
2424
-
2425
- fnext expr_dot; fbreak;
2426
- };
2427
-
2428
- '.' | '&.' | '::'
2429
- => { emit_table(PUNCTUATION)
2430
- fnext expr_dot; fbreak; };
2431
-
2432
- call_or_var
2433
- => local_ident;
2434
-
2435
- bareword ambiguous_fid_suffix
2436
- => {
2437
- if tm == @te
2438
- # Suffix was consumed, e.g. foo!
2439
- emit(:tFID)
2440
- else
2441
- # Suffix was not consumed, e.g. foo!=
2442
- emit(:tIDENTIFIER, tok(@ts, tm), @ts, tm)
2443
- p = tm - 1
2444
- end
2445
- fnext expr_arg; fbreak;
2446
- };
2447
-
2448
- #
2449
- # OPERATORS
2450
- #
2451
-
2452
- '*' | '=>'
2453
- => {
2454
- emit_table(PUNCTUATION)
2455
- fnext expr_value; fbreak;
2456
- };
2457
-
2458
- # When '|', '~', '!', '=>' are used as operators
2459
- # they do not accept any symbols (or quoted labels) after.
2460
- # Other binary operators accept it.
2461
- ( operator_arithmetic | operator_rest ) - ( '|' | '~' | '!' | '*' )
2462
- => {
2463
- emit_table(PUNCTUATION);
2464
- fnext expr_value; fbreak;
2465
- };
2466
-
2467
- ( e_lparen | '|' | '~' | '!' )
2468
- => { emit_table(PUNCTUATION)
2469
- fnext expr_beg; fbreak; };
2470
-
2471
- e_rbrace | e_rparen | e_rbrack
2472
- => {
2473
- emit_table(PUNCTUATION)
2474
-
2475
- if @version < 24
2476
- @cond.lexpop
2477
- @cmdarg.lexpop
2478
- else
2479
- @cond.pop
2480
- @cmdarg.pop
2481
- end
2482
-
2483
- if tok == '}'.freeze || tok == ']'.freeze
2484
- if @version >= 25
2485
- fnext expr_end;
2486
- else
2487
- fnext expr_endarg;
2488
- end
2489
- else # )
2490
- # fnext expr_endfn; ?
2491
- end
2492
-
2493
- fbreak;
2494
- };
2495
-
2496
- operator_arithmetic '='
2497
- => { emit(:tOP_ASGN, tok(@ts, @te - 1)) # the token value is the operator text without the trailing equals sign
2498
- fnext expr_beg; fbreak; };
2499
-
2500
- '?'
2501
- => { emit(:tEH, '?'.freeze)
2502
- fnext expr_value; fbreak; };
2503
-
2504
- e_lbrack
2505
- => { emit(:tLBRACK2, '['.freeze)
2506
- fnext expr_beg; fbreak; };
2507
-
2508
- '...' c_nl
2509
- => {
2510
- if @paren_nest == 0
2511
- diagnostic :warning, :triple_dot_at_eol, nil, range(@ts, @te - 1)
2512
- end
2513
-
2514
- emit(:tDOT3, '...'.freeze, @ts, @te - 1) # the token range excludes the newline
2515
- fhold; # the newline is left for the next scan
2516
- fnext expr_beg; fbreak;
2517
- };
2518
-
2519
- punctuation_end
2520
- => { emit_table(PUNCTUATION)
2521
- fnext expr_beg; fbreak; };
2522
-
2523
- #
2524
- # WHITESPACE
2525
- #
2526
-
2527
- w_space_comment;
2528
-
2529
- w_newline
2530
- => { fgoto leading_dot; }; # nothing is emitted here: leading_dot decides whether the next line continues this expression
2531
-
2532
- ';'
2533
- => { emit(:tSEMI, ';'.freeze)
2534
- @command_start = true
2535
- fnext expr_value; fbreak; };
2536
-
2537
- '\\' c_line {
2538
- diagnostic :error, :bare_backslash, nil, range(@ts, @ts + 1) # the reported range covers only the backslash
2539
- fhold;
2540
- };
2541
-
2542
- c_any
2543
- => {
2544
- diagnostic :fatal, :unexpected, { :character => tok.inspect[1..-2] } # inspect wraps the text in quotes and [1..-2] drops them again
2545
- };
2546
-
2547
- c_eof => do_eof;
2548
- *|;
2549
-
2550
- leading_dot := |* # entered from the newline rule of expr_end, which jumps here without emitting a token
2551
- # Insane leading dots:
2552
- # a #comment
2553
- # # post-2.7 comment
2554
- # .b: a.b
2555
-
2556
- # Here we use '\n' instead of w_newline to not modify @newline_s
2557
- # and eventually properly emit tNL
2558
- (c_space* w_space_comment '\n')+
2559
- => {
2560
- if @version < 27
2561
- # Ruby before 2.7 doesn't support comments before leading dot.
2562
- # If a line after "a" starts with a comment then "a" is a self-contained statement.
2563
- # So in that case we emit a special tNL token and start reading the
2564
- # next line as a separate statement.
2565
- #
2566
- # Note: block comments before leading dot are not supported on any version of Ruby.
2567
- emit(:tNL, nil, @newline_s, @newline_s + 1)
2568
- fhold; fnext line_begin; fbreak;
2569
- end # on 2.7 and later nothing is emitted and scanning continues in this scanner
2570
- };
2571
-
2572
- c_space* '..'
2573
- => {
2574
- emit(:tNL, nil, @newline_s, @newline_s + 1) # the newline token is emitted on every version for this rule
2575
- if @version < 27
2576
- fhold; fnext line_begin; fbreak;
2577
- else
2578
- emit(:tBDOT2)
2579
- fnext expr_beg; fbreak;
2580
- end
2581
- };
2582
-
2583
- c_space* '...'
2584
- => {
2585
- emit(:tNL, nil, @newline_s, @newline_s + 1) # the newline token is emitted on every version for this rule
2586
- if @version < 27
2587
- fhold; fnext line_begin; fbreak;
2588
- else
2589
- emit(:tBDOT3)
2590
- fnext expr_beg; fbreak;
2591
- end
2592
- };
2593
-
2594
- c_space* %{ tm = p } ('.' | '&.') # tm marks the position right after the leading spaces
2595
- => { p = tm - 1; fgoto expr_end; }; # resume in expr_end at the dot; no tNL is emitted, so the call chain continues
2596
-
2597
- any
2598
- => { emit(:tNL, nil, @newline_s, @newline_s + 1) # not a leading dot: emit the newline token for the position stored in @newline_s
2599
- fhold; fnext line_begin; fbreak; }; # the character is not consumed; the next scan starts in line_begin
2600
- *|;
2601
-
2602
- #
2603
- # === EMBEDDED DOCUMENT (aka BLOCK COMMENT) PARSING ===
2604
- #
2605
-
2606
- line_comment := |* # scans the body of a block comment; @eq_begin_s holds the offset of its opening =begin
2607
- '=end' c_line* c_nl_zlen
2608
- => {
2609
- emit_comment(@eq_begin_s, @te) # the comment spans from the opening =begin to the end of this match
2610
- fgoto *@cs_before_block_comment; # jump to the state stored in @cs_before_block_comment
2611
- };
2612
-
2613
- c_line* c_nl; # any other complete line is matched with no action attached
2614
-
2615
- c_line* zlen
2616
- => {
2617
- diagnostic :fatal, :embedded_document, nil, # NOTE(review): presumably reached when the input ends before =end - confirm against the zlen definition
2618
- range(@eq_begin_s, @eq_begin_s + '=begin'.length)
2619
- };
2620
- *|;
2621
-
2622
- line_begin := |* # scanner for the start of a line: handles =begin and __END__, everything else goes to expr_value
2623
- w_any;
2624
-
2625
- '=begin' ( c_space | c_nl_zlen )
2626
- => { @eq_begin_s = @ts # remember where the block comment starts; line_comment uses it for the comment range
2627
- fgoto line_comment; };
2628
-
2629
- '__END__' ( c_eol - zlen )
2630
- => { p = pe - 3 }; # moves p close to the end of the buffer (pe) - NOTE(review): the offset 3 presumably depends on how the buffer is terminated, confirm
2631
-
2632
- c_any
2633
- => { cmd_state = true; fhold; fgoto expr_value; }; # the character is not consumed; expr_value scans it with cmd_state set
2634
-
2635
- c_eof => do_eof;
2636
- *|;
2637
-
2638
- }%%
2639
- # %
2640
- end
2641
- end